reliability.rs

  1use anyhow::{Context, Result};
  2use backtrace::{self, Backtrace};
  3use chrono::Utc;
  4use db::kvp::KEY_VALUE_STORE;
  5use gpui::{App, AppContext, SemanticVersion};
  6use isahc::config::Configurable;
  7
  8use paths::{CRASHES_DIR, CRASHES_RETIRED_DIR};
  9use release_channel::ReleaseChannel;
 10use release_channel::RELEASE_CHANNEL;
 11use serde::{Deserialize, Serialize};
 12use settings::Settings;
 13use smol::stream::StreamExt;
 14use std::{
 15    env,
 16    ffi::OsStr,
 17    sync::{atomic::Ordering, Arc},
 18};
 19use std::{io::Write, panic, sync::atomic::AtomicU32, thread};
 20use util::{
 21    http::{self, HttpClient, HttpClientWithUrl},
 22    paths, ResultExt,
 23};
 24
 25use crate::stdout_is_a_pty;
 26
 27#[derive(Serialize, Deserialize)]
 28struct LocationData {
 29    file: String,
 30    line: u32,
 31}
 32
 33#[derive(Serialize, Deserialize)]
 34struct Panic {
 35    thread: String,
 36    payload: String,
 37    #[serde(skip_serializing_if = "Option::is_none")]
 38    location_data: Option<LocationData>,
 39    backtrace: Vec<String>,
 40    app_version: String,
 41    release_channel: String,
 42    os_name: String,
 43    os_version: Option<String>,
 44    architecture: String,
 45    panicked_on: i64,
 46    #[serde(skip_serializing_if = "Option::is_none")]
 47    installation_id: Option<String>,
 48    session_id: String,
 49}
 50
 51#[derive(Serialize)]
 52struct PanicRequest {
 53    panic: Panic,
 54}
 55
 56static PANIC_COUNT: AtomicU32 = AtomicU32::new(0);
 57
 58pub fn init_panic_hook(app: &App, installation_id: Option<String>, session_id: String) {
 59    let is_pty = stdout_is_a_pty();
 60    let app_metadata = app.metadata();
 61
 62    panic::set_hook(Box::new(move |info| {
 63        let prior_panic_count = PANIC_COUNT.fetch_add(1, Ordering::SeqCst);
 64        if prior_panic_count > 0 {
 65            // Give the panic-ing thread time to write the panic file
 66            loop {
 67                std::thread::yield_now();
 68            }
 69        }
 70
 71        let thread = thread::current();
 72        let thread_name = thread.name().unwrap_or("<unnamed>");
 73
 74        let payload = info
 75            .payload()
 76            .downcast_ref::<&str>()
 77            .map(|s| s.to_string())
 78            .or_else(|| info.payload().downcast_ref::<String>().map(|s| s.clone()))
 79            .unwrap_or_else(|| "Box<Any>".to_string());
 80
 81        if *release_channel::RELEASE_CHANNEL == ReleaseChannel::Dev {
 82            let location = info.location().unwrap();
 83            let backtrace = Backtrace::new();
 84            eprintln!(
 85                "Thread {:?} panicked with {:?} at {}:{}:{}\n{:?}",
 86                thread_name,
 87                payload,
 88                location.file(),
 89                location.line(),
 90                location.column(),
 91                backtrace,
 92            );
 93            std::process::exit(-1);
 94        }
 95
 96        let app_version = if let Some(version) = app_metadata.app_version {
 97            version.to_string()
 98        } else {
 99            option_env!("CARGO_PKG_VERSION")
100                .unwrap_or("dev")
101                .to_string()
102        };
103
104        let backtrace = Backtrace::new();
105        let mut backtrace = backtrace
106            .frames()
107            .iter()
108            .flat_map(|frame| {
109                frame
110                    .symbols()
111                    .iter()
112                    .filter_map(|frame| Some(format!("{:#}", frame.name()?)))
113            })
114            .collect::<Vec<_>>();
115
116        // Strip out leading stack frames for rust panic-handling.
117        if let Some(ix) = backtrace
118            .iter()
119            .position(|name| name == "rust_begin_unwind")
120        {
121            backtrace.drain(0..=ix);
122        }
123
124        let panic_data = Panic {
125            thread: thread_name.into(),
126            payload,
127            location_data: info.location().map(|location| LocationData {
128                file: location.file().into(),
129                line: location.line(),
130            }),
131            app_version: app_version.to_string(),
132            release_channel: RELEASE_CHANNEL.display_name().into(),
133            os_name: app_metadata.os_name.into(),
134            os_version: app_metadata
135                .os_version
136                .as_ref()
137                .map(SemanticVersion::to_string),
138            architecture: env::consts::ARCH.into(),
139            panicked_on: Utc::now().timestamp_millis(),
140            backtrace,
141            installation_id: installation_id.clone(),
142            session_id: session_id.clone(),
143        };
144
145        if let Some(panic_data_json) = serde_json::to_string_pretty(&panic_data).log_err() {
146            log::error!("{}", panic_data_json);
147        }
148
149        if !is_pty {
150            if let Some(panic_data_json) = serde_json::to_string(&panic_data).log_err() {
151                let timestamp = chrono::Utc::now().format("%Y_%m_%d %H_%M_%S").to_string();
152                let panic_file_path = paths::LOGS_DIR.join(format!("zed-{}.panic", timestamp));
153                let panic_file = std::fs::OpenOptions::new()
154                    .append(true)
155                    .create(true)
156                    .open(&panic_file_path)
157                    .log_err();
158                if let Some(mut panic_file) = panic_file {
159                    writeln!(&mut panic_file, "{}", panic_data_json).log_err();
160                    panic_file.flush().log_err();
161                }
162            }
163        }
164
165        std::process::abort();
166    }));
167}
168
169pub fn init(
170    http_client: Arc<HttpClientWithUrl>,
171    installation_id: Option<String>,
172    cx: &mut AppContext,
173) {
174    #[cfg(target_os = "macos")]
175    monitor_main_thread_hangs(http_client.clone(), installation_id.clone(), cx);
176
177    upload_panics_and_crashes(http_client, installation_id, cx)
178}
179
/// Watches the calling thread for hangs and reports a backtrace of the first
/// one detected (macOS only).
///
/// How it works, as implemented below:
/// 1. A `SIGUSR2` handler is installed that records the interrupted thread's
///    stack frames into a pre-allocated buffer and notifies a channel.
/// 2. A foreground task drains a small bounded channel while a background
///    task pushes a tick into it once per second. If the channel is full the
///    foreground is assumed to be stuck, and `SIGUSR2` is sent to the thread
///    that called this function (expected to be the main thread).
/// 3. A second background task waits for the handler's notification,
///    symbolicates the frames, logs them and, if diagnostics telemetry is
///    enabled, POSTs a `HangReport` to `/telemetry/hangs`.
///
/// Only the first hang is detected: the ticking task stops after signalling.
#[cfg(target_os = "macos")]
pub fn monitor_main_thread_hangs(
    http_client: Arc<HttpClientWithUrl>,
    installation_id: Option<String>,
    cx: &AppContext,
) {
    use nix::sys::signal::{
        sigaction, SaFlags, SigAction, SigHandler, SigSet,
        Signal::{self, SIGUSR2},
    };

    use parking_lot::Mutex;

    use std::{
        ffi::c_int,
        sync::{mpsc, OnceLock},
        time::Duration,
    };
    use telemetry_events::{BacktraceFrame, HangReport};
    use util::http::Method;

    use nix::sys::pthread;

    let foreground_executor = cx.foreground_executor();
    let background_executor = cx.background_executor();
    let telemetry_settings = *client::TelemetrySettings::get_global(cx);
    let metadata = cx.app_metadata();

    // Initialize SIGUSR2 handler to send a backtrace to a channel.
    let (backtrace_tx, backtrace_rx) = mpsc::channel();
    static BACKTRACE: Mutex<Vec<backtrace::Frame>> = Mutex::new(Vec::new());
    static BACKTRACE_SENDER: OnceLock<mpsc::Sender<()>> = OnceLock::new();
    BACKTRACE_SENDER.get_or_init(|| backtrace_tx);
    // Reserve space up front so the signal handler never has to grow the buffer.
    BACKTRACE.lock().reserve(100);

    /// Installs (or re-installs) the one-shot `SIGUSR2` handler.
    fn handle_backtrace_signal() {
        unsafe {
            extern "C" fn handle_sigusr2(_i: c_int) {
                unsafe {
                    // ASYNC SIGNAL SAFETY: This lock is only accessed one other time,
                    // which can only be triggered by this signal handler. In addition,
                    // this signal handler is immediately removed by SA_RESETHAND, and this
                    // signal handler cannot be re-entrant due to the SIGUSR2 mask defined
                    // below
                    let mut bt = BACKTRACE.lock();
                    bt.clear();
                    backtrace::trace_unsynchronized(|frame| {
                        // Stop once the reserved capacity is used up rather than reallocate.
                        if bt.len() < bt.capacity() {
                            bt.push(frame.clone());
                            true
                        } else {
                            false
                        }
                    });
                }

                BACKTRACE_SENDER.get().unwrap().send(()).ok();
            }

            let mut mask = SigSet::empty();
            mask.add(SIGUSR2);
            sigaction(
                Signal::SIGUSR2,
                &SigAction::new(
                    SigHandler::Handler(handle_sigusr2),
                    SaFlags::SA_RESTART | SaFlags::SA_RESETHAND,
                    mask,
                ),
            )
            .log_err();
        }
    }

    handle_backtrace_signal();
    let main_thread = pthread::pthread_self();

    // Heartbeat channel: the foreground task drains it, the background task fills it.
    let (mut tx, mut rx) = futures::channel::mpsc::channel(3);
    foreground_executor
        .spawn(async move { while rx.next().await.is_some() {} })
        .detach();

    background_executor
        .spawn({
            let background_executor = background_executor.clone();
            async move {
                loop {
                    background_executor.timer(Duration::from_secs(1)).await;
                    match tx.try_send(()) {
                        Ok(_) => continue,
                        Err(e) => {
                            // A full channel means the foreground has stopped draining.
                            if e.into_send_error().is_full() {
                                pthread::pthread_kill(main_thread, SIGUSR2).log_err();
                            }
                            // Only detect the first hang
                            break;
                        }
                    }
                }
            }
        })
        .detach();

    background_executor
        .clone()
        .spawn(async move {
            // `recv` only fails once every sender is gone (e.g. when this
            // function is called a second time and the new sender is
            // discarded). End the task in that case instead of retrying,
            // which would spin forever.
            while backtrace_rx.recv().is_ok() {
                if !telemetry_settings.diagnostics {
                    return;
                }

                // ASYNC SIGNAL SAFETY: This lock is only accessed _after_
                // the backtrace transmitter has fired, which itself is only done
                // by the signal handler. And due to SA_RESETHAND the signal handler
                // will not run again until `handle_backtrace_signal` is called.
                let raw_backtrace = BACKTRACE.lock().drain(..).collect::<Vec<_>>();
                let backtrace: Vec<_> = raw_backtrace
                    .into_iter()
                    .map(|frame| {
                        let mut btf = BacktraceFrame {
                            ip: frame.ip() as usize,
                            symbol_addr: frame.symbol_address() as usize,
                            base: frame.module_base_address().map(|addr| addr as usize),
                            symbols: vec![],
                        };

                        backtrace::resolve_frame(&frame, |symbol| {
                            if let Some(name) = symbol.name() {
                                btf.symbols.push(name.to_string());
                            }
                        });

                        btf
                    })
                    .collect();

                // IMPORTANT: Don't move this to before `BACKTRACE.lock()`
                handle_backtrace_signal();

                log::error!(
                    "Suspected hang on main thread:\n{}",
                    backtrace
                        .iter()
                        .filter_map(|bt| bt.symbols.first().map(String::as_str))
                        .collect::<Vec<_>>()
                        .join("\n")
                );

                let report = HangReport {
                    backtrace,
                    app_version: metadata.app_version,
                    os_name: metadata.os_name.to_owned(),
                    os_version: metadata.os_version,
                    architecture: env::consts::ARCH.into(),
                    installation_id: installation_id.clone(),
                };

                let Some(json_bytes) = serde_json::to_vec(&report).log_err() else {
                    continue;
                };

                let Some(checksum) = client::telemetry::calculate_json_checksum(&json_bytes)
                else {
                    continue;
                };

                let Ok(url) = http_client.build_zed_api_url("/telemetry/hangs", &[]) else {
                    continue;
                };

                let Ok(request) = http::Request::builder()
                    .method(Method::POST)
                    .uri(url.as_ref())
                    .header("x-zed-checksum", checksum)
                    .body(json_bytes.into())
                else {
                    continue;
                };

                if let Some(response) = http_client.send(request).await.log_err() {
                    if response.status() != 200 {
                        log::error!("Failed to send hang report: HTTP {:?}", response.status());
                    }
                }
            }
        })
        .detach()
}
369
370fn upload_panics_and_crashes(
371    http: Arc<HttpClientWithUrl>,
372    installation_id: Option<String>,
373    cx: &mut AppContext,
374) {
375    let telemetry_settings = *client::TelemetrySettings::get_global(cx);
376    cx.background_executor()
377        .spawn(async move {
378            let most_recent_panic = upload_previous_panics(http.clone(), telemetry_settings)
379                .await
380                .log_err()
381                .flatten();
382            upload_previous_crashes(http, most_recent_panic, installation_id, telemetry_settings)
383                .await
384                .log_err()
385        })
386        .detach()
387}
388
389/// Uploads panics via `zed.dev`.
390async fn upload_previous_panics(
391    http: Arc<HttpClientWithUrl>,
392    telemetry_settings: client::TelemetrySettings,
393) -> Result<Option<(i64, String)>> {
394    let panic_report_url = http.build_url("/api/panic");
395    let mut children = smol::fs::read_dir(&*paths::LOGS_DIR).await?;
396
397    let mut most_recent_panic = None;
398
399    while let Some(child) = children.next().await {
400        let child = child?;
401        let child_path = child.path();
402
403        if child_path.extension() != Some(OsStr::new("panic")) {
404            continue;
405        }
406        let filename = if let Some(filename) = child_path.file_name() {
407            filename.to_string_lossy()
408        } else {
409            continue;
410        };
411
412        if !filename.starts_with("zed") {
413            continue;
414        }
415
416        if telemetry_settings.diagnostics {
417            let panic_file_content = smol::fs::read_to_string(&child_path)
418                .await
419                .context("error reading panic file")?;
420
421            let panic: Option<Panic> = serde_json::from_str(&panic_file_content)
422                .ok()
423                .or_else(|| {
424                    panic_file_content
425                        .lines()
426                        .next()
427                        .and_then(|line| serde_json::from_str(line).ok())
428                })
429                .unwrap_or_else(|| {
430                    log::error!("failed to deserialize panic file {:?}", panic_file_content);
431                    None
432                });
433
434            if let Some(panic) = panic {
435                most_recent_panic = Some((panic.panicked_on, panic.payload.clone()));
436
437                let body = serde_json::to_string(&PanicRequest { panic }).unwrap();
438
439                let request = http::Request::post(&panic_report_url)
440                    .redirect_policy(isahc::config::RedirectPolicy::Follow)
441                    .header("Content-Type", "application/json")
442                    .body(body.into())?;
443                let response = http.send(request).await.context("error sending panic")?;
444                if !response.status().is_success() {
445                    log::error!("Error uploading panic to server: {}", response.status());
446                }
447            }
448        }
449
450        // We've done what we can, delete the file
451        std::fs::remove_file(child_path)
452            .context("error removing panic")
453            .log_err();
454    }
455    Ok::<_, anyhow::Error>(most_recent_panic)
456}
457
458static LAST_CRASH_UPLOADED: &'static str = "LAST_CRASH_UPLOADED";
459
460/// upload crashes from apple's diagnostic reports to our server.
461/// (only if telemetry is enabled)
462async fn upload_previous_crashes(
463    http: Arc<HttpClientWithUrl>,
464    most_recent_panic: Option<(i64, String)>,
465    installation_id: Option<String>,
466    telemetry_settings: client::TelemetrySettings,
467) -> Result<()> {
468    if !telemetry_settings.diagnostics {
469        return Ok(());
470    }
471    let last_uploaded = KEY_VALUE_STORE
472        .read_kvp(LAST_CRASH_UPLOADED)?
473        .unwrap_or("zed-2024-01-17-221900.ips".to_string()); // don't upload old crash reports from before we had this.
474    let mut uploaded = last_uploaded.clone();
475
476    let crash_report_url = http.build_zed_api_url("/telemetry/crashes", &[])?;
477
478    // crash directories are only set on MacOS
479    for dir in [&*CRASHES_DIR, &*CRASHES_RETIRED_DIR]
480        .iter()
481        .filter_map(|d| d.as_deref())
482    {
483        let mut children = smol::fs::read_dir(&dir).await?;
484        while let Some(child) = children.next().await {
485            let child = child?;
486            let Some(filename) = child
487                .path()
488                .file_name()
489                .map(|f| f.to_string_lossy().to_lowercase())
490            else {
491                continue;
492            };
493
494            if !filename.starts_with("zed-") || !filename.ends_with(".ips") {
495                continue;
496            }
497
498            if filename <= last_uploaded {
499                continue;
500            }
501
502            let body = smol::fs::read_to_string(&child.path())
503                .await
504                .context("error reading crash file")?;
505
506            let mut request = http::Request::post(&crash_report_url.to_string())
507                .redirect_policy(isahc::config::RedirectPolicy::Follow)
508                .header("Content-Type", "text/plain");
509
510            if let Some((panicked_on, payload)) = most_recent_panic.as_ref() {
511                request = request
512                    .header("x-zed-panicked-on", format!("{}", panicked_on))
513                    .header("x-zed-panic", payload)
514            }
515            if let Some(installation_id) = installation_id.as_ref() {
516                request = request.header("x-zed-installation-id", installation_id);
517            }
518
519            let request = request.body(body.into())?;
520
521            let response = http.send(request).await.context("error sending crash")?;
522            if !response.status().is_success() {
523                log::error!("Error uploading crash to server: {}", response.status());
524            }
525
526            if uploaded < filename {
527                uploaded = filename.clone();
528                KEY_VALUE_STORE
529                    .write_kvp(LAST_CRASH_UPLOADED.to_string(), filename)
530                    .await?;
531            }
532        }
533    }
534
535    Ok(())
536}