Hang diagnostics (#11190)

Conrad Irwin and Mikayla created

Release Notes:

- Added diagnostics for main-thread hangs on macOS. These are only
enabled if you've opted into diagnostics.

---------

Co-authored-by: Mikayla <mikayla@zed.dev>

Change summary

Cargo.lock                                      |   2 
Cargo.toml                                      |   1 
crates/client/src/telemetry.rs                  |  32 
crates/collab/src/api/events.rs                 | 128 ++++
crates/telemetry_events/src/telemetry_events.rs |  18 
crates/zed/Cargo.toml                           |   2 
crates/zed/src/main.rs                          | 358 -----------
crates/zed/src/reliability.rs                   | 536 +++++++++++++++++++
8 files changed, 721 insertions(+), 356 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -12701,6 +12701,7 @@ dependencies = [
  "markdown_preview",
  "menu",
  "mimalloc",
+ "nix 0.28.0",
  "node_runtime",
  "notifications",
  "outline",
@@ -12723,6 +12724,7 @@ dependencies = [
  "tab_switcher",
  "task",
  "tasks_ui",
+ "telemetry_events",
  "terminal_view",
  "theme",
  "theme_selector",

Cargo.toml 🔗

@@ -284,6 +284,7 @@ lazy_static = "1.4.0"
 linkify = "0.10.0"
 log = { version = "0.4.16", features = ["kv_unstable_serde"] }
 nanoid = "0.4"
+nix = "0.28"
 ordered-float = "2.1.1"
 palette = { version = "0.7.5", default-features = false, features = ["std"] }
 parking_lot = "0.12.1"

crates/client/src/telemetry.rs 🔗

@@ -421,7 +421,7 @@ impl Telemetry {
             return;
         }
 
-        let Some(checksum_seed) = &*ZED_CLIENT_CHECKSUM_SEED else {
+        if ZED_CLIENT_CHECKSUM_SEED.is_none() {
             return;
         };
 
@@ -466,15 +466,9 @@ impl Telemetry {
                         serde_json::to_writer(&mut json_bytes, &request_body)?;
                     }
 
-                    let mut summer = Sha256::new();
-                    summer.update(checksum_seed);
-                    summer.update(&json_bytes);
-                    summer.update(checksum_seed);
-                    let mut checksum = String::new();
-                    for byte in summer.finalize().as_slice() {
-                        use std::fmt::Write;
-                        write!(&mut checksum, "{:02x}", byte).unwrap();
-                    }
+                    let Some(checksum) = calculate_json_checksum(&json_bytes) else {
+                        return Ok(());
+                    };
 
                     let request = http::Request::builder()
                         .method(Method::POST)
@@ -657,3 +651,21 @@ mod tests {
             && telemetry.state.lock().first_event_date_time.is_none()
     }
 }
+
+pub fn calculate_json_checksum(json: &impl AsRef<[u8]>) -> Option<String> {
+    let Some(checksum_seed) = &*ZED_CLIENT_CHECKSUM_SEED else {
+        return None;
+    };
+
+    let mut summer = Sha256::new();
+    summer.update(checksum_seed);
+    summer.update(&json);
+    summer.update(checksum_seed);
+    let mut checksum = String::new();
+    for byte in summer.finalize().as_slice() {
+        use std::fmt::Write;
+        write!(&mut checksum, "{:02x}", byte).unwrap();
+    }
+
+    Some(checksum)
+}

crates/collab/src/api/events.rs 🔗

@@ -18,11 +18,15 @@ use telemetry_events::{
     ActionEvent, AppEvent, AssistantEvent, CallEvent, CopilotEvent, CpuEvent, EditEvent,
     EditorEvent, Event, EventRequestBody, EventWrapper, ExtensionEvent, MemoryEvent, SettingEvent,
 };
+use uuid::Uuid;
+
+static CRASH_REPORTS_BUCKET: &str = "zed-crash-reports";
 
 pub fn router() -> Router {
     Router::new()
         .route("/telemetry/events", post(post_events))
         .route("/telemetry/crashes", post(post_crash))
+        .route("/telemetry/hangs", post(post_hang))
 }
 
 pub struct ZedChecksumHeader(Vec<u8>);
@@ -85,8 +89,6 @@ pub async fn post_crash(
     headers: HeaderMap,
     body: Bytes,
 ) -> Result<()> {
-    static CRASH_REPORTS_BUCKET: &str = "zed-crash-reports";
-
     let report = IpsFile::parse(&body)?;
     let version_threshold = SemanticVersion::new(0, 123, 0);
 
@@ -222,6 +224,107 @@ pub async fn post_crash(
     Ok(())
 }
 
+pub async fn post_hang(
+    Extension(app): Extension<Arc<AppState>>,
+    TypedHeader(ZedChecksumHeader(checksum)): TypedHeader<ZedChecksumHeader>,
+    body: Bytes,
+) -> Result<()> {
+    let Some(expected) = calculate_json_checksum(app.clone(), &body) else {
+        return Err(Error::Http(
+            StatusCode::INTERNAL_SERVER_ERROR,
+            "events not enabled".into(),
+        ))?;
+    };
+
+    if checksum != expected {
+        return Err(Error::Http(
+            StatusCode::BAD_REQUEST,
+            "invalid checksum".into(),
+        ))?;
+    }
+
+    let incident_id = Uuid::new_v4().to_string();
+
+    // dump JSON into S3 so we can get frame offsets if we need to.
+    if let Some(blob_store_client) = app.blob_store_client.as_ref() {
+        blob_store_client
+            .put_object()
+            .bucket(CRASH_REPORTS_BUCKET)
+            .key(incident_id.clone() + ".hang.json")
+            .acl(aws_sdk_s3::types::ObjectCannedAcl::PublicRead)
+            .body(ByteStream::from(body.to_vec()))
+            .send()
+            .await
+            .map_err(|e| log::error!("Failed to upload crash: {}", e))
+            .ok();
+    }
+
+    let report: telemetry_events::HangReport = serde_json::from_slice(&body).map_err(|err| {
+        log::error!("can't parse report json: {err}");
+        Error::Internal(anyhow!(err))
+    })?;
+
+    let mut backtrace = "Possible hang detected on main thread:".to_string();
+    let unknown = "<unknown>".to_string();
+    for frame in report.backtrace.iter() {
+        backtrace.push_str(&format!("\n{}", frame.symbols.first().unwrap_or(&unknown)));
+    }
+
+    tracing::error!(
+        service = "client",
+        version = %report.app_version.unwrap_or_default().to_string(),
+        os_name = %report.os_name,
+        os_version = report.os_version.unwrap_or_default().to_string(),
+        incident_id = %incident_id,
+        installation_id = %report.installation_id.unwrap_or_default(),
+        backtrace = %backtrace,
+        "hang report");
+
+    if let Some(slack_panics_webhook) = app.config.slack_panics_webhook.clone() {
+        let payload = slack::WebhookBody::new(|w| {
+            w.add_section(|s| s.text(slack::Text::markdown("Possible Hang".to_string())))
+                .add_section(|s| {
+                    s.add_field(slack::Text::markdown(format!(
+                        "*Version:*\n {} ",
+                        report.app_version.unwrap_or_default()
+                    )))
+                    .add_field({
+                        let hostname = app.config.blob_store_url.clone().unwrap_or_default();
+                        let hostname = hostname.strip_prefix("https://").unwrap_or_else(|| {
+                            hostname.strip_prefix("http://").unwrap_or_default()
+                        });
+
+                        slack::Text::markdown(format!(
+                            "*Incident:*\n<https://{}.{}/{}.hang.json|{}…>",
+                            CRASH_REPORTS_BUCKET,
+                            hostname,
+                            incident_id,
+                            incident_id.chars().take(8).collect::<String>(),
+                        ))
+                    })
+                })
+                .add_rich_text(|r| r.add_preformatted(|p| p.add_text(backtrace)))
+        });
+        let payload_json = serde_json::to_string(&payload).map_err(|err| {
+            log::error!("Failed to serialize payload to JSON: {err}");
+            Error::Internal(anyhow!(err))
+        })?;
+
+        reqwest::Client::new()
+            .post(slack_panics_webhook)
+            .header("Content-Type", "application/json")
+            .body(payload_json)
+            .send()
+            .await
+            .map_err(|err| {
+                log::error!("Failed to send payload to Slack: {err}");
+                Error::Internal(anyhow!(err))
+            })?;
+    }
+
+    Ok(())
+}
+
 pub async fn post_events(
     Extension(app): Extension<Arc<AppState>>,
     TypedHeader(ZedChecksumHeader(checksum)): TypedHeader<ZedChecksumHeader>,
@@ -235,19 +338,14 @@ pub async fn post_events(
         ))?
     };
 
-    let Some(checksum_seed) = app.config.zed_client_checksum_seed.as_ref() else {
+    let Some(expected) = calculate_json_checksum(app.clone(), &body) else {
         return Err(Error::Http(
             StatusCode::INTERNAL_SERVER_ERROR,
             "events not enabled".into(),
         ))?;
     };
 
-    let mut summer = Sha256::new();
-    summer.update(checksum_seed);
-    summer.update(&body);
-    summer.update(checksum_seed);
-
-    if &checksum != &summer.finalize()[..] {
+    if checksum != expected {
         return Err(Error::Http(
             StatusCode::BAD_REQUEST,
             "invalid checksum".into(),
@@ -1061,3 +1159,15 @@ impl ActionEventRow {
         }
     }
 }
+
+pub fn calculate_json_checksum(app: Arc<AppState>, json: &impl AsRef<[u8]>) -> Option<Vec<u8>> {
+    let Some(checksum_seed) = app.config.zed_client_checksum_seed.as_ref() else {
+        return None;
+    };
+
+    let mut summer = Sha256::new();
+    summer.update(checksum_seed);
+    summer.update(&json);
+    summer.update(checksum_seed);
+    Some(summer.finalize().into_iter().collect())
+}

crates/telemetry_events/src/telemetry_events.rs 🔗

@@ -135,3 +135,21 @@ pub struct ExtensionEvent {
 pub struct AppEvent {
     pub operation: String,
 }
+
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct BacktraceFrame {
+    pub ip: usize,
+    pub symbol_addr: usize,
+    pub base: Option<usize>,
+    pub symbols: Vec<String>,
+}
+
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct HangReport {
+    pub backtrace: Vec<BacktraceFrame>,
+    pub app_version: Option<SemanticVersion>,
+    pub os_name: String,
+    pub os_version: Option<SemanticVersion>,
+    pub architecture: String,
+    pub installation_id: Option<String>,
+}

crates/zed/Cargo.toml 🔗

@@ -62,6 +62,7 @@ log.workspace = true
 markdown_preview.workspace = true
 menu.workspace = true
 mimalloc = "0.1"
+nix = {workspace = true, features = ["pthread"] }
 node_runtime.workspace = true
 notifications.workspace = true
 outline.workspace = true
@@ -84,6 +85,7 @@ smol.workspace = true
 tab_switcher.workspace = true
 task.workspace = true
 tasks_ui.workspace = true
+telemetry_events.workspace = true
 terminal_view.workspace = true
 theme.workspace = true
 theme_selector.workspace = true

crates/zed/src/main.rs 🔗

@@ -3,11 +3,10 @@
 // Disable command line from opening on release mode
 #![cfg_attr(not(debug_assertions), windows_subsystem = "windows")]
 
+mod reliability;
 mod zed;
 
 use anyhow::{anyhow, Context as _, Result};
-use backtrace::Backtrace;
-use chrono::Utc;
 use clap::{command, Parser};
 use cli::FORCE_CLI_MODE_ENV_VAR_NAME;
 use client::{parse_zed_link, telemetry::Telemetry, Client, DevServerToken, UserStore};
@@ -19,11 +18,8 @@ use editor::{Editor, EditorMode};
 use env_logger::Builder;
 use fs::RealFs;
 use futures::{future, StreamExt};
-use gpui::{
-    App, AppContext, AsyncAppContext, Context, SemanticVersion, Task, ViewContext, VisualContext,
-};
+use gpui::{App, AppContext, AsyncAppContext, Context, Task, ViewContext, VisualContext};
 use image_viewer;
-use isahc::{prelude::Configurable, Request};
 use language::LanguageRegistry;
 use log::LevelFilter;
 
@@ -31,8 +27,7 @@ use assets::Assets;
 use mimalloc::MiMalloc;
 use node_runtime::RealNodeRuntime;
 use parking_lot::Mutex;
-use release_channel::{AppCommitSha, ReleaseChannel, RELEASE_CHANNEL};
-use serde::{Deserialize, Serialize};
+use release_channel::AppCommitSha;
 use settings::{
     default_settings, handle_settings_file_changes, watch_config_file, Settings, SettingsStore,
 };
@@ -40,22 +35,16 @@ use simplelog::ConfigBuilder;
 use smol::process::Command;
 use std::{
     env,
-    ffi::OsStr,
     fs::OpenOptions,
     io::{IsTerminal, Write},
-    panic,
     path::Path,
-    sync::{
-        atomic::{AtomicU32, Ordering},
-        Arc,
-    },
-    thread,
+    sync::Arc,
 };
 use theme::{ActiveTheme, SystemAppearance, ThemeRegistry, ThemeSettings};
 use util::{
-    http::{HttpClient, HttpClientWithUrl},
+    http::HttpClientWithUrl,
     maybe, parse_env_output,
-    paths::{self, CRASHES_DIR, CRASHES_RETIRED_DIR},
+    paths::{self},
     ResultExt, TryFutureExt,
 };
 use uuid::Uuid;
@@ -93,7 +82,18 @@ fn init_headless(dev_server_token: DevServerToken) {
     }
     init_logger();
 
-    App::new().run(|cx| {
+    let app = App::new();
+
+    let session_id = Uuid::new_v4().to_string();
+    let (installation_id, _) = app
+        .background_executor()
+        .block(installation_id())
+        .ok()
+        .unzip();
+
+    reliability::init_panic_hook(&app, installation_id.clone(), session_id.clone());
+
+    app.run(|cx| {
         release_channel::init(env!("CARGO_PKG_VERSION"), cx);
         if let Some(build_sha) = option_env!("ZED_COMMIT_SHA") {
             AppCommitSha::set_global(AppCommitSha(build_sha.into()), cx);
@@ -145,12 +145,7 @@ fn init_headless(dev_server_token: DevServerToken) {
         );
         handle_settings_file_changes(user_settings_file_rx, cx);
 
-        let (installation_id, _) = cx
-            .background_executor()
-            .block(installation_id())
-            .ok()
-            .unzip();
-        upload_panics_and_crashes(client.http_client(), installation_id, cx);
+        reliability::init(client.http_client(), installation_id, cx);
 
         headless::init(
             client.clone(),
@@ -189,7 +184,7 @@ fn init_ui(args: Args) {
         .ok()
         .unzip();
     let session_id = Uuid::new_v4().to_string();
-    init_panic_hook(&app, installation_id.clone(), session_id.clone());
+    reliability::init_panic_hook(&app, installation_id.clone(), session_id.clone());
 
     let git_binary_path = if option_env!("ZED_BUNDLE").as_deref() == Some("true") {
         app.path_for_auxiliary_executable("git")
@@ -386,7 +381,7 @@ fn init_ui(args: Args) {
         cx.set_menus(app_menus());
         initialize_workspace(app_state.clone(), cx);
 
-        upload_panics_and_crashes(client.http_client(), installation_id, cx);
+        reliability::init(client.http_client(), installation_id, cx);
 
         cx.activate(true);
 
@@ -688,317 +683,6 @@ fn init_stdout_logger() {
         })
         .init();
 }
-
-#[derive(Serialize, Deserialize)]
-struct LocationData {
-    file: String,
-    line: u32,
-}
-
-#[derive(Serialize, Deserialize)]
-struct Panic {
-    thread: String,
-    payload: String,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    location_data: Option<LocationData>,
-    backtrace: Vec<String>,
-    app_version: String,
-    release_channel: String,
-    os_name: String,
-    os_version: Option<String>,
-    architecture: String,
-    panicked_on: i64,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    installation_id: Option<String>,
-    session_id: String,
-}
-
-#[derive(Serialize)]
-struct PanicRequest {
-    panic: Panic,
-}
-
-static PANIC_COUNT: AtomicU32 = AtomicU32::new(0);
-
-fn init_panic_hook(app: &App, installation_id: Option<String>, session_id: String) {
-    let is_pty = stdout_is_a_pty();
-    let app_metadata = app.metadata();
-
-    panic::set_hook(Box::new(move |info| {
-        let prior_panic_count = PANIC_COUNT.fetch_add(1, Ordering::SeqCst);
-        if prior_panic_count > 0 {
-            // Give the panic-ing thread time to write the panic file
-            loop {
-                std::thread::yield_now();
-            }
-        }
-
-        let thread = thread::current();
-        let thread_name = thread.name().unwrap_or("<unnamed>");
-
-        let payload = info
-            .payload()
-            .downcast_ref::<&str>()
-            .map(|s| s.to_string())
-            .or_else(|| info.payload().downcast_ref::<String>().map(|s| s.clone()))
-            .unwrap_or_else(|| "Box<Any>".to_string());
-
-        if *release_channel::RELEASE_CHANNEL == ReleaseChannel::Dev {
-            let location = info.location().unwrap();
-            let backtrace = Backtrace::new();
-            eprintln!(
-                "Thread {:?} panicked with {:?} at {}:{}:{}\n{:?}",
-                thread_name,
-                payload,
-                location.file(),
-                location.line(),
-                location.column(),
-                backtrace,
-            );
-            std::process::exit(-1);
-        }
-
-        let app_version = if let Some(version) = app_metadata.app_version {
-            version.to_string()
-        } else {
-            option_env!("CARGO_PKG_VERSION")
-                .unwrap_or("dev")
-                .to_string()
-        };
-
-        let backtrace = Backtrace::new();
-        let mut backtrace = backtrace
-            .frames()
-            .iter()
-            .flat_map(|frame| {
-                frame
-                    .symbols()
-                    .iter()
-                    .filter_map(|frame| Some(format!("{:#}", frame.name()?)))
-            })
-            .collect::<Vec<_>>();
-
-        // Strip out leading stack frames for rust panic-handling.
-        if let Some(ix) = backtrace
-            .iter()
-            .position(|name| name == "rust_begin_unwind")
-        {
-            backtrace.drain(0..=ix);
-        }
-
-        let panic_data = Panic {
-            thread: thread_name.into(),
-            payload,
-            location_data: info.location().map(|location| LocationData {
-                file: location.file().into(),
-                line: location.line(),
-            }),
-            app_version: app_version.to_string(),
-            release_channel: RELEASE_CHANNEL.display_name().into(),
-            os_name: app_metadata.os_name.into(),
-            os_version: app_metadata
-                .os_version
-                .as_ref()
-                .map(SemanticVersion::to_string),
-            architecture: env::consts::ARCH.into(),
-            panicked_on: Utc::now().timestamp_millis(),
-            backtrace,
-            installation_id: installation_id.clone(),
-            session_id: session_id.clone(),
-        };
-
-        if let Some(panic_data_json) = serde_json::to_string_pretty(&panic_data).log_err() {
-            log::error!("{}", panic_data_json);
-        }
-
-        if !is_pty {
-            if let Some(panic_data_json) = serde_json::to_string(&panic_data).log_err() {
-                let timestamp = chrono::Utc::now().format("%Y_%m_%d %H_%M_%S").to_string();
-                let panic_file_path = paths::LOGS_DIR.join(format!("zed-{}.panic", timestamp));
-                let panic_file = std::fs::OpenOptions::new()
-                    .append(true)
-                    .create(true)
-                    .open(&panic_file_path)
-                    .log_err();
-                if let Some(mut panic_file) = panic_file {
-                    writeln!(&mut panic_file, "{}", panic_data_json).log_err();
-                    panic_file.flush().log_err();
-                }
-            }
-        }
-
-        std::process::abort();
-    }));
-}
-
-fn upload_panics_and_crashes(
-    http: Arc<HttpClientWithUrl>,
-    installation_id: Option<String>,
-    cx: &mut AppContext,
-) {
-    let telemetry_settings = *client::TelemetrySettings::get_global(cx);
-    cx.background_executor()
-        .spawn(async move {
-            let most_recent_panic = upload_previous_panics(http.clone(), telemetry_settings)
-                .await
-                .log_err()
-                .flatten();
-            upload_previous_crashes(http, most_recent_panic, installation_id, telemetry_settings)
-                .await
-                .log_err()
-        })
-        .detach()
-}
-
-/// Uploads panics via `zed.dev`.
-async fn upload_previous_panics(
-    http: Arc<HttpClientWithUrl>,
-    telemetry_settings: client::TelemetrySettings,
-) -> Result<Option<(i64, String)>> {
-    let panic_report_url = http.build_url("/api/panic");
-    let mut children = smol::fs::read_dir(&*paths::LOGS_DIR).await?;
-
-    let mut most_recent_panic = None;
-
-    while let Some(child) = children.next().await {
-        let child = child?;
-        let child_path = child.path();
-
-        if child_path.extension() != Some(OsStr::new("panic")) {
-            continue;
-        }
-        let filename = if let Some(filename) = child_path.file_name() {
-            filename.to_string_lossy()
-        } else {
-            continue;
-        };
-
-        if !filename.starts_with("zed") {
-            continue;
-        }
-
-        if telemetry_settings.diagnostics {
-            let panic_file_content = smol::fs::read_to_string(&child_path)
-                .await
-                .context("error reading panic file")?;
-
-            let panic: Option<Panic> = serde_json::from_str(&panic_file_content)
-                .ok()
-                .or_else(|| {
-                    panic_file_content
-                        .lines()
-                        .next()
-                        .and_then(|line| serde_json::from_str(line).ok())
-                })
-                .unwrap_or_else(|| {
-                    log::error!("failed to deserialize panic file {:?}", panic_file_content);
-                    None
-                });
-
-            if let Some(panic) = panic {
-                most_recent_panic = Some((panic.panicked_on, panic.payload.clone()));
-
-                let body = serde_json::to_string(&PanicRequest { panic }).unwrap();
-
-                let request = Request::post(&panic_report_url)
-                    .redirect_policy(isahc::config::RedirectPolicy::Follow)
-                    .header("Content-Type", "application/json")
-                    .body(body.into())?;
-                let response = http.send(request).await.context("error sending panic")?;
-                if !response.status().is_success() {
-                    log::error!("Error uploading panic to server: {}", response.status());
-                }
-            }
-        }
-
-        // We've done what we can, delete the file
-        std::fs::remove_file(child_path)
-            .context("error removing panic")
-            .log_err();
-    }
-    Ok::<_, anyhow::Error>(most_recent_panic)
-}
-
-static LAST_CRASH_UPLOADED: &'static str = "LAST_CRASH_UPLOADED";
-
-/// upload crashes from apple's diagnostic reports to our server.
-/// (only if telemetry is enabled)
-async fn upload_previous_crashes(
-    http: Arc<HttpClientWithUrl>,
-    most_recent_panic: Option<(i64, String)>,
-    installation_id: Option<String>,
-    telemetry_settings: client::TelemetrySettings,
-) -> Result<()> {
-    if !telemetry_settings.diagnostics {
-        return Ok(());
-    }
-    let last_uploaded = KEY_VALUE_STORE
-        .read_kvp(LAST_CRASH_UPLOADED)?
-        .unwrap_or("zed-2024-01-17-221900.ips".to_string()); // don't upload old crash reports from before we had this.
-    let mut uploaded = last_uploaded.clone();
-
-    let crash_report_url = http.build_zed_api_url("/telemetry/crashes", &[])?;
-
-    // crash directories are only set on MacOS
-    for dir in [&*CRASHES_DIR, &*CRASHES_RETIRED_DIR]
-        .iter()
-        .filter_map(|d| d.as_deref())
-    {
-        let mut children = smol::fs::read_dir(&dir).await?;
-        while let Some(child) = children.next().await {
-            let child = child?;
-            let Some(filename) = child
-                .path()
-                .file_name()
-                .map(|f| f.to_string_lossy().to_lowercase())
-            else {
-                continue;
-            };
-
-            if !filename.starts_with("zed-") || !filename.ends_with(".ips") {
-                continue;
-            }
-
-            if filename <= last_uploaded {
-                continue;
-            }
-
-            let body = smol::fs::read_to_string(&child.path())
-                .await
-                .context("error reading crash file")?;
-
-            let mut request = Request::post(&crash_report_url.to_string())
-                .redirect_policy(isahc::config::RedirectPolicy::Follow)
-                .header("Content-Type", "text/plain");
-
-            if let Some((panicked_on, payload)) = most_recent_panic.as_ref() {
-                request = request
-                    .header("x-zed-panicked-on", format!("{}", panicked_on))
-                    .header("x-zed-panic", payload)
-            }
-            if let Some(installation_id) = installation_id.as_ref() {
-                request = request.header("x-zed-installation-id", installation_id);
-            }
-
-            let request = request.body(body.into())?;
-
-            let response = http.send(request).await.context("error sending crash")?;
-            if !response.status().is_success() {
-                log::error!("Error uploading crash to server: {}", response.status());
-            }
-
-            if uploaded < filename {
-                uploaded = filename.clone();
-                KEY_VALUE_STORE
-                    .write_kvp(LAST_CRASH_UPLOADED.to_string(), filename)
-                    .await?;
-            }
-        }
-    }
-
-    Ok(())
-}
-
 async fn load_login_shell_environment() -> Result<()> {
     let marker = "ZED_LOGIN_SHELL_START";
     let shell = env::var("SHELL").context(

crates/zed/src/reliability.rs 🔗

@@ -0,0 +1,536 @@
+use anyhow::{Context, Result};
+use backtrace::{self, Backtrace};
+use chrono::Utc;
+use db::kvp::KEY_VALUE_STORE;
+use gpui::{App, AppContext, SemanticVersion};
+use isahc::config::Configurable;
+
+use paths::{CRASHES_DIR, CRASHES_RETIRED_DIR};
+use release_channel::ReleaseChannel;
+use release_channel::RELEASE_CHANNEL;
+use serde::{Deserialize, Serialize};
+use settings::Settings;
+use smol::stream::StreamExt;
+use std::{
+    env,
+    ffi::OsStr,
+    sync::{atomic::Ordering, Arc},
+};
+use std::{io::Write, panic, sync::atomic::AtomicU32, thread};
+use util::{
+    http::{self, HttpClient, HttpClientWithUrl},
+    paths, ResultExt,
+};
+
+use crate::stdout_is_a_pty;
+
+#[derive(Serialize, Deserialize)]
+struct LocationData {
+    file: String,
+    line: u32,
+}
+
+#[derive(Serialize, Deserialize)]
+struct Panic {
+    thread: String,
+    payload: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    location_data: Option<LocationData>,
+    backtrace: Vec<String>,
+    app_version: String,
+    release_channel: String,
+    os_name: String,
+    os_version: Option<String>,
+    architecture: String,
+    panicked_on: i64,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    installation_id: Option<String>,
+    session_id: String,
+}
+
+#[derive(Serialize)]
+struct PanicRequest {
+    panic: Panic,
+}
+
+static PANIC_COUNT: AtomicU32 = AtomicU32::new(0);
+
+pub fn init_panic_hook(app: &App, installation_id: Option<String>, session_id: String) {
+    let is_pty = stdout_is_a_pty();
+    let app_metadata = app.metadata();
+
+    panic::set_hook(Box::new(move |info| {
+        let prior_panic_count = PANIC_COUNT.fetch_add(1, Ordering::SeqCst);
+        if prior_panic_count > 0 {
+            // Give the panic-ing thread time to write the panic file
+            loop {
+                std::thread::yield_now();
+            }
+        }
+
+        let thread = thread::current();
+        let thread_name = thread.name().unwrap_or("<unnamed>");
+
+        let payload = info
+            .payload()
+            .downcast_ref::<&str>()
+            .map(|s| s.to_string())
+            .or_else(|| info.payload().downcast_ref::<String>().map(|s| s.clone()))
+            .unwrap_or_else(|| "Box<Any>".to_string());
+
+        if *release_channel::RELEASE_CHANNEL == ReleaseChannel::Dev {
+            let location = info.location().unwrap();
+            let backtrace = Backtrace::new();
+            eprintln!(
+                "Thread {:?} panicked with {:?} at {}:{}:{}\n{:?}",
+                thread_name,
+                payload,
+                location.file(),
+                location.line(),
+                location.column(),
+                backtrace,
+            );
+            std::process::exit(-1);
+        }
+
+        let app_version = if let Some(version) = app_metadata.app_version {
+            version.to_string()
+        } else {
+            option_env!("CARGO_PKG_VERSION")
+                .unwrap_or("dev")
+                .to_string()
+        };
+
+        let backtrace = Backtrace::new();
+        let mut backtrace = backtrace
+            .frames()
+            .iter()
+            .flat_map(|frame| {
+                frame
+                    .symbols()
+                    .iter()
+                    .filter_map(|frame| Some(format!("{:#}", frame.name()?)))
+            })
+            .collect::<Vec<_>>();
+
+        // Strip out leading stack frames for rust panic-handling.
+        if let Some(ix) = backtrace
+            .iter()
+            .position(|name| name == "rust_begin_unwind")
+        {
+            backtrace.drain(0..=ix);
+        }
+
+        let panic_data = Panic {
+            thread: thread_name.into(),
+            payload,
+            location_data: info.location().map(|location| LocationData {
+                file: location.file().into(),
+                line: location.line(),
+            }),
+            app_version: app_version.to_string(),
+            release_channel: RELEASE_CHANNEL.display_name().into(),
+            os_name: app_metadata.os_name.into(),
+            os_version: app_metadata
+                .os_version
+                .as_ref()
+                .map(SemanticVersion::to_string),
+            architecture: env::consts::ARCH.into(),
+            panicked_on: Utc::now().timestamp_millis(),
+            backtrace,
+            installation_id: installation_id.clone(),
+            session_id: session_id.clone(),
+        };
+
+        if let Some(panic_data_json) = serde_json::to_string_pretty(&panic_data).log_err() {
+            log::error!("{}", panic_data_json);
+        }
+
+        if !is_pty {
+            if let Some(panic_data_json) = serde_json::to_string(&panic_data).log_err() {
+                let timestamp = chrono::Utc::now().format("%Y_%m_%d %H_%M_%S").to_string();
+                let panic_file_path = paths::LOGS_DIR.join(format!("zed-{}.panic", timestamp));
+                let panic_file = std::fs::OpenOptions::new()
+                    .append(true)
+                    .create(true)
+                    .open(&panic_file_path)
+                    .log_err();
+                if let Some(mut panic_file) = panic_file {
+                    writeln!(&mut panic_file, "{}", panic_data_json).log_err();
+                    panic_file.flush().log_err();
+                }
+            }
+        }
+
+        std::process::abort();
+    }));
+}
+
+pub fn init(
+    http_client: Arc<HttpClientWithUrl>,
+    installation_id: Option<String>,
+    cx: &mut AppContext,
+) {
+    #[cfg(target_os = "macos")]
+    monitor_main_thread_hangs(http_client.clone(), installation_id.clone(), cx);
+
+    upload_panics_and_crashes(http_client, installation_id, cx)
+}
+
+/// Watches the main thread for hangs and uploads a backtrace of the first one detected.
+///
+/// Does nothing unless the user has opted into diagnostics. Otherwise it wires up three
+/// cooperating pieces:
+///
+/// * a `SIGUSR2` handler that records the current stack into a pre-allocated buffer and
+///   notifies the reporter task;
+/// * a heartbeat: a background task pushes a tick into a bounded channel once per second
+///   while a foreground task drains it. When the channel is full the main thread is
+///   assumed to be stuck and is sent `SIGUSR2`;
+/// * a reporter task that symbolicates the captured frames, logs them, and POSTs a
+///   `HangReport` to `/telemetry/hangs`.
+///
+/// `installation_id` is attached to every report that is sent.
+#[cfg(target_os = "macos")]
+pub fn monitor_main_thread_hangs(
+    http_client: Arc<HttpClientWithUrl>,
+    installation_id: Option<String>,
+    cx: &AppContext,
+) {
+    use nix::sys::signal::{
+        sigaction, SaFlags, SigAction, SigHandler, SigSet,
+        Signal::{self, SIGUSR2},
+    };
+
+    use parking_lot::Mutex;
+
+    use std::{
+        ffi::c_int,
+        sync::{mpsc, OnceLock},
+        time::Duration,
+    };
+    use telemetry_events::{BacktraceFrame, HangReport};
+    use util::http::Method;
+
+    use nix::sys::pthread;
+
+    // Hang reports are diagnostics: without the opt-in there is nothing to report, so
+    // don't install the signal handler or start the watchdog tasks at all.
+    let telemetry_settings = *client::TelemetrySettings::get_global(cx);
+    if !telemetry_settings.diagnostics {
+        return;
+    }
+
+    let foreground_executor = cx.foreground_executor();
+    let background_executor = cx.background_executor();
+    let metadata = cx.app_metadata();
+
+    // Initialize SIGUSR2 handler to send a backtrace to a channel.
+    let (backtrace_tx, backtrace_rx) = mpsc::channel();
+    static BACKTRACE: Mutex<Vec<backtrace::Frame>> = Mutex::new(Vec::new());
+    static BACKTRACE_SENDER: OnceLock<mpsc::Sender<()>> = OnceLock::new();
+    BACKTRACE_SENDER.get_or_init(|| backtrace_tx);
+    // Reserve the frame buffer up front so the signal handler never has to grow it.
+    BACKTRACE.lock().reserve(100);
+
+    /// Installs (or re-installs) the one-shot SIGUSR2 handler that captures a backtrace.
+    fn handle_backtrace_signal() {
+        unsafe {
+            extern "C" fn handle_sigusr2(_i: c_int) {
+                unsafe {
+                    // ASYNC SIGNAL SAFETY: This lock is only accessed one other time,
+                    // which can only be triggered by this signal handler. In addition,
+                    // this signal handler is immediately removed by SA_RESETHAND, and this
+                    // signal handler cannot be re-entrant due to the SIGUSR2 mask defined
+                    // below
+                    let mut bt = BACKTRACE.lock();
+                    bt.clear();
+                    backtrace::trace_unsynchronized(|frame| {
+                        // Stop walking the stack once the reserved capacity is used up.
+                        if bt.len() < bt.capacity() {
+                            bt.push(frame.clone());
+                            true
+                        } else {
+                            false
+                        }
+                    });
+                }
+
+                BACKTRACE_SENDER.get().unwrap().send(()).ok();
+            }
+
+            let mut mask = SigSet::empty();
+            mask.add(SIGUSR2);
+            sigaction(
+                Signal::SIGUSR2,
+                &SigAction::new(
+                    SigHandler::Handler(handle_sigusr2),
+                    SaFlags::SA_RESTART | SaFlags::SA_RESETHAND,
+                    mask,
+                ),
+            )
+            .log_err();
+        }
+    }
+
+    handle_backtrace_signal();
+    let main_thread = pthread::pthread_self();
+
+    // Heartbeat: the foreground task drains ticks as long as the main thread is responsive.
+    let (mut tx, mut rx) = futures::channel::mpsc::channel(3);
+    foreground_executor
+        .spawn(async move { while rx.next().await.is_some() {} })
+        .detach();
+
+    background_executor
+        .spawn({
+            let background_executor = background_executor.clone();
+            async move {
+                loop {
+                    background_executor.timer(Duration::from_secs(1)).await;
+                    match tx.try_send(()) {
+                        Ok(_) => continue,
+                        Err(e) => {
+                            // A full channel means the foreground task has stopped draining
+                            // ticks; interrupt the main thread to capture where it is stuck.
+                            if e.into_send_error().is_full() {
+                                pthread::pthread_kill(main_thread, SIGUSR2).log_err();
+                            }
+                            // Only detect the first hang
+                            break;
+                        }
+                    }
+                }
+            }
+        })
+        .detach();
+
+    background_executor
+        .clone()
+        .spawn(async move {
+            // `recv` only fails once the channel is disconnected, after which it can never
+            // succeed again, so the task ends instead of retrying in a busy loop.
+            // NOTE(review): `recv` blocks the thread this task runs on while it waits.
+            while backtrace_rx.recv().is_ok() {
+                // ASYNC SIGNAL SAFETY: This lock is only accessed _after_
+                // the backtrace transmitter has fired, which itself is only done
+                // by the signal handler. And due to SA_RESETHAND the signal handler
+                // will not run again until `handle_backtrace_signal` is called.
+                let raw_backtrace = BACKTRACE.lock().drain(..).collect::<Vec<_>>();
+                let backtrace: Vec<_> = raw_backtrace
+                    .into_iter()
+                    .map(|frame| {
+                        let mut btf = BacktraceFrame {
+                            ip: frame.ip() as usize,
+                            symbol_addr: frame.symbol_address() as usize,
+                            base: frame.module_base_address().map(|addr| addr as usize),
+                            symbols: vec![],
+                        };
+
+                        backtrace::resolve_frame(&frame, |symbol| {
+                            if let Some(name) = symbol.name() {
+                                btf.symbols.push(name.to_string());
+                            }
+                        });
+
+                        btf
+                    })
+                    .collect();
+
+                // IMPORTANT: Don't move this to before `BACKTRACE.lock()`
+                handle_backtrace_signal();
+
+                log::error!(
+                    "Suspected hang on main thread:\n{}",
+                    backtrace
+                        .iter()
+                        .filter_map(|frame| frame.symbols.first().map(String::as_str))
+                        .collect::<Vec<_>>()
+                        .join("\n")
+                );
+
+                let report = HangReport {
+                    backtrace,
+                    app_version: metadata.app_version,
+                    os_name: metadata.os_name.to_owned(),
+                    os_version: metadata.os_version,
+                    architecture: env::consts::ARCH.into(),
+                    installation_id: installation_id.clone(),
+                };
+
+                let Some(json_bytes) = serde_json::to_vec(&report).log_err() else {
+                    continue;
+                };
+
+                let Some(checksum) = client::telemetry::calculate_json_checksum(&json_bytes)
+                else {
+                    continue;
+                };
+
+                let Ok(url) = http_client.build_zed_api_url("/telemetry/hangs", &[]) else {
+                    continue;
+                };
+
+                let Ok(request) = http::Request::builder()
+                    .method(Method::POST)
+                    .uri(url.as_ref())
+                    .header("x-zed-checksum", checksum)
+                    .body(json_bytes.into())
+                else {
+                    continue;
+                };
+
+                if let Some(response) = http_client.send(request).await.log_err() {
+                    if response.status() != 200 {
+                        log::error!("Failed to send hang report: HTTP {:?}", response.status());
+                    }
+                }
+            }
+        })
+        .detach()
+}
+
+/// Spawns a detached background task that first uploads panic reports from
+/// previous runs and then uploads crash reports, handing the panic returned by
+/// the first step to the second. Errors from either step are logged.
+fn upload_panics_and_crashes(
+    http: Arc<HttpClientWithUrl>,
+    installation_id: Option<String>,
+    cx: &mut AppContext,
+) {
+    let telemetry_settings = *client::TelemetrySettings::get_global(cx);
+    let upload = async move {
+        // A failed panic upload is logged and treated as "no recent panic".
+        let panic_outcome = upload_previous_panics(http.clone(), telemetry_settings).await;
+        let most_recent_panic = panic_outcome.log_err().flatten();
+
+        let crash_outcome =
+            upload_previous_crashes(http, most_recent_panic, installation_id, telemetry_settings)
+                .await;
+        crash_outcome.log_err()
+    };
+    cx.background_executor().spawn(upload).detach()
+}
+
+/// Uploads panics via `zed.dev`.
+///
+/// Scans `paths::LOGS_DIR` for files named `zed*.panic`. When diagnostics are
+/// enabled each file is parsed (either as a whole or from its first line) and
+/// POSTed to `/api/panic`. Every matching file is deleted afterwards, whether or
+/// not it was uploaded.
+///
+/// Returns the `(panicked_on, payload)` of the panic with the greatest
+/// `panicked_on` among those parsed, or `None` if there was none.
+///
+/// # Errors
+///
+/// Fails if the logs directory or a panic file cannot be read, if a panic cannot
+/// be serialized, or if the request cannot be built or sent. The file being
+/// processed at that point is left in place.
+async fn upload_previous_panics(
+    http: Arc<HttpClientWithUrl>,
+    telemetry_settings: client::TelemetrySettings,
+) -> Result<Option<(i64, String)>> {
+    let panic_report_url = http.build_url("/api/panic");
+    let mut children = smol::fs::read_dir(&*paths::LOGS_DIR).await?;
+
+    let mut most_recent_panic: Option<(i64, String)> = None;
+
+    while let Some(child) = children.next().await {
+        let child = child?;
+        let child_path = child.path();
+
+        if child_path.extension() != Some(OsStr::new("panic")) {
+            continue;
+        }
+        let Some(filename) = child_path.file_name() else {
+            continue;
+        };
+
+        if !filename.to_string_lossy().starts_with("zed") {
+            continue;
+        }
+
+        if telemetry_settings.diagnostics {
+            let panic_file_content = smol::fs::read_to_string(&child_path)
+                .await
+                .context("error reading panic file")?;
+
+            // Try the whole file first, then fall back to just its first line.
+            let panic: Option<Panic> = serde_json::from_str(&panic_file_content)
+                .ok()
+                .or_else(|| {
+                    panic_file_content
+                        .lines()
+                        .next()
+                        .and_then(|line| serde_json::from_str(line).ok())
+                })
+                .unwrap_or_else(|| {
+                    log::error!("failed to deserialize panic file {:?}", panic_file_content);
+                    None
+                });
+
+            if let Some(panic) = panic {
+                // Directory iteration order is unspecified, so compare timestamps
+                // rather than assuming the last file seen is the newest.
+                let is_newest = most_recent_panic
+                    .as_ref()
+                    .map_or(true, |(panicked_on, _)| panic.panicked_on >= *panicked_on);
+                if is_newest {
+                    most_recent_panic = Some((panic.panicked_on, panic.payload.clone()));
+                }
+
+                let body = serde_json::to_string(&PanicRequest { panic })
+                    .context("error serializing panic")?;
+
+                let request = http::Request::post(&panic_report_url)
+                    .redirect_policy(isahc::config::RedirectPolicy::Follow)
+                    .header("Content-Type", "application/json")
+                    .body(body.into())?;
+                let response = http.send(request).await.context("error sending panic")?;
+                if !response.status().is_success() {
+                    log::error!("Error uploading panic to server: {}", response.status());
+                }
+            }
+        }
+
+        // We've done what we can, delete the file
+        smol::fs::remove_file(child_path)
+            .await
+            .context("error removing panic")
+            .log_err();
+    }
+    Ok(most_recent_panic)
+}
+
+/// Key under which the file name of the newest uploaded crash report is stored.
+static LAST_CRASH_UPLOADED: &str = "LAST_CRASH_UPLOADED";
+
+/// Uploads crashes from Apple's diagnostic reports to our server
+/// (only if telemetry is enabled).
+///
+/// Every `zed-*.ips` file in the crash directories whose lowercased name sorts
+/// after the last uploaded one is POSTed to `/telemetry/crashes`. When
+/// available, `most_recent_panic` and `installation_id` are attached as
+/// `x-zed-*` headers. The newest uploaded file name is persisted in the
+/// key-value store so it is not uploaded again.
+///
+/// # Errors
+///
+/// Fails if the key-value store, a crash directory, or a crash file cannot be
+/// read, or if a request cannot be built or sent.
+async fn upload_previous_crashes(
+    http: Arc<HttpClientWithUrl>,
+    most_recent_panic: Option<(i64, String)>,
+    installation_id: Option<String>,
+    telemetry_settings: client::TelemetrySettings,
+) -> Result<()> {
+    if !telemetry_settings.diagnostics {
+        return Ok(());
+    }
+    let last_uploaded = KEY_VALUE_STORE
+        .read_kvp(LAST_CRASH_UPLOADED)?
+        // don't upload old crash reports from before we had this.
+        .unwrap_or_else(|| "zed-2024-01-17-221900.ips".to_string());
+    let mut uploaded = last_uploaded.clone();
+
+    // Stringify the endpoint once instead of once per crash file.
+    let crash_report_url = http
+        .build_zed_api_url("/telemetry/crashes", &[])?
+        .to_string();
+
+    // crash directories are only set on MacOS
+    for dir in [&*CRASHES_DIR, &*CRASHES_RETIRED_DIR]
+        .iter()
+        .filter_map(|d| d.as_deref())
+    {
+        let mut children = smol::fs::read_dir(&dir).await?;
+        while let Some(child) = children.next().await {
+            let child_path = child?.path();
+            let Some(filename) = child_path
+                .file_name()
+                .map(|f| f.to_string_lossy().to_lowercase())
+            else {
+                continue;
+            };
+
+            if !filename.starts_with("zed-") || !filename.ends_with(".ips") {
+                continue;
+            }
+
+            // Names are compared as strings; anything at or before the stored
+            // marker is skipped.
+            if filename <= last_uploaded {
+                continue;
+            }
+
+            let body = smol::fs::read_to_string(&child_path)
+                .await
+                .context("error reading crash file")?;
+
+            let mut request = http::Request::post(&crash_report_url)
+                .redirect_policy(isahc::config::RedirectPolicy::Follow)
+                .header("Content-Type", "text/plain");
+
+            if let Some((panicked_on, payload)) = most_recent_panic.as_ref() {
+                request = request
+                    .header("x-zed-panicked-on", panicked_on.to_string())
+                    .header("x-zed-panic", payload)
+            }
+            if let Some(installation_id) = installation_id.as_ref() {
+                request = request.header("x-zed-installation-id", installation_id);
+            }
+
+            let request = request.body(body.into())?;
+
+            let response = http.send(request).await.context("error sending crash")?;
+            if !response.status().is_success() {
+                log::error!("Error uploading crash to server: {}", response.status());
+            }
+
+            // Only ever move the persisted marker forward.
+            if uploaded < filename {
+                uploaded = filename.clone();
+                KEY_VALUE_STORE
+                    .write_kvp(LAST_CRASH_UPLOADED.to_string(), filename)
+                    .await?;
+            }
+        }
+    }
+
+    Ok(())
+}