Improve Linux panic reporting (#22202)

Created by Cole Miller and Conrad

- [x] Upload separate debug symbols for Linux binaries to DigitalOcean
- [x] Send raw offsets with panic report JSON on Linux
- [x] Update `symbolicate` script to handle Linux crashes
- [x] Demangle backtraces 🎉 
- [x] Check that it works
- [x] Improve deduplication (?)
 
Release Notes:

- N/A

---------

Co-authored-by: Conrad <conrad@zed.dev>

Change summary

.github/workflows/ci.yml                        |  4 +
crates/collab/src/api/events.rs                 | 46 +++++++++++
crates/remote_server/build.rs                   |  4 +
crates/remote_server/src/unix.rs                |  1 +
crates/telemetry_events/src/telemetry_events.rs |  1 +
crates/zed/build.rs                             |  4 +
crates/zed/src/reliability.rs                   | 63 +++++++++++----
script/bundle-linux                             | 25 ++++-
script/symbolicate                              | 76 ++++++++++++------
9 files changed, 171 insertions(+), 53 deletions(-)

Detailed changes

.github/workflows/ci.yml 🔗

@@ -364,6 +364,8 @@ jobs:
     env:
       ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
       ZED_CLOUD_PROVIDER_ADDITIONAL_MODELS_JSON: ${{ secrets.ZED_CLOUD_PROVIDER_ADDITIONAL_MODELS_JSON }}
+      DIGITALOCEAN_SPACES_ACCESS_KEY: ${{ secrets.DIGITALOCEAN_SPACES_ACCESS_KEY }}
+      DIGITALOCEAN_SPACES_SECRET_KEY: ${{ secrets.DIGITALOCEAN_SPACES_SECRET_KEY }}
     steps:
       - name: Checkout repo
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
@@ -410,6 +412,8 @@ jobs:
     env:
       ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
       ZED_CLOUD_PROVIDER_ADDITIONAL_MODELS_JSON: ${{ secrets.ZED_CLOUD_PROVIDER_ADDITIONAL_MODELS_JSON }}
+      DIGITALOCEAN_SPACES_ACCESS_KEY: ${{ secrets.DIGITALOCEAN_SPACES_ACCESS_KEY }}
+      DIGITALOCEAN_SPACES_SECRET_KEY: ${{ secrets.DIGITALOCEAN_SPACES_SECRET_KEY }}
     steps:
       - name: Checkout repo
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4

crates/collab/src/api/events.rs 🔗

@@ -279,6 +279,7 @@ pub async fn post_panic(
 
     let report: telemetry_events::PanicRequest = serde_json::from_slice(&body)
         .map_err(|_| Error::http(StatusCode::BAD_REQUEST, "invalid json".into()))?;
+    let incident_id = uuid::Uuid::new_v4().to_string();
     let panic = report.panic;
 
     if panic.os_name == "Linux" && panic.os_version == Some("1.0.0".to_string()) {
@@ -288,11 +289,37 @@ pub async fn post_panic(
         ))?;
     }
 
+    if let Some(blob_store_client) = app.blob_store_client.as_ref() {
+        let response = blob_store_client
+            .head_object()
+            .bucket(CRASH_REPORTS_BUCKET)
+            .key(incident_id.clone() + ".json")
+            .send()
+            .await;
+
+        if response.is_ok() {
+            log::info!("We've already uploaded this crash");
+            return Ok(());
+        }
+
+        blob_store_client
+            .put_object()
+            .bucket(CRASH_REPORTS_BUCKET)
+            .key(incident_id.clone() + ".json")
+            .acl(aws_sdk_s3::types::ObjectCannedAcl::PublicRead)
+            .body(ByteStream::from(body.to_vec()))
+            .send()
+            .await
+            .map_err(|e| log::error!("Failed to upload crash: {}", e))
+            .ok();
+    }
+
     tracing::error!(
         service = "client",
         version = %panic.app_version,
         os_name = %panic.os_name,
         os_version = %panic.os_version.clone().unwrap_or_default(),
+        incident_id = %incident_id,
         installation_id = %panic.installation_id.clone().unwrap_or_default(),
         description = %panic.payload,
         backtrace = %panic.backtrace.join("\n"),
@@ -331,10 +358,19 @@ pub async fn post_panic(
                         panic.app_version
                     )))
                     .add_field({
+                        let hostname = app.config.blob_store_url.clone().unwrap_or_default();
+                        let hostname = hostname.strip_prefix("https://").unwrap_or_else(|| {
+                            hostname.strip_prefix("http://").unwrap_or_default()
+                        });
+
                         slack::Text::markdown(format!(
-                            "*OS:*\n{} {}",
+                            "*{} {}:*\n<https://{}.{}/{}.json|{}…>",
                             panic.os_name,
-                            panic.os_version.unwrap_or_default()
+                            panic.os_version.unwrap_or_default(),
+                            CRASH_REPORTS_BUCKET,
+                            hostname,
+                            incident_id,
+                            incident_id.chars().take(8).collect::<String>(),
                         ))
                     })
                 })
@@ -361,6 +397,12 @@ pub async fn post_panic(
 }
 
 fn report_to_slack(panic: &Panic) -> bool {
+    // Panics on macOS should make their way to Slack as a crash report,
+    // so we don't need to send them a second time via this channel.
+    if panic.os_name == "macOS" {
+        return false;
+    }
+
     if panic.payload.contains("ERROR_SURFACE_LOST_KHR") {
         return false;
     }

crates/remote_server/build.rs 🔗

@@ -9,6 +9,10 @@ fn main() {
         "cargo:rustc-env=ZED_PKG_VERSION={}",
         zed_cargo_toml.package.unwrap().version.unwrap()
     );
+    println!(
+        "cargo:rustc-env=TARGET={}",
+        std::env::var("TARGET").unwrap()
+    );
 
     // If we're building this for nightly, we want to set the ZED_COMMIT_SHA
     if let Some(release_channel) = std::env::var("ZED_RELEASE_CHANNEL").ok() {

crates/remote_server/src/unix.rs 🔗

@@ -160,6 +160,7 @@ fn init_panic_hook() {
                 option_env!("ZED_COMMIT_SHA").unwrap_or(&env!("ZED_PKG_VERSION"))
             ),
             release_channel: release_channel::RELEASE_CHANNEL.display_name().into(),
+            target: env!("TARGET").to_owned().into(),
             os_name: telemetry::os_name(),
             os_version: Some(telemetry::os_version()),
             architecture: env::consts::ARCH.into(),

crates/telemetry_events/src/telemetry_events.rs 🔗

@@ -269,6 +269,7 @@ pub struct Panic {
     pub app_version: String,
     /// Zed release channel (stable, preview, dev)
     pub release_channel: String,
+    pub target: Option<String>,
     pub os_name: String,
     pub os_version: Option<String>,
     pub architecture: String,

crates/zed/build.rs 🔗

@@ -33,6 +33,10 @@ fn main() {
 
     // Populate git sha environment variable if git is available
     println!("cargo:rerun-if-changed=../../.git/logs/HEAD");
+    println!(
+        "cargo:rustc-env=TARGET={}",
+        std::env::var("TARGET").unwrap()
+    );
     if let Ok(output) = Command::new("git").args(["rev-parse", "HEAD"]).output() {
         if output.status.success() {
             let git_sha = String::from_utf8_lossy(&output.stdout);

crates/zed/src/reliability.rs 🔗

@@ -1,31 +1,26 @@
+use crate::stdout_is_a_pty;
 use anyhow::{Context, Result};
 use backtrace::{self, Backtrace};
 use chrono::Utc;
 use client::{telemetry, TelemetrySettings};
 use db::kvp::KEY_VALUE_STORE;
 use gpui::{AppContext, SemanticVersion};
-use http_client::{HttpRequestExt, Method};
-
-use http_client::{self, HttpClient, HttpClientWithUrl};
+use http_client::{self, HttpClient, HttpClientWithUrl, HttpRequestExt, Method};
 use paths::{crashes_dir, crashes_retired_dir};
 use project::Project;
-use release_channel::ReleaseChannel;
-use release_channel::RELEASE_CHANNEL;
+use release_channel::{ReleaseChannel, RELEASE_CHANNEL};
 use settings::Settings;
 use smol::stream::StreamExt;
 use std::{
     env,
-    ffi::OsStr,
+    ffi::{c_void, OsStr},
     sync::{atomic::Ordering, Arc},
 };
 use std::{io::Write, panic, sync::atomic::AtomicU32, thread};
-use telemetry_events::LocationData;
-use telemetry_events::Panic;
-use telemetry_events::PanicRequest;
+use telemetry_events::{LocationData, Panic, PanicRequest};
 use url::Url;
 use util::ResultExt;
 
-use crate::stdout_is_a_pty;
 static PANIC_COUNT: AtomicU32 = AtomicU32::new(0);
 
 pub fn init_panic_hook(
@@ -69,25 +64,35 @@ pub fn init_panic_hook(
             );
             std::process::exit(-1);
         }
+        let main_module_base_address = get_main_module_base_address();
 
         let backtrace = Backtrace::new();
-        let mut backtrace = backtrace
+        let mut symbols = backtrace
             .frames()
             .iter()
             .flat_map(|frame| {
-                frame
-                    .symbols()
-                    .iter()
-                    .filter_map(|frame| Some(format!("{:#}", frame.name()?)))
+                let base = frame
+                    .module_base_address()
+                    .unwrap_or(main_module_base_address);
+                frame.symbols().iter().map(move |symbol| {
+                    format!(
+                        "{}+{}",
+                        symbol
+                            .name()
+                            .as_ref()
+                            .map_or("<unknown>".to_owned(), <_>::to_string),
+                        (frame.ip() as isize).saturating_sub(base as isize)
+                    )
+                })
             })
             .collect::<Vec<_>>();
 
         // Strip out leading stack frames for rust panic-handling.
-        if let Some(ix) = backtrace
+        if let Some(ix) = symbols
             .iter()
             .position(|name| name == "rust_begin_unwind" || name == "_rust_begin_unwind")
         {
-            backtrace.drain(0..=ix);
+            symbols.drain(0..=ix);
         }
 
         let panic_data = telemetry_events::Panic {
@@ -98,12 +103,13 @@ pub fn init_panic_hook(
                 line: location.line(),
             }),
             app_version: app_version.to_string(),
-            release_channel: RELEASE_CHANNEL.display_name().into(),
+            release_channel: RELEASE_CHANNEL.dev_name().into(),
+            target: env!("TARGET").to_owned().into(),
             os_name: telemetry::os_name(),
             os_version: Some(telemetry::os_version()),
             architecture: env::consts::ARCH.into(),
             panicked_on: Utc::now().timestamp_millis(),
-            backtrace,
+            backtrace: symbols,
             system_id: system_id.clone(),
             installation_id: installation_id.clone(),
             session_id: session_id.clone(),
@@ -133,6 +139,25 @@ pub fn init_panic_hook(
     }));
 }
 
+#[cfg(not(target_os = "windows"))]
+fn get_main_module_base_address() -> *mut c_void {
+    let mut dl_info = libc::Dl_info {
+        dli_fname: std::ptr::null(),
+        dli_fbase: std::ptr::null_mut(),
+        dli_sname: std::ptr::null(),
+        dli_saddr: std::ptr::null_mut(),
+    };
+    unsafe {
+        libc::dladdr(get_main_module_base_address as _, &mut dl_info);
+    }
+    dl_info.dli_fbase
+}
+
+#[cfg(target_os = "windows")]
+fn get_main_module_base_address() -> *mut c_void {
+    std::ptr::null_mut()
+}
+
 pub fn init(
     http_client: Arc<HttpClientWithUrl>,
     system_id: Option<String>,

script/bundle-linux 🔗

@@ -1,6 +1,7 @@
 #!/usr/bin/env bash
 
 set -euxo pipefail
+source script/lib/blob-store.sh
 
 # Function for displaying help info
 help_info() {
@@ -61,12 +62,24 @@ if [[ "$remote_server_triple" == "$musl_triple" ]]; then
 fi
 cargo build --release --target "${remote_server_triple}" --package remote_server
 
-# Strip the binary of all debug symbols
-# Later, we probably want to do something like this: https://github.com/GabrielMajeri/separate-symbols
-strip --strip-debug "${target_dir}/${target_triple}/release/zed"
-strip --strip-debug "${target_dir}/${target_triple}/release/cli"
-strip --strip-debug "${target_dir}/${remote_server_triple}/release/remote_server"
-
+# Strip debug symbols and save them for upload to DigitalOcean
+objcopy --only-keep-debug "${target_dir}/${target_triple}/release/zed" "${target_dir}/${target_triple}/release/zed.dbg"
+objcopy --only-keep-debug "${target_dir}/${remote_server_triple}/release/remote_server" "${target_dir}/${remote_server_triple}/release/remote_server.dbg"
+objcopy --strip-debug "${target_dir}/${target_triple}/release/zed"
+objcopy --strip-debug "${target_dir}/${target_triple}/release/cli"
+objcopy --strip-debug "${target_dir}/${remote_server_triple}/release/remote_server"
+
+gzip "${target_dir}/${target_triple}/release/zed.dbg"
+upload_to_blob_store_public \
+    "zed-debug-symbols" \
+    "${target_dir}/${target_triple}/release/zed.dbg.gz" \
+    "$channel/zed-$version-${target_triple}.dbg.gz"
+
+gzip "${target_dir}/${remote_server_triple}/release/remote_server.dbg"
+upload_to_blob_store_public \
+    "zed-debug-symbols" \
+    "${target_dir}/${remote_server_triple}/release/remote_server.dbg.gz" \
+    "$channel/remote_server-$version-${remote_server_triple}.dbg.gz"
 
 # Ensure that remote_server does not depend on libssl nor libcrypto, as we got rid of these deps.
 if ldd "${target_dir}/${remote_server_triple}/release/remote_server" | grep -q 'libcrypto\|libssl'; then

script/symbolicate 🔗

@@ -2,40 +2,64 @@
 
 set -eu
 if [[ $# -eq 0 ]] || [[ "$1" == "--help" ]]; then
-  echo "Usage: $(basename $0) <path_to_ips_file>"
-  echo "This script symbolicates the provided .ips file using the appropriate dSYM file from digital ocean"
+  echo "Usage: $(basename $0) <path_to_ips_file_or_json>"
+  echo "This script symbolicates the provided .ips file or .json panic report using the appropriate debug symbols from DigitalOcean"
   echo ""
   exit 1
 fi
 
-ips_file=$1;
+input_file=$1;
 
-version=$(cat $ips_file | head -n 1 | jq -r .app_version)
-bundle_id=$(cat $ips_file | head -n 1 | jq -r .bundleID)
-cpu_type=$(cat $ips_file | tail -n+2 | jq -r .cpuType)
+if [[ "$input_file" == *.json ]]; then
+    version=$(cat $input_file | jq -r .app_version)
+    channel=$(cat $input_file | jq -r .release_channel)
+    target_triple=$(cat $input_file | jq -r .target)
 
-which symbolicate >/dev/null || cargo install symbolicate
+    which llvm-symbolizer rustfilt >dev/null || echo Need to install llvm-symbolizer and rustfilt
 
-arch="x86_64-apple-darwin"
-if [[ "$cpu_type" == *ARM-64* ]]; then
-    arch="aarch64-apple-darwin"
-fi
-echo $bundle_id;
+    echo $channel;
 
-channel="stable"
-if [[ "$bundle_id" == *Nightly* ]]; then
-    channel="nightly"
-elif [[ "$bundle_id" == *Preview* ]]; then
-    channel="preview"
-fi
+    mkdir -p target/dsyms/$channel
 
-mkdir -p target/dsyms/$channel
+    dsym="$channel/zed-$version-$target_triple.dbg"
+    if [[ ! -f target/dsyms/$dsym ]]; then
+        echo "Downloading $dsym..."
+        curl -o target/dsyms/$dsym.gz "https://zed-debug-symbols.nyc3.digitaloceanspaces.com/$dsym.gz"
+        gunzip  target/dsyms/$dsym.gz
+    fi
 
-dsym="$channel/Zed-$version-$arch.dwarf"
-if [[ ! -f target/dsyms/$dsym ]]; then
-    echo "Downloading $dsym..."
-    curl -o target/dsyms/$dsym.gz "https://zed-debug-symbols.nyc3.digitaloceanspaces.com/$channel/Zed-$version-$arch.dwarf.gz"
-    gunzip  target/dsyms/$dsym.gz
-fi
+    cat $input_file | jq -r .backtrace[] | sed s'/.*+//' | llvm-symbolizer --no-demangle --obj=target/dsyms/$dsym | rustfilt
+
+else # ips file
+
+    version=$(cat $input_file | head -n 1 | jq -r .app_version)
+    bundle_id=$(cat $input_file | head -n 1 | jq -r .bundleID)
+    cpu_type=$(cat $input_file | tail -n+2 | jq -r .cpuType)
+
+    which symbolicate >/dev/null || cargo install symbolicate
 
-symbolicate $ips_file target/dsyms/$dsym
+    arch="x86_64-apple-darwin"
+    if [[ "$cpu_type" == *ARM-64* ]]; then
+        arch="aarch64-apple-darwin"
+    fi
+    echo $bundle_id;
+
+    channel="stable"
+    if [[ "$bundle_id" == *Nightly* ]]; then
+        channel="nightly"
+    elif [[ "$bundle_id" == *Preview* ]]; then
+        channel="preview"
+    fi
+
+    mkdir -p target/dsyms/$channel
+
+    dsym="$channel/Zed-$version-$arch.dwarf"
+    if [[ ! -f target/dsyms/$dsym ]]; then
+        echo "Downloading $dsym..."
+        curl -o target/dsyms/$dsym.gz "https://zed-debug-symbols.nyc3.digitaloceanspaces.com/$channel/Zed-$version-$arch.dwarf.gz"
+        gunzip  target/dsyms/$dsym.gz
+    fi
+
+    symbolicate $input_file target/dsyms/$dsym
+
+fi