Fix wasmtime panic handling (#49944)

Conrad Irwin created

We see a number of crashes in Sentry that appear to be crashes in
wasmtime.
This shouldn't happen, as wasmtime is designed to run untrusted code
"safely".

Looking into this, it seems likely that the problem is that we race with
wasmtime
when installing signal handlers. If wasmtime's handlers are installed
before ours,
then any signals that it intends to handle (like out of bounds memory
access) will
reach our handlers before its; which causes us to assume the app has
crashed.

This change fixes our crash handler initialization to ensure we always
create
our signal handler first, and reverts a previous attempt to fix this
from #40883

Closes #ISSUE

Before you mark this PR as ready for review, make sure that you have:
- [ ] Added a solid test coverage and/or screenshots from doing manual
testing
- [ ] Done a self-review taking into account security and performance
aspects
- [ ] Aligned any UI changes with the [UI
checklist](https://github.com/zed-industries/zed/blob/main/CONTRIBUTING.md#uiux-checklist)

Release Notes:

- Linux: Fixed crashes that could happen due to our crash handler
erroneously catching signals intended for wasmtime.

Change summary

Cargo.lock                             |   2 
crates/crashes/Cargo.toml              |   3 
crates/crashes/src/crashes.rs          | 140 ++++++++++++++-------------
crates/extension_host/src/wasm_host.rs |  16 ---
crates/remote_server/src/server.rs     |  31 +++--
crates/zed/src/main.rs                 |  11 +
6 files changed, 103 insertions(+), 100 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -4086,7 +4086,7 @@ dependencies = [
  "bincode",
  "cfg-if",
  "crash-handler",
- "extension_host",
+ "futures 0.3.31",
  "log",
  "mach2 0.5.0",
  "minidumper",

crates/crashes/Cargo.toml 🔗

@@ -9,9 +9,10 @@ license = "GPL-3.0-or-later"
 bincode.workspace = true
 cfg-if.workspace = true
 crash-handler.workspace = true
-extension_host.workspace = true
+futures.workspace = true
 log.workspace = true
 minidumper.workspace = true
+
 paths.workspace = true
 release_channel.workspace = true
 smol.workspace = true

crates/crashes/src/crashes.rs 🔗

@@ -1,8 +1,10 @@
 use crash_handler::{CrashEventResult, CrashHandler};
+use futures::future::BoxFuture;
 use log::info;
 use minidumper::{Client, LoopAction, MinidumpBinary};
 use release_channel::{RELEASE_CHANNEL, ReleaseChannel};
 use serde::{Deserialize, Serialize};
+use std::mem;
 
 #[cfg(not(target_os = "windows"))]
 use smol::process::Command;
@@ -34,43 +36,76 @@ const CRASH_HANDLER_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
 #[cfg(target_os = "macos")]
 static PANIC_THREAD_ID: AtomicU32 = AtomicU32::new(0);
 
-pub async fn init(crash_init: InitCrashHandler) {
-    let gen_var = match env::var("ZED_GENERATE_MINIDUMPS") {
-        Ok(v) => {
-            if v == "false" || v == "0" {
-                Some(false)
-            } else {
-                Some(true)
-            }
-        }
-        Err(_) => None,
-    };
+fn should_install_crash_handler() -> bool {
+    if let Ok(value) = env::var("ZED_GENERATE_MINIDUMPS") {
+        return value == "true" || value == "1";
+    }
 
-    match (gen_var, *RELEASE_CHANNEL) {
-        (Some(false), _) | (None, ReleaseChannel::Dev) => {
-            let old_hook = panic::take_hook();
-            panic::set_hook(Box::new(move |info| {
-                unsafe { env::set_var("RUST_BACKTRACE", "1") };
-                old_hook(info);
-                // prevent the macOS crash dialog from popping up
-                if cfg!(target_os = "macos") {
-                    std::process::exit(1);
-                }
-            }));
-            return;
-        }
-        _ => {
-            panic::set_hook(Box::new(panic_hook));
-        }
+    if *RELEASE_CHANNEL == ReleaseChannel::Dev {
+        return false;
+    }
+
+    true
+}
+
+/// Install crash signal handlers and spawn the crash-handler subprocess.
+///
+/// The synchronous portion (signal handlers, panic hook) runs inline.
+/// The async keepalive task is passed to `spawn` so the caller decides
+/// which executor to schedule it on.
+pub fn init(crash_init: InitCrashHandler, spawn: impl FnOnce(BoxFuture<'static, ()>)) {
+    if !should_install_crash_handler() {
+        let old_hook = panic::take_hook();
+        panic::set_hook(Box::new(move |info| {
+            unsafe { env::set_var("RUST_BACKTRACE", "1") };
+            old_hook(info);
+            // prevent the macOS crash dialog from popping up
+            if cfg!(target_os = "macos") {
+                std::process::exit(1);
+            }
+        }));
+        return;
     }
 
+    panic::set_hook(Box::new(panic_hook));
+
+    let handler = CrashHandler::attach(unsafe {
+        crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| {
+            let Some(client) = CRASH_HANDLER.get() else {
+                return CrashEventResult::Handled(false);
+            };
+
+            // only request a minidump once
+            let res = if REQUESTED_MINIDUMP
+                .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
+                .is_ok()
+            {
+                #[cfg(target_os = "macos")]
+                suspend_all_other_threads();
+
+                // on macos this "ping" is needed to ensure that all our
+                // `client.send_message` calls have been processed before we trigger the
+                // minidump request.
+                client.ping().ok();
+                client.request_dump(crash_context).is_ok()
+            } else {
+                true
+            };
+            CrashEventResult::Handled(res)
+        })
+    })
+    .expect("failed to attach signal handler");
+
+    info!("crash signal handlers installed");
+
+    spawn(Box::pin(connect_and_keepalive(crash_init, handler)));
+}
+
+/// Spawn the crash-handler subprocess, connect the IPC client, and run the
+/// keepalive ping loop. Called on a background executor by [`init`].
+async fn connect_and_keepalive(crash_init: InitCrashHandler, handler: CrashHandler) {
     let exe = env::current_exe().expect("unable to find ourselves");
     let zed_pid = process::id();
-    // TODO: we should be able to get away with using 1 crash-handler process per machine,
-    // but for now we append the PID of the current process which makes it unique per remote
-    // server or interactive zed instance. This solves an issue where occasionally the socket
-    // used by the crash handler isn't destroyed correctly which causes it to stay on the file
-    // system and block further attempts to initialize crash handlers with that socket path.
     let socket_name = paths::temp_dir().join(format!("zed-crash-handler-{zed_pid}"));
     #[cfg(not(target_os = "windows"))]
     let _crash_handler = Command::new(exe)
@@ -82,8 +117,6 @@ pub async fn init(crash_init: InitCrashHandler) {
     #[cfg(target_os = "windows")]
     spawn_crash_handler_windows(&exe, &socket_name);
 
-    #[cfg(target_os = "linux")]
-    let server_pid = _crash_handler.id();
     info!("spawning crash handler process");
 
     let mut elapsed = Duration::ZERO;
@@ -106,36 +139,15 @@ pub async fn init(crash_init: InitCrashHandler) {
         .unwrap();
 
     let client = Arc::new(client);
-    let handler = CrashHandler::attach(unsafe {
-        let client = client.clone();
-        crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| {
-            // only request a minidump once
-            let res = if REQUESTED_MINIDUMP
-                .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
-                .is_ok()
-            {
-                #[cfg(target_os = "macos")]
-                suspend_all_other_threads();
-
-                // on macos this "ping" is needed to ensure that all our
-                // `client.send_message` calls have been processed before we trigger the
-                // minidump request.
-                client.ping().ok();
-                client.request_dump(crash_context).is_ok()
-            } else {
-                true
-            };
-            CrashEventResult::Handled(res)
-        })
-    })
-    .expect("failed to attach signal handler");
 
     #[cfg(target_os = "linux")]
-    {
-        handler.set_ptracer(Some(server_pid));
-    }
+    handler.set_ptracer(Some(_crash_handler.id()));
+
+    // Publishing the client to the OnceLock makes it visible to the signal
+    // handler callback installed earlier.
     CRASH_HANDLER.set(client.clone()).ok();
-    std::mem::forget(handler);
+    // mem::forget so that the drop is not called
+    mem::forget(handler);
     info!("crash handler registered");
 
     loop {
@@ -300,12 +312,6 @@ impl minidumper::ServerHandler for CrashServer {
 }
 
 pub fn panic_hook(info: &PanicHookInfo) {
-    // Don't handle a panic on threads that are not relevant to the main execution.
-    if extension_host::wasm_host::IS_WASM_THREAD.with(|v| v.load(Ordering::Acquire)) {
-        log::error!("wasm thread panicked!");
-        return;
-    }
-
     let message = info.payload_as_str().unwrap_or("Box<Any>").to_owned();
 
     let span = info

crates/extension_host/src/wasm_host.rs 🔗

@@ -33,10 +33,7 @@ use settings::Settings;
 use std::{
     borrow::Cow,
     path::{Path, PathBuf},
-    sync::{
-        Arc, LazyLock, OnceLock,
-        atomic::{AtomicBool, Ordering},
-    },
+    sync::{Arc, LazyLock, OnceLock},
     time::Duration,
 };
 use task::{DebugScenario, SpawnInTerminal, TaskTemplate, ZedDebugConfig};
@@ -498,11 +495,6 @@ pub struct WasmState {
     pub(crate) capability_granter: CapabilityGranter,
 }
 
-std::thread_local! {
-    /// Used by the crash handler to ignore panics in extension-related threads.
-    pub static IS_WASM_THREAD: AtomicBool = const { AtomicBool::new(false) };
-}
-
 type MainThreadCall = Box<dyn Send + for<'a> FnOnce(&'a mut AsyncApp) -> LocalBoxFuture<'a, ()>>;
 
 type ExtensionCall = Box<
@@ -656,12 +648,6 @@ impl WasmHost {
 
             let (tx, mut rx) = mpsc::unbounded::<ExtensionCall>();
             let extension_task = async move {
-                // note: Setting the thread local here will slowly "poison" all tokio threads
-                // causing us to not record their panics any longer.
-                //
-                // This is fine though, the main zed binary only uses tokio for livekit and wasm extensions.
-                // Livekit seldom (if ever) panics 🤞 so the likelihood of us missing a panic in sentry is very low.
-                IS_WASM_THREAD.with(|v| v.store(true, Ordering::Release));
                 while let Some(call) = rx.next().await {
                     (call)(&mut extension, &mut store).await;
                 }

crates/remote_server/src/server.rs 🔗

@@ -452,15 +452,18 @@ pub fn execute_run(
     let app = gpui_platform::headless();
     let pid = std::process::id();
     let id = pid.to_string();
-    app.background_executor()
-        .spawn(crashes::init(crashes::InitCrashHandler {
+    crashes::init(
+        crashes::InitCrashHandler {
             session_id: id,
             zed_version: VERSION.to_owned(),
             binary: "zed-remote-server".to_string(),
             release_channel: release_channel::RELEASE_CHANNEL_NAME.clone(),
             commit_sha: option_env!("ZED_COMMIT_SHA").unwrap_or("no_sha").to_owned(),
-        }))
-        .detach();
+        },
+        |task| {
+            app.background_executor().spawn(task).detach();
+        },
+    );
     let log_rx = init_logging_server(&log_file)?;
     log::info!(
         "starting up with PID {}:\npid_file: {:?}, log_file: {:?}, stdin_socket: {:?}, stdout_socket: {:?}, stderr_socket: {:?}",
@@ -704,14 +707,18 @@ pub(crate) fn execute_proxy(
     let server_paths = ServerPaths::new(&identifier)?;
 
     let id = std::process::id().to_string();
-    smol::spawn(crashes::init(crashes::InitCrashHandler {
-        session_id: id,
-        zed_version: VERSION.to_owned(),
-        binary: "zed-remote-server".to_string(),
-        release_channel: release_channel::RELEASE_CHANNEL_NAME.clone(),
-        commit_sha: option_env!("ZED_COMMIT_SHA").unwrap_or("no_sha").to_owned(),
-    }))
-    .detach();
+    crashes::init(
+        crashes::InitCrashHandler {
+            session_id: id,
+            zed_version: VERSION.to_owned(),
+            binary: "zed-remote-server".to_string(),
+            release_channel: release_channel::RELEASE_CHANNEL_NAME.clone(),
+            commit_sha: option_env!("ZED_COMMIT_SHA").unwrap_or("no_sha").to_owned(),
+        },
+        |task| {
+            smol::spawn(task).detach();
+        },
+    );
 
     log::info!("starting proxy process. PID: {}", std::process::id());
     let server_pid = {

crates/zed/src/main.rs 🔗

@@ -332,8 +332,8 @@ fn main() {
         .background_executor()
         .spawn(Session::new(session_id.clone()));
 
-    app.background_executor()
-        .spawn(crashes::init(InitCrashHandler {
+    crashes::init(
+        InitCrashHandler {
             session_id,
             zed_version: app_version.to_string(),
             binary: "zed".to_string(),
@@ -342,8 +342,11 @@ fn main() {
                 .as_ref()
                 .map(|sha| sha.full())
                 .unwrap_or_else(|| "no sha".to_owned()),
-        }))
-        .detach();
+        },
+        |task| {
+            app.background_executor().spawn(task).detach();
+        },
+    );
 
     let (open_listener, mut open_rx) = OpenListener::new();