From 61e7746d4c513affc5de86a8e6a9876f658a2a2e Mon Sep 17 00:00:00 2001 From: Conrad Irwin Date: Mon, 23 Feb 2026 21:44:20 -0700 Subject: [PATCH] Fix wasmtime panic handling (#49944) We see a number of crashes in Sentry that appear to be crashes in wasmtime. This shouldn't happen, as wasmtime is designed to run untrusted code "safely". Looking into this, it seems likely that the problem is that we race with wasmtime when installing signal handlers. If wasmtime's handlers are installed before ours, then any signals that it intends to handle (like out of bounds memory access) will reach our handlers before its; which causes us to assume the app has crashed. This changes fixes our crash handler initialization to ensure we always create our signal handler first, and reverts a previous attempt to fix this from #40883 Closes #ISSUE Before you mark this PR as ready for review, make sure that you have: - [ ] Added a solid test coverage and/or screenshots from doing manual testing - [ ] Done a self-review taking into account security and performance aspects - [ ] Aligned any UI changes with the [UI checklist](https://github.com/zed-industries/zed/blob/main/CONTRIBUTING.md#uiux-checklist) Release Notes: - Linux: Fixed crashes that could happen due to our crash handler erroneously catching signals intended for wasmtime. --- Cargo.lock | 2 +- crates/crashes/Cargo.toml | 3 +- crates/crashes/src/crashes.rs | 140 +++++++++++++------------ crates/extension_host/src/wasm_host.rs | 16 +-- crates/remote_server/src/server.rs | 31 +++--- crates/zed/src/main.rs | 11 +- 6 files changed, 103 insertions(+), 100 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 430cbe0f580e9ad94767441fca687d69ce404099..964fce6bf3acaadff8a539df9937c84b1d0bdb74 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4086,7 +4086,7 @@ dependencies = [ "bincode", "cfg-if", "crash-handler", - "extension_host", + "futures 0.3.31", "log", "mach2 0.5.0", "minidumper", diff --git a/crates/crashes/Cargo.toml b/crates/crashes/Cargo.toml index bd1c1121848e34349b5cd58c0fa033d380fa791b..5e451853a925d86ffcc1491a5c95af1f94e6ed05 100644 --- a/crates/crashes/Cargo.toml +++ b/crates/crashes/Cargo.toml @@ -9,9 +9,10 @@ license = "GPL-3.0-or-later" bincode.workspace = true cfg-if.workspace = true crash-handler.workspace = true -extension_host.workspace = true +futures.workspace = true log.workspace = true minidumper.workspace = true + paths.workspace = true release_channel.workspace = true smol.workspace = true diff --git a/crates/crashes/src/crashes.rs b/crates/crashes/src/crashes.rs index 967b2b846461a701f377ceacefbbaaa9d811091d..a1a43dbb88198b7afd4b89141f7578c0a5bc25ce 100644 --- a/crates/crashes/src/crashes.rs +++ b/crates/crashes/src/crashes.rs @@ -1,8 +1,10 @@ use crash_handler::{CrashEventResult, CrashHandler}; +use futures::future::BoxFuture; use log::info; use minidumper::{Client, LoopAction, MinidumpBinary}; use release_channel::{RELEASE_CHANNEL, ReleaseChannel}; use serde::{Deserialize, Serialize}; +use std::mem; #[cfg(not(target_os = "windows"))] use smol::process::Command; @@ -34,43 +36,76 @@ const CRASH_HANDLER_CONNECT_TIMEOUT: Duration = Duration::from_secs(10); #[cfg(target_os = "macos")] static PANIC_THREAD_ID: AtomicU32 = AtomicU32::new(0); -pub async fn init(crash_init: InitCrashHandler) { - let gen_var = match env::var("ZED_GENERATE_MINIDUMPS") { - Ok(v) => { - if v == "false" || v == "0" { - Some(false) - } else { - Some(true) - } - } - Err(_) => None, - }; +fn should_install_crash_handler() -> bool { + if let Ok(value) = env::var("ZED_GENERATE_MINIDUMPS") { + return value == "true" || value == "1"; + } - match (gen_var, *RELEASE_CHANNEL) { - (Some(false), _) | (None, ReleaseChannel::Dev) => { - let old_hook = panic::take_hook(); - panic::set_hook(Box::new(move |info| { - unsafe { env::set_var("RUST_BACKTRACE", "1") }; - old_hook(info); - // prevent the macOS crash dialog from popping up - if cfg!(target_os = "macos") { - std::process::exit(1); - } - })); - return; - } - _ => { - panic::set_hook(Box::new(panic_hook)); - } + if *RELEASE_CHANNEL == ReleaseChannel::Dev { + return false; + } + + true +} + +/// Install crash signal handlers and spawn the crash-handler subprocess. +/// +/// The synchronous portion (signal handlers, panic hook) runs inline. +/// The async keepalive task is passed to `spawn` so the caller decides +/// which executor to schedule it on. +pub fn init(crash_init: InitCrashHandler, spawn: impl FnOnce(BoxFuture<'static, ()>)) { + if !should_install_crash_handler() { + let old_hook = panic::take_hook(); + panic::set_hook(Box::new(move |info| { + unsafe { env::set_var("RUST_BACKTRACE", "1") }; + old_hook(info); + // prevent the macOS crash dialog from popping up + if cfg!(target_os = "macos") { + std::process::exit(1); + } + })); + return; } + panic::set_hook(Box::new(panic_hook)); + + let handler = CrashHandler::attach(unsafe { + crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| { + let Some(client) = CRASH_HANDLER.get() else { + return CrashEventResult::Handled(false); + }; + + // only request a minidump once + let res = if REQUESTED_MINIDUMP + .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed) + .is_ok() + { + #[cfg(target_os = "macos")] + suspend_all_other_threads(); + + // on macos this "ping" is needed to ensure that all our + // `client.send_message` calls have been processed before we trigger the + // minidump request. + client.ping().ok(); + client.request_dump(crash_context).is_ok() + } else { + true + }; + CrashEventResult::Handled(res) + }) + }) + .expect("failed to attach signal handler"); + + info!("crash signal handlers installed"); + + spawn(Box::pin(connect_and_keepalive(crash_init, handler))); +} + +/// Spawn the crash-handler subprocess, connect the IPC client, and run the +/// keepalive ping loop. Called on a background executor by [`init`]. +async fn connect_and_keepalive(crash_init: InitCrashHandler, handler: CrashHandler) { let exe = env::current_exe().expect("unable to find ourselves"); let zed_pid = process::id(); - // TODO: we should be able to get away with using 1 crash-handler process per machine, - // but for now we append the PID of the current process which makes it unique per remote - // server or interactive zed instance. This solves an issue where occasionally the socket - // used by the crash handler isn't destroyed correctly which causes it to stay on the file - // system and block further attempts to initialize crash handlers with that socket path. let socket_name = paths::temp_dir().join(format!("zed-crash-handler-{zed_pid}")); #[cfg(not(target_os = "windows"))] let _crash_handler = Command::new(exe) @@ -82,8 +117,6 @@ pub async fn init(crash_init: InitCrashHandler) { #[cfg(target_os = "windows")] spawn_crash_handler_windows(&exe, &socket_name); - #[cfg(target_os = "linux")] - let server_pid = _crash_handler.id(); info!("spawning crash handler process"); let mut elapsed = Duration::ZERO; @@ -106,36 +139,15 @@ pub async fn init(crash_init: InitCrashHandler) { .unwrap(); let client = Arc::new(client); - let handler = CrashHandler::attach(unsafe { - let client = client.clone(); - crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| { - // only request a minidump once - let res = if REQUESTED_MINIDUMP - .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed) - .is_ok() - { - #[cfg(target_os = "macos")] - suspend_all_other_threads(); - - // on macos this "ping" is needed to ensure that all our - // `client.send_message` calls have been processed before we trigger the - // minidump request. - client.ping().ok(); - client.request_dump(crash_context).is_ok() - } else { - true - }; - CrashEventResult::Handled(res) - }) - }) - .expect("failed to attach signal handler"); #[cfg(target_os = "linux")] - { - handler.set_ptracer(Some(server_pid)); - } + handler.set_ptracer(Some(_crash_handler.id())); + + // Publishing the client to the OnceLock makes it visible to the signal + // handler callback installed earlier. CRASH_HANDLER.set(client.clone()).ok(); - std::mem::forget(handler); + // mem::forget so that the drop is not called + mem::forget(handler); info!("crash handler registered"); loop { @@ -300,12 +312,6 @@ impl minidumper::ServerHandler for CrashServer { } pub fn panic_hook(info: &PanicHookInfo) { - // Don't handle a panic on threads that are not relevant to the main execution. - if extension_host::wasm_host::IS_WASM_THREAD.with(|v| v.load(Ordering::Acquire)) { - log::error!("wasm thread panicked!"); - return; - } - let message = info.payload_as_str().unwrap_or("Box").to_owned(); let span = info diff --git a/crates/extension_host/src/wasm_host.rs b/crates/extension_host/src/wasm_host.rs index a5994f8438236b39ae9bf9b600557f3f55360cfc..fe3c11de3ae78115b8e5db08884b7e07be152324 100644 --- a/crates/extension_host/src/wasm_host.rs +++ b/crates/extension_host/src/wasm_host.rs @@ -33,10 +33,7 @@ use settings::Settings; use std::{ borrow::Cow, path::{Path, PathBuf}, - sync::{ - Arc, LazyLock, OnceLock, - atomic::{AtomicBool, Ordering}, - }, + sync::{Arc, LazyLock, OnceLock}, time::Duration, }; use task::{DebugScenario, SpawnInTerminal, TaskTemplate, ZedDebugConfig}; @@ -498,11 +495,6 @@ pub struct WasmState { pub(crate) capability_granter: CapabilityGranter, } -std::thread_local! { - /// Used by the crash handler to ignore panics in extension-related threads. - pub static IS_WASM_THREAD: AtomicBool = const { AtomicBool::new(false) }; -} - type MainThreadCall = Box FnOnce(&'a mut AsyncApp) -> LocalBoxFuture<'a, ()>>; type ExtensionCall = Box< @@ -656,12 +648,6 @@ impl WasmHost { let (tx, mut rx) = mpsc::unbounded::(); let extension_task = async move { - // note: Setting the thread local here will slowly "poison" all tokio threads - // causing us to not record their panics any longer. - // - // This is fine though, the main zed binary only uses tokio for livekit and wasm extensions. - // Livekit seldom (if ever) panics 🤞 so the likelihood of us missing a panic in sentry is very low. - IS_WASM_THREAD.with(|v| v.store(true, Ordering::Release)); while let Some(call) = rx.next().await { (call)(&mut extension, &mut store).await; } diff --git a/crates/remote_server/src/server.rs b/crates/remote_server/src/server.rs index 6f0cf3003309988bdc276e5e8ed9d34ed4c64c81..6784f5fc1d221989aeaf1ecbd34da65f8f923a87 100644 --- a/crates/remote_server/src/server.rs +++ b/crates/remote_server/src/server.rs @@ -452,15 +452,18 @@ pub fn execute_run( let app = gpui_platform::headless(); let pid = std::process::id(); let id = pid.to_string(); - app.background_executor() - .spawn(crashes::init(crashes::InitCrashHandler { + crashes::init( + crashes::InitCrashHandler { session_id: id, zed_version: VERSION.to_owned(), binary: "zed-remote-server".to_string(), release_channel: release_channel::RELEASE_CHANNEL_NAME.clone(), commit_sha: option_env!("ZED_COMMIT_SHA").unwrap_or("no_sha").to_owned(), - })) - .detach(); + }, + |task| { + app.background_executor().spawn(task).detach(); + }, + ); let log_rx = init_logging_server(&log_file)?; log::info!( "starting up with PID {}:\npid_file: {:?}, log_file: {:?}, stdin_socket: {:?}, stdout_socket: {:?}, stderr_socket: {:?}", @@ -704,14 +707,18 @@ pub(crate) fn execute_proxy( let server_paths = ServerPaths::new(&identifier)?; let id = std::process::id().to_string(); - smol::spawn(crashes::init(crashes::InitCrashHandler { - session_id: id, - zed_version: VERSION.to_owned(), - binary: "zed-remote-server".to_string(), - release_channel: release_channel::RELEASE_CHANNEL_NAME.clone(), - commit_sha: option_env!("ZED_COMMIT_SHA").unwrap_or("no_sha").to_owned(), - })) - .detach(); + crashes::init( + crashes::InitCrashHandler { + session_id: id, + zed_version: VERSION.to_owned(), + binary: "zed-remote-server".to_string(), + release_channel: release_channel::RELEASE_CHANNEL_NAME.clone(), + commit_sha: option_env!("ZED_COMMIT_SHA").unwrap_or("no_sha").to_owned(), + }, + |task| { + smol::spawn(task).detach(); + }, + ); log::info!("starting proxy process. PID: {}", std::process::id()); let server_pid = { diff --git a/crates/zed/src/main.rs b/crates/zed/src/main.rs index 9a76cee4079053953dd263439de66b29749ec425..95ff6f03b1b7902e254c5e405c5d8b50e1f48773 100644 --- a/crates/zed/src/main.rs +++ b/crates/zed/src/main.rs @@ -332,8 +332,8 @@ fn main() { .background_executor() .spawn(Session::new(session_id.clone())); - app.background_executor() - .spawn(crashes::init(InitCrashHandler { + crashes::init( + InitCrashHandler { session_id, zed_version: app_version.to_string(), binary: "zed".to_string(), @@ -342,8 +342,11 @@ fn main() { .as_ref() .map(|sha| sha.full()) .unwrap_or_else(|| "no sha".to_owned()), - })) - .detach(); + }, + |task| { + app.background_executor().spawn(task).detach(); + }, + ); let (open_listener, mut open_rx) = OpenListener::new();