remote: Fix being unable to reconnect when the remote server dies (#47200)

Lukas Wirth created

Closes https://github.com/zed-industries/zed/issues/38522

Release Notes:

- Fixed remote connections getting stuck in limbo when the server side
dies unexpectedly

Change summary

crates/project/src/project.rs                      |   8 
crates/recent_projects/src/disconnected_overlay.rs |  29 ++-
crates/remote/src/proxy.rs                         |  17 +-
crates/remote/src/remote_client.rs                 |   6 
crates/remote/src/transport.rs                     |  12 +
crates/remote_server/src/main.rs                   |  15 +
crates/remote_server/src/unix.rs                   | 116 ++++++++++-----
crates/workspace/src/workspace.rs                  |   4 
8 files changed, 138 insertions(+), 69 deletions(-)

Detailed changes

crates/project/src/project.rs 🔗

@@ -333,7 +333,9 @@ pub enum Event {
     },
     RemoteIdChanged(Option<u64>),
     DisconnectedFromHost,
-    DisconnectedFromRemote,
+    DisconnectedFromRemote {
+        server_not_running: bool,
+    },
     Closed,
     DeletedEntry(WorktreeId, ProjectEntryId),
     CollaboratorUpdated {
@@ -3321,7 +3323,7 @@ impl Project {
         cx: &mut Context<Self>,
     ) {
         match event {
-            remote::RemoteClientEvent::Disconnected => {
+            &remote::RemoteClientEvent::Disconnected { server_not_running } => {
                 self.worktree_store.update(cx, |store, cx| {
                     store.disconnected_from_host(cx);
                 });
@@ -3331,7 +3333,7 @@ impl Project {
                 self.lsp_store.update(cx, |lsp_store, _cx| {
                     lsp_store.disconnected_from_ssh_remote()
                 });
-                cx.emit(Event::DisconnectedFromRemote);
+                cx.emit(Event::DisconnectedFromRemote { server_not_running });
             }
         }
     }

crates/recent_projects/src/disconnected_overlay.rs 🔗

@@ -13,7 +13,7 @@ use crate::open_remote_project;
 
 enum Host {
     CollabGuestProject,
-    RemoteServerProject(RemoteConnectionOptions),
+    RemoteServerProject(RemoteConnectionOptions, bool),
 }
 
 pub struct DisconnectedOverlay {
@@ -57,7 +57,8 @@ impl DisconnectedOverlay {
             |workspace, project, event, window, cx| {
                 if !matches!(
                     event,
-                    project::Event::DisconnectedFromHost | project::Event::DisconnectedFromRemote
+                    project::Event::DisconnectedFromHost
+                        | project::Event::DisconnectedFromRemote { .. }
                 ) {
                     return;
                 }
@@ -65,7 +66,15 @@ impl DisconnectedOverlay {
 
                 let remote_connection_options = project.read(cx).remote_connection_options(cx);
                 let host = if let Some(remote_connection_options) = remote_connection_options {
-                    Host::RemoteServerProject(remote_connection_options)
+                    Host::RemoteServerProject(
+                        remote_connection_options,
+                        matches!(
+                            event,
+                            project::Event::DisconnectedFromRemote {
+                                server_not_running: true
+                            }
+                        ),
+                    )
                 } else {
                     Host::CollabGuestProject
                 };
@@ -85,7 +94,7 @@ impl DisconnectedOverlay {
         self.finished = true;
         cx.emit(DismissEvent);
 
-        if let Host::RemoteServerProject(remote_connection_options) = &self.host {
+        if let Host::RemoteServerProject(remote_connection_options, _) = &self.host {
             self.reconnect_to_remote_project(remote_connection_options.clone(), window, cx);
         }
     }
@@ -137,13 +146,13 @@ impl DisconnectedOverlay {
 
 impl Render for DisconnectedOverlay {
     fn render(&mut self, _: &mut Window, cx: &mut Context<Self>) -> impl IntoElement {
-        let can_reconnect = matches!(self.host, Host::RemoteServerProject(_));
+        let can_reconnect = matches!(self.host, Host::RemoteServerProject(..));
 
         let message = match &self.host {
             Host::CollabGuestProject => {
                 "Your connection to the remote project has been lost.".to_string()
             }
-            Host::RemoteServerProject(options) => {
+            Host::RemoteServerProject(options, server_not_running) => {
                 let autosave = if ProjectSettings::get_global(cx)
                     .session
                     .restore_unsaved_buffers
@@ -152,10 +161,14 @@ impl Render for DisconnectedOverlay {
                 } else {
                     ""
                 };
+                let reason = if *server_not_running {
+                    "process exiting unexpectedly"
+                } else {
+                    "not responding"
+                };
                 format!(
-                    "Your connection to {} has been lost.{}",
+                    "Your connection to {} has been lost due to the server {reason}.{autosave}",
                     options.display_name(),
-                    autosave
                 )
             }
         };

crates/remote/src/proxy.rs 🔗

@@ -1,19 +1,18 @@
 use thiserror::Error;
 
-#[derive(Error, Debug)]
+#[derive(Copy, Clone, Error, Debug)]
+#[repr(i32)]
 pub enum ProxyLaunchError {
+    // We're using 90 as the exit code, because 0-78 are often taken
+    // by shells and other conventions and >128 also has certain meanings
+    // in certain contexts.
     #[error("Attempted reconnect, but server not running.")]
-    ServerNotRunning,
+    ServerNotRunning = 90,
 }
 
 impl ProxyLaunchError {
-    pub fn to_exit_code(&self) -> i32 {
-        match self {
-            // We're using 90 as the exit code, because 0-78 are often taken
-            // by shells and other conventions and >128 also has certain meanings
-            // in certain contexts.
-            Self::ServerNotRunning => 90,
-        }
+    pub fn to_exit_code(self) -> i32 {
+        self as i32
     }
 
     pub fn from_exit_code(exit_code: i32) -> Option<Self> {

crates/remote/src/remote_client.rs 🔗

@@ -324,7 +324,7 @@ pub struct RemoteClient {
 
 #[derive(Debug)]
 pub enum RemoteClientEvent {
-    Disconnected,
+    Disconnected { server_not_running: bool },
 }
 
 impl EventEmitter<RemoteClientEvent> for RemoteClient {}
@@ -881,7 +881,9 @@ impl RemoteClient {
         self.state.replace(state);
 
         if is_reconnect_exhausted || is_server_not_running {
-            cx.emit(RemoteClientEvent::Disconnected);
+            cx.emit(RemoteClientEvent::Disconnected {
+                server_not_running: is_server_not_running,
+            });
         }
         cx.notify();
     }

crates/remote/src/transport.rs 🔗

@@ -159,10 +159,14 @@ fn handle_rpc_messages_over_child_process_stdio(
                 result.context("stderr")
             }
         };
-        let status = remote_proxy_process.status().await?.code().unwrap_or(1);
-        if status != 0 {
-            anyhow::bail!("Remote server exited with status {status}");
-        }
+        let exit_status = remote_proxy_process.status().await?;
+        let status = exit_status.code().unwrap_or_else(|| {
+            #[cfg(unix)]
+            let status = std::os::unix::process::ExitStatusExt::signal(&exit_status).unwrap_or(1);
+            #[cfg(not(unix))]
+            let status = 1;
+            status
+        });
         match result {
             Ok(_) => Ok(status),
             Err(error) => Err(error),

crates/remote_server/src/main.rs 🔗

@@ -40,7 +40,20 @@ fn main() -> anyhow::Result<()> {
 
     #[cfg(not(windows))]
     if let Some(command) = cli.command {
-        remote_server::run(command)
+        use remote_server::unix::ExecuteProxyError;
+
+        let res = remote_server::run(command);
+        if let Err(e) = &res
+            && let Some(e) = e.downcast_ref::<ExecuteProxyError>()
+        {
+            eprintln!("{e:#}");
+            // It is important for us to report the proxy spawn exit code here
+            // instead of the generic 1 that result returns
+            // The client reads the exit code to determine if the server process has died when trying to reconnect
+            // signaling that it needs to try spawning a new server
+            std::process::exit(e.to_exit_code());
+        }
+        res
     } else {
         eprintln!("usage: remote <run|proxy|version>");
         std::process::exit(1);

crates/remote_server/src/unix.rs 🔗

@@ -296,7 +296,7 @@ fn start_server(
 
                     stdin_message = stdin_msg_rx.next().fuse() => {
                         let Some(message) = stdin_message else {
-                            log::warn!("error reading message on stdin. exiting.");
+                            log::warn!("error reading message on stdin, dropping connection.");
                             break;
                         };
                         if let Err(error) = incoming_tx.unbounded_send(message) {
@@ -378,7 +378,8 @@ pub fn execute_run(
     }
 
     let app = gpui::Application::headless();
-    let id = std::process::id().to_string();
+    let pid = std::process::id();
+    let id = pid.to_string();
     app.background_executor()
         .spawn(crashes::init(crashes::InitCrashHandler {
             session_id: id,
@@ -390,7 +391,8 @@ pub fn execute_run(
         .detach();
     let log_rx = init_logging_server(&log_file)?;
     log::info!(
-        "starting up. pid_file: {:?}, log_file: {:?}, stdin_socket: {:?}, stdout_socket: {:?}, stderr_socket: {:?}",
+        "starting up with PID {}:\npid_file: {:?}, log_file: {:?}, stdin_socket: {:?}, stdout_socket: {:?}, stderr_socket: {:?}",
+        pid,
         pid_file,
         log_file,
         stdin_socket,
@@ -398,7 +400,7 @@ pub fn execute_run(
         stderr_socket
     );
 
-    write_pid_file(&pid_file)
+    write_pid_file(&pid_file, pid)
         .with_context(|| format!("failed to write pid file: {:?}", &pid_file))?;
 
     let listeners = ServerListeners::new(stdin_socket, stdout_socket, stderr_socket)?;
@@ -519,7 +521,7 @@ pub fn execute_run(
 }
 
 #[derive(Debug, Error)]
-pub(crate) enum ServerPathError {
+pub enum ServerPathError {
     #[error("Failed to create server_dir `{path}`")]
     CreateServerDir {
         #[source]
@@ -575,7 +577,7 @@ impl ServerPaths {
 }
 
 #[derive(Debug, Error)]
-pub(crate) enum ExecuteProxyError {
+pub enum ExecuteProxyError {
     #[error("Failed to init server paths")]
     ServerPath(#[from] ServerPathError),
 
@@ -607,6 +609,17 @@ pub(crate) enum ExecuteProxyError {
     StderrTask(#[source] anyhow::Error),
 }
 
+impl ExecuteProxyError {
+    pub fn to_exit_code(&self) -> i32 {
+        match self {
+            ExecuteProxyError::ServerNotRunning(proxy_launch_error) => {
+                proxy_launch_error.to_exit_code()
+            }
+            _ => 1,
+        }
+    }
+}
+
 pub(crate) fn execute_proxy(
     identifier: String,
     is_reconnecting: bool,
@@ -626,35 +639,51 @@ pub(crate) fn execute_proxy(
     .detach();
 
     log::info!("starting proxy process. PID: {}", std::process::id());
-    smol::block_on(async {
+    let server_pid = smol::block_on(async {
         let server_pid = check_pid_file(&server_paths.pid_file)
             .await
             .map_err(|source| ExecuteProxyError::CheckPidFile {
                 source,
                 path: server_paths.pid_file.clone(),
             })?;
-        let server_running = server_pid.is_some();
         if is_reconnecting {
-            if !server_running {
-                log::error!("attempted to reconnect, but no server running");
-                return Err(ExecuteProxyError::ServerNotRunning(
-                    ProxyLaunchError::ServerNotRunning,
-                ));
+            match server_pid {
+                None => {
+                    log::error!("attempted to reconnect, but no server running");
+                    Err(ExecuteProxyError::ServerNotRunning(
+                        ProxyLaunchError::ServerNotRunning,
+                    ))
+                }
+                Some(server_pid) => Ok(server_pid),
             }
         } else {
-            if let Some(pid) = server_pid {
-                log::info!(
-                    "proxy found server already running with PID {}. Killing process and cleaning up files...",
-                    pid
-                );
-                kill_running_server(pid, &server_paths).await?;
+            match server_pid {
+                Some(pid) => {
+                    log::info!(
+                        "proxy found server already running with PID {}. Killing process and cleaning up files...",
+                        pid
+                    );
+                    kill_running_server(pid, &server_paths).await?;
+                    Ok(pid)
+                }
+                None => {
+                    spawn_server(&server_paths)
+                        .await
+                        .map_err(ExecuteProxyError::SpawnServer)?;
+                    std::fs::read_to_string(&server_paths.pid_file)
+                        .and_then(|contents| {
+                            contents.parse::<u32>().map_err(|_| {
+                                std::io::Error::new(
+                                    std::io::ErrorKind::InvalidData,
+                                    "Invalid PID file contents",
+                                )
+                            })
+                        })
+                        .map_err(SpawnServerError::ProcessStatus)
+                        .map_err(ExecuteProxyError::SpawnServer)
+                }
             }
-
-            spawn_server(&server_paths)
-                .await
-                .map_err(ExecuteProxyError::SpawnServer)?;
-        };
-        Ok(())
+        }
     })?;
 
     let stdin_task = smol::spawn(async move {
@@ -699,10 +728,13 @@ pub(crate) fn execute_proxy(
             result = stderr_task.fuse() => result.map_err(ExecuteProxyError::StderrTask),
         }
     }) {
-        log::error!(
-            "encountered error while forwarding messages: {:?}, terminating...",
-            forwarding_result
-        );
+        log::error!("encountered error while forwarding messages: {forwarding_result:#}",);
+        if !matches!(smol::block_on(check_server_running(server_pid)), Ok(true)) {
+            log::error!("server exited unexpectedly");
+            return Err(ExecuteProxyError::ServerNotRunning(
+                ProxyLaunchError::ServerNotRunning,
+            ));
+        }
         return Err(forwarding_result);
     }
 
@@ -730,7 +762,7 @@ async fn kill_running_server(pid: u32, paths: &ServerPaths) -> Result<(), Execut
 }
 
 #[derive(Debug, Error)]
-pub(crate) enum SpawnServerError {
+pub enum SpawnServerError {
     #[error("failed to remove stdin socket")]
     RemoveStdinSocket(#[source] std::io::Error),
 
@@ -819,11 +851,19 @@ async fn spawn_server(paths: &ServerPaths) -> Result<(), SpawnServerError> {
 
 #[derive(Debug, Error)]
 #[error("Failed to remove PID file for missing process (pid `{pid}`")]
-pub(crate) struct CheckPidError {
+pub struct CheckPidError {
     #[source]
     source: std::io::Error,
     pid: u32,
 }
+async fn check_server_running(pid: u32) -> std::io::Result<bool> {
+    new_smol_command("kill")
+        .arg("-0")
+        .arg(pid.to_string())
+        .output()
+        .await
+        .map(|output| output.status.success())
+}
 
 async fn check_pid_file(path: &Path) -> Result<Option<u32>, CheckPidError> {
     let Some(pid) = std::fs::read_to_string(&path)
@@ -834,13 +874,8 @@ async fn check_pid_file(path: &Path) -> Result<Option<u32>, CheckPidError> {
     };
 
     log::debug!("Checking if process with PID {} exists...", pid);
-    match new_smol_command("kill")
-        .arg("-0")
-        .arg(pid.to_string())
-        .output()
-        .await
-    {
-        Ok(output) if output.status.success() => {
+    match check_server_running(pid).await {
+        Ok(true) => {
             log::debug!(
                 "Process with PID {} exists. NOT spawning new server, but attaching to existing one.",
                 pid
@@ -857,13 +892,12 @@ async fn check_pid_file(path: &Path) -> Result<Option<u32>, CheckPidError> {
     }
 }
 
-fn write_pid_file(path: &Path) -> Result<()> {
+fn write_pid_file(path: &Path, pid: u32) -> Result<()> {
     if path.exists() {
         std::fs::remove_file(path)?;
     }
-    let pid = std::process::id().to_string();
     log::debug!("writing PID {} to file {:?}", pid, path);
-    std::fs::write(path, pid).context("Failed to write PID file")
+    std::fs::write(path, pid.to_string()).context("Failed to write PID file")
 }
 
 async fn handle_io<R, W>(mut reader: R, mut writer: W, socket_name: &str) -> Result<()>

crates/workspace/src/workspace.rs 🔗

@@ -1323,7 +1323,9 @@ impl Workspace {
                     }
                 }
 
-                project::Event::DisconnectedFromRemote => {
+                project::Event::DisconnectedFromRemote {
+                    server_not_running: _,
+                } => {
                     this.update_window_edited(window, cx);
                 }