fix(server): detect and remove a dead socket before starting server

Christian Rocha and Charm Crush created

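The client now probes a leftover Unix socket with a short (200 ms)
dial to check whether a live server is behind it. If the dial is
refused or the socket path is gone, the client removes the dead
socket file and starts a fresh server instead of getting stuck trying
to reach one that is no longer there. The error classification now
lives in an exported helper, server.IsStaleSocketErr, in a file with
no platform build constraint, so it is shared across platforms.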

Co-Authored-By: Charm Crush <crush@charm.land>

Change summary

internal/cmd/root.go               | 22 ++++++++++++++++++++++
internal/server/socket.go          | 22 ++++------------------
internal/server/socket_classify.go | 27 +++++++++++++++++++++++++++
3 files changed, 53 insertions(+), 18 deletions(-)

Detailed changes

internal/cmd/root.go 🔗

@@ -430,6 +430,28 @@ func ensureServer(cmd *cobra.Command, hostURL *url.URL) error {
 		_, statErr := os.Stat(hostURL.Host)
 		switch {
 		case statErr == nil:
+			// Probe the socket explicitly before the version-check
+			// path. A stale unix socket file (the previous server
+			// exited without cleaning up) would otherwise make
+			// restartIfStale spin on a non-responsive endpoint; here
+			// we detect it with a short DialTimeout and remove the
+			// orphaned file so the normal spawn path can run.
+			if hostURL.Scheme == "unix" {
+				conn, dialErr := net.DialTimeout(
+					hostURL.Scheme, hostURL.Host, 200*time.Millisecond,
+				)
+				if dialErr == nil {
+					conn.Close()
+				} else if server.IsStaleSocketErr(dialErr) {
+					slog.Warn("Stale socket detected, removing",
+						"path", hostURL.Host, "error", dialErr)
+					if err := os.Remove(hostURL.Host); err != nil && !errors.Is(err, fs.ErrNotExist) {
+						return fmt.Errorf("failed to remove stale server socket %q: %v", hostURL.Host, err)
+					}
+					needsStart = true
+					break
+				}
+			}
 			restarted, err := restartIfStale(cmd, hostURL)
 			if err != nil {
 				slog.Warn("Failed to check server version", "error", err)

internal/server/socket.go 🔗

@@ -2,23 +2,9 @@
 
 package server
 
-import (
-	"errors"
-	"io/fs"
-	"net"
-	"syscall"
-)
-
-// isStaleSocketErr reports whether err indicates a Unix-domain socket file
-// exists on disk but no process is listening on it (a stale or orphaned
-// socket). It returns false for nil and for timeout errors.
+// isStaleSocketErr is the internal, non-Windows alias for the
+// cross-platform IsStaleSocketErr. It is kept for the existing
+// callers in net_other.go.
 func isStaleSocketErr(err error) bool {
-	if err == nil {
-		return false
-	}
-	var netErr net.Error
-	if errors.As(err, &netErr) && netErr.Timeout() {
-		return false
-	}
-	return errors.Is(err, syscall.ECONNREFUSED) || errors.Is(err, fs.ErrNotExist)
+	return IsStaleSocketErr(err)
 }

internal/server/socket_classify.go 🔗

@@ -0,0 +1,27 @@
+package server
+
+import (
+	"errors"
+	"io/fs"
+	"net"
+	"syscall"
+)
+
+// IsStaleSocketErr reports whether a dial error means no server is
+// reachable at the socket path: the connection was refused
+// (ECONNREFUSED) or the path is missing (fs.ErrNotExist). It returns
+// false for a nil error and for any net.Error that reports a timeout.
+//
+// This file has no build constraint, so the helper builds everywhere.
+// NOTE(review): Windows may report WSAECONNREFUSED rather than
+// syscall.ECONNREFUSED; confirm before relying on it there.
+func IsStaleSocketErr(err error) bool {
+	if err == nil {
+		return false
+	}
+	var netErr net.Error
+	if errors.As(err, &netErr) && netErr.Timeout() {
+		return false
+	}
+	return errors.Is(err, syscall.ECONNREFUSED) || errors.Is(err, fs.ErrNotExist)
+}