From 443d0ab339d92bf9181f1054a2c760e178f964e0 Mon Sep 17 00:00:00 2001
From: Christian Rocha
Date: Mon, 8 Jun 2026 09:26:40 -0400
Subject: [PATCH] fix(server): clear leftover sockets so server can always start

When the server starts and finds a socket file with no process behind
it, it now removes the dead socket before binding instead of failing
with an address-already-in-use error. A live server is never disturbed,
and the cleanup is recorded in the log.

Co-Authored-By: Charm Crush
---
 internal/server/net_other.go   | 57 ++++++++++++++++++++++++++++++++--
 internal/server/net_windows.go | 19 ++++++++++--
 internal/server/server.go      |  5 ++-
 3 files changed, 74 insertions(+), 7 deletions(-)

diff --git a/internal/server/net_other.go b/internal/server/net_other.go
index ba84404fdbb257cc77354c842bebbab7ce1d156c..29b7dec2c3a47cb07767e1ddd0e73bf9c9bcbf07 100644
--- a/internal/server/net_other.go
+++ b/internal/server/net_other.go
@@ -2,9 +2,60 @@
 
 package server
 
-import "net"
+import (
+	"errors"
+	"io/fs"
+	"net"
+	"os"
+	"time"
+)
 
-func listen(network, address string) (net.Listener, error) {
+// staleSocketDialTimeout bounds the probe used to detect whether a Unix
+// socket file on disk is backed by a live listener.
+const staleSocketDialTimeout = 200 * time.Millisecond
+
+// listen binds a net.Listener on the given network and address.
+//
+// For unix sockets it self-heals from stale socket files: if the path
+// already exists on disk, it first probes with a short net.DialTimeout.
+// A successful dial means a live server owns the socket, so we proceed
+// to net.Listen (which surfaces the usual "address already in use"
+// error). A failed dial that isStaleSocketErr classifies as stale
+// triggers an os.Remove of the path (ignoring fs.ErrNotExist) before
+// the bind.
+//
+// The returned removedStale bool reports whether a stale socket file
+// was removed prior to binding so callers can log it. The operation
+// is idempotent: removing an absent file is a no-op, and a live
+// socket is never removed.
+func listen(network, address string) (net.Listener, bool, error) {
+	var removedStale bool
+	if network == "unix" && address != "" {
+		if _, err := os.Stat(address); err == nil {
+			conn, dialErr := net.DialTimeout(network, address, staleSocketDialTimeout)
+			if dialErr == nil {
+				// A live server owns the socket. Fall through to
+				// net.Listen so the caller sees the standard
+				// "address already in use" error.
+				conn.Close()
+			} else if isStaleSocketErr(dialErr) {
+				rmErr := os.Remove(address)
+				switch {
+				case rmErr == nil:
+					removedStale = true
+				case errors.Is(rmErr, fs.ErrNotExist):
+					// Another process removed it between our
+					// stat and remove; treat as a no-op.
+				default:
+					return nil, false, rmErr
+				}
+			}
+		}
+	}
 	//nolint:noctx
-	return net.Listen(network, address)
+	ln, err := net.Listen(network, address)
+	if err != nil {
+		return nil, removedStale, err
+	}
+	return ln, removedStale, nil
 }
diff --git a/internal/server/net_windows.go b/internal/server/net_windows.go
index 3692021aa7e39de8df711395c50b194d8e358047..c5ae0ec106e7c1aa37e908be9fa24908bc25852c 100644
--- a/internal/server/net_windows.go
+++ b/internal/server/net_windows.go
@@ -9,7 +9,12 @@ import (
 	"github.com/Microsoft/go-winio"
 )
 
-func listen(network, address string) (net.Listener, error) {
+// listen binds a net.Listener on the given network and address.
+//
+// On Windows there is no Unix-socket stale-file recovery to perform,
+// so removedStale is always false. The signature matches the
+// non-Windows implementation so callers can use a single code path.
+func listen(network, address string) (net.Listener, bool, error) {
 	switch network {
 	case "npipe":
 		cfg := &winio.PipeConfig{
@@ -17,8 +22,16 @@ func listen(network, address string) (net.Listener, error) {
 			InputBufferSize:  65536,
 			OutputBufferSize: 65536,
 		}
-		return winio.ListenPipe(address, cfg)
+		ln, err := winio.ListenPipe(address, cfg)
+		if err != nil {
+			return nil, false, err
+		}
+		return ln, false, nil
 	default:
-		return net.Listen(network, address) //nolint:noctx
+		ln, err := net.Listen(network, address) //nolint:noctx
+		if err != nil {
+			return nil, false, err
+		}
+		return ln, false, nil
 	}
 }
diff --git a/internal/server/server.go b/internal/server/server.go
index 5a154059719b8381a338817873425e5d91b94d9c..8a5b3c98f362d6dc7568b352f354d1d77cff5076 100644
--- a/internal/server/server.go
+++ b/internal/server/server.go
@@ -234,10 +234,13 @@ func (s *Server) ListenAndServe() error {
 	if s.ln != nil {
 		return fmt.Errorf("server already started")
 	}
-	ln, err := listen(s.network, s.Addr)
+	ln, removedStale, err := listen(s.network, s.Addr)
 	if err != nil {
 		return fmt.Errorf("failed to listen on %s: %w", s.Addr, err)
 	}
+	if removedStale && s.logger != nil {
+		s.logger.Warn("Removed stale socket before binding", "address", s.Addr)
+	}
 	return s.Serve(ln)
 }
 