test(server): regression for the client/server spawn race

Christian Rocha and Charm Crush created

Co-Authored-By: Charm Crush <crush@charm.land>

Change summary

internal/cmd/clientserverrace/race_test.go | 319 ++++++++++++++++++++++++
1 file changed, 319 insertions(+)

Detailed changes

internal/cmd/clientserverrace/race_test.go 🔗

@@ -0,0 +1,319 @@
+// Package clientserverrace_test is a regression test for the
+// CRUSH_CLIENT_SERVER=1 socket-init race documented in
+// docs/notes/2026-05-11-client-server-socket-init-race.md (item F5).
+//
+// It lives in its own directory so it can build even if other test
+// files in internal/cmd are temporarily broken — this test only needs
+// the binary, not the cmd package.
+package clientserverrace_test
+
+import (
+	"context"
+	"fmt"
+	"net"
+	"net/http"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+)
+
+// readinessErrSubstr is the user-visible error string emitted by
+// ensureServer when it gives up waiting for the server socket /
+// readiness probe (internal/cmd/root.go). Seeing this in any client's
+// output means the race fired.
+const readinessErrSubstr = "failed to initialize crush server"
+
+// numClients is intentionally larger than the typical CPU count to
+// ensure the spawn lock + readiness probe are exercised under
+// contention.
+const numClients = 8
+
+// clientTimeout bounds each child invocation. It only needs to be long
+// enough for the spawn-and-readiness phase to complete on a cold cache;
+// after that, the client may legitimately keep running (e.g.
+// subscribing to server events) and we'll cancel it. The race we care
+// about is observable strictly within ensureServer.
+const clientTimeout = 15 * time.Second
+
+func TestClientServerSpawnRace(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping client/server spawn race test in -short mode")
+	}
+	// The race and its fix are unix-socket specific. Windows uses
+	// named pipes via a different code path; not covered here.
+	if runtime.GOOS == "windows" {
+		t.Skip("skipping unix-socket specific race test on windows")
+	}
+	if _, err := exec.LookPath("go"); err != nil {
+		t.Skip("skipping: 'go' not available on PATH")
+	}
+
+	repoRoot := repoRootFromTest(t)
+	bin := buildCrushBinary(t, repoRoot)
+
+	// Use /tmp directly so the unix socket path stays under the
+	// 104-char sockaddr_un limit on darwin. t.TempDir() can return a
+	// path inside /var/folders/... that is too long.
+	runDir, err := os.MkdirTemp("/tmp", "crush-race-")
+	if err != nil {
+		t.Fatalf("mkdtemp: %v", err)
+	}
+	t.Cleanup(func() { _ = os.RemoveAll(runDir) })
+
+	socketPath := filepath.Join(runDir, "crush.sock")
+	host := "unix://" + socketPath
+
+	// Fresh, isolated XDG/HOME so we don't touch the user's real
+	// state or any other test's cache. These all live under runDir
+	// so cleanup is one RemoveAll.
+	cacheHome := filepath.Join(runDir, "cache")
+	dataHome := filepath.Join(runDir, "data")
+	configHome := filepath.Join(runDir, "config")
+	homeDir := filepath.Join(runDir, "home")
+	for _, d := range []string{cacheHome, dataHome, configHome, homeDir} {
+		if err := os.MkdirAll(d, 0o700); err != nil {
+			t.Fatalf("mkdir %s: %v", d, err)
+		}
+	}
+
+	env := append(os.Environ(),
+		"CRUSH_CLIENT_SERVER=1",
+		"XDG_CACHE_HOME="+cacheHome,
+		"XDG_DATA_HOME="+dataHome,
+		"XDG_CONFIG_HOME="+configHome,
+		"HOME="+homeDir,
+		// Belt-and-suspenders: if anything tries to talk to a real
+		// provider, fail loudly rather than make a network call.
+		"CRUSH_DISABLE_PROVIDER_AUTO_UPDATE=1",
+	)
+
+	// Make sure no server is up before we start.
+	if _, err := os.Stat(socketPath); err == nil {
+		t.Fatalf("socket %s exists before test started", socketPath)
+	}
+
+	// Always try to shut down any server we spawned, regardless of
+	// outcome.
+	t.Cleanup(func() { shutdownServer(t, socketPath) })
+
+	type result struct {
+		idx    int
+		stdout string
+		stderr string
+	}
+	results := make(chan result, numClients)
+
+	var wg sync.WaitGroup
+	start := make(chan struct{})
+	for i := range numClients {
+		wg.Add(1)
+		go func(i int) {
+			defer wg.Done()
+
+			// Each client gets its own working directory so the
+			// per-client workspace registration paths don't collide
+			// in confusing ways.
+			cwd := filepath.Join(runDir, fmt.Sprintf("ws-%d", i))
+			if err := os.MkdirAll(cwd, 0o700); err != nil {
+				results <- result{idx: i, stderr: fmt.Sprintf("mkdir cwd: %v", err)}
+				return
+			}
+
+			ctx, cancel := context.WithTimeout(context.Background(), clientTimeout)
+			defer cancel()
+
+			// `crush run` exercises connectToServer (which is where
+			// the readiness race lives). On a fresh sandbox the
+			// command may legitimately keep running past the race
+			// (e.g. waiting on event subscriptions); the context
+			// timeout above bounds that. We assert race outcomes
+			// purely from output, not exit code.
+			c := exec.CommandContext(ctx, bin,
+				"--host", host,
+				"--cwd", cwd,
+				"run", "hi",
+			)
+			c.Env = env
+			var outBuf, errBuf strings.Builder
+			c.Stdout = &outBuf
+			c.Stderr = &errBuf
+
+			<-start
+			_ = c.Run()
+			results <- result{
+				idx:    i,
+				stdout: outBuf.String(),
+				stderr: errBuf.String(),
+			}
+		}(i)
+	}
+
+	close(start) // release all clients as simultaneously as possible
+	wg.Wait()
+	close(results)
+
+	var raceFailures []string
+	for r := range results {
+		if strings.Contains(r.stderr, readinessErrSubstr) ||
+			strings.Contains(r.stdout, readinessErrSubstr) {
+			raceFailures = append(raceFailures, fmt.Sprintf(
+				"client %d: readiness error in output\nstderr:\n%s\nstdout:\n%s",
+				r.idx, r.stderr, r.stdout,
+			))
+		}
+	}
+
+	if len(raceFailures) > 0 {
+		t.Fatalf("client/server spawn race regressed: %d/%d clients failed\n\n%s",
+			len(raceFailures), numClients,
+			strings.Join(raceFailures, "\n---\n"),
+		)
+	}
+
+	// Positive sanity check: a single live server should be
+	// reachable on the socket. If the race had fired, we'd either
+	// have no socket or a stale one with no listener.
+	if _, err := os.Stat(socketPath); err != nil {
+		t.Fatalf("expected socket %s to exist after parallel clients ran, got: %v",
+			socketPath, err)
+	}
+	if err := pingHealth(socketPath); err != nil {
+		t.Fatalf("server is not responding on %s: %v", socketPath, err)
+	}
+}
+
+// pingHealth issues a single GET /v1/health over the unix socket and
+// requires a 2xx response.
+func pingHealth(socketPath string) error {
+	tr := &http.Transport{
+		DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
+			var d net.Dialer
+			return d.DialContext(ctx, "unix", socketPath)
+		},
+	}
+	defer tr.CloseIdleConnections()
+	hc := &http.Client{Transport: tr, Timeout: 2 * time.Second}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet,
+		"http://crush.local/v1/health", nil)
+	if err != nil {
+		return err
+	}
+	rsp, err := hc.Do(req)
+	if err != nil {
+		return err
+	}
+	defer rsp.Body.Close()
+	if rsp.StatusCode < 200 || rsp.StatusCode >= 300 {
+		return fmt.Errorf("health check returned %s", rsp.Status)
+	}
+	return nil
+}
+
+// repoRootFromTest walks up from this test file's directory to find
+// the repo root (the directory containing go.mod). Walking up by a
+// fixed count is fragile across reorganisations.
+func repoRootFromTest(t *testing.T) string {
+	t.Helper()
+	cwd, err := os.Getwd()
+	if err != nil {
+		t.Fatalf("getwd: %v", err)
+	}
+	dir := cwd
+	for {
+		if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil {
+			return dir
+		}
+		parent := filepath.Dir(dir)
+		if parent == dir {
+			t.Fatalf("could not find go.mod walking up from %s", cwd)
+		}
+		dir = parent
+	}
+}
+
+// buildCrushBinary builds the crush binary once at the start of the
+// test and returns the absolute path. Subsequent t.Cleanup removes
+// the built artefact.
+func buildCrushBinary(t *testing.T, repoRoot string) string {
+	t.Helper()
+
+	binDir, err := os.MkdirTemp("", "crush-race-bin-")
+	if err != nil {
+		t.Fatalf("mkdtemp bin: %v", err)
+	}
+	t.Cleanup(func() { _ = os.RemoveAll(binDir) })
+
+	binPath := filepath.Join(binDir, "crush")
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, "go", "build", "-o", binPath, ".")
+	cmd.Dir = repoRoot
+	// Match the project's standard build flags. CGO_ENABLED=0 keeps
+	// the binary statically linked and avoids surprising the test on
+	// hosts without a C toolchain.
+	cmd.Env = append(os.Environ(), "CGO_ENABLED=0")
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		t.Fatalf("go build crush: %v\n%s", err, out)
+	}
+	return binPath
+}
+
+// shutdownServer best-effort terminates any crush server bound to
+// socketPath by POSTing to /v1/control. We don't import the project's
+// own client package to keep this test free of internal API churn.
+func shutdownServer(t *testing.T, socketPath string) {
+	t.Helper()
+	if _, err := os.Stat(socketPath); err != nil {
+		return
+	}
+
+	tr := &http.Transport{
+		DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
+			var d net.Dialer
+			return d.DialContext(ctx, "unix", socketPath)
+		},
+	}
+	hc := &http.Client{Transport: tr, Timeout: 5 * time.Second}
+	defer tr.CloseIdleConnections()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	body := strings.NewReader(`{"command":"shutdown"}`)
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost,
+		"http://crush.local/v1/control", body)
+	if err != nil {
+		t.Logf("shutdown: build request: %v", err)
+		return
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := hc.Do(req)
+	if err != nil {
+		// Server may already be gone — not an error.
+		t.Logf("shutdown: %v (probably already exited)", err)
+		return
+	}
+	_ = resp.Body.Close()
+
+	// Wait briefly for the socket to disappear so the next test
+	// using the same path doesn't race.
+	deadline := time.Now().Add(5 * time.Second)
+	for time.Now().Before(deadline) {
+		if _, err := os.Stat(socketPath); err != nil {
+			return
+		}
+		time.Sleep(50 * time.Millisecond)
+	}
+}