fix(db): only enforce the data directory lock in client server mode

Christian Rocha and Charm Crush created

The exclusive lock on a data directory was previously taken on every Crush
startup, which broke the long standing local mode workflow of running two
Crush instances in the same directory. The lock is now opt in via a new
option, and only the shared server takes it. Local mode keeps its previous
behavior, and the existing environment variable escape hatch continues to
work.

Co-Authored-By: Charm Crush <crush@charm.land>

Change summary

internal/backend/backend.go |  2 
internal/db/connect.go      | 54 ++++++++++++++++++++++++++++++++------
internal/db/connect_test.go | 53 +++++++++++++++++++++++++++++++++-----
3 files changed, 92 insertions(+), 17 deletions(-)

Detailed changes

internal/backend/backend.go 🔗

@@ -223,7 +223,7 @@ func (b *Backend) CreateWorkspace(args proto.Workspace) (*Workspace, proto.Works
 		return nil, proto.Workspace{}, fmt.Errorf("failed to create data directory: %w", err)
 	}
 
-	conn, err := db.Connect(b.ctx, cfg.Config().Options.DataDirectory)
+	conn, err := db.Connect(b.ctx, cfg.Config().Options.DataDirectory, db.WithDataDirLock(true))
 	if err != nil {
 		return nil, proto.Workspace{}, fmt.Errorf("failed to connect to database: %w", err)
 	}

internal/db/connect.go 🔗

@@ -56,16 +56,39 @@ var (
 	poolMu sync.Mutex
 )
 
+// ConnectOption configures a Connect call. Options are applied in
+// order; later options override earlier ones for the same field.
+type ConnectOption func(*connectOptions)
+
+// connectOptions holds the resolved configuration for a Connect call.
+type connectOptions struct {
+	lockDataDir bool
+}
+
+// WithDataDirLock toggles acquisition of the per-data-directory lock
+// for this Connect call. The lock is off by default so local-mode
+// invocations do not regress today's behavior; the server's
+// workspace-bootstrap path opts in. CRUSH_SKIP_DATADIR_LOCK still
+// bypasses acquisition even when this option is set.
+func WithDataDirLock(enable bool) ConnectOption {
+	return func(o *connectOptions) { o.lockDataDir = enable }
+}
+
 // Connect opens a SQLite database connection for the given data
 // directory and runs migrations. If a connection to the same database
 // file already exists, the existing connection is returned with its
 // reference count incremented. Callers must pair each Connect with a
 // [Release] when they no longer need the connection.
-func Connect(ctx context.Context, dataDir string) (*sql.DB, error) {
+func Connect(ctx context.Context, dataDir string, opts ...ConnectOption) (*sql.DB, error) {
 	if dataDir == "" {
 		return nil, fmt.Errorf("data.dir is not set")
 	}
 
+	var cfg connectOptions
+	for _, opt := range opts {
+		opt(&cfg)
+	}
+
 	dbPath := filepath.Join(dataDir, "crush.db")
 
 	// Resolve to an absolute path so that different relative paths to
@@ -88,18 +111,25 @@ func Connect(ctx context.Context, dataDir string) (*sql.DB, error) {
 	// crush process on the same SQLite file. The lock is released when
 	// the matching Release call drops the refcount to zero. Ensuring
 	// the data directory exists is required because the lock file
-	// lives inside it.
+	// lives inside it. Locking is opt-in via WithDataDirLock so that
+	// local-mode invocations do not refuse a second crush against the
+	// same data dir until client/server becomes the default.
 	if err := os.MkdirAll(dataDir, 0o700); err != nil {
 		return nil, fmt.Errorf("failed to create data directory %q: %w", dataDir, err)
 	}
-	lock, err := acquireDataDirLock(dataDir)
-	if err != nil {
-		return nil, err
+	var lock *dataDirLock
+	if cfg.lockDataDir && !skipDataDirLock() {
+		lock, err = acquireDataDirLock(dataDir)
+		if err != nil {
+			return nil, err
+		}
 	}
 
 	conn, err := openDB(dbPath)
 	if err != nil {
-		lock.release()
+		if lock != nil {
+			lock.release()
+		}
 		return nil, err
 	}
 
@@ -110,22 +140,28 @@ func Connect(ctx context.Context, dataDir string) (*sql.DB, error) {
 	// resulting in SQLITE_NOTADB (26) on the next open.
 	conn.SetMaxOpenConns(1)
 
+	releaseLock := func() {
+		if lock != nil {
+			lock.release()
+		}
+	}
+
 	if err = conn.PingContext(ctx); err != nil {
 		conn.Close()
-		lock.release()
+		releaseLock()
 		return nil, fmt.Errorf("failed to connect to database: %w", err)
 	}
 
 	if err := initGoose(); err != nil {
 		conn.Close()
-		lock.release()
+		releaseLock()
 		slog.Error("Failed to initialize goose", "error", err)
 		return nil, fmt.Errorf("failed to initialize goose: %w", err)
 	}
 
 	if err := goose.Up(conn, "migrations"); err != nil {
 		conn.Close()
-		lock.release()
+		releaseLock()
 		slog.Error("Failed to apply migrations", "error", err)
 		return nil, fmt.Errorf("failed to apply migrations: %w", err)
 	}

internal/db/connect_test.go 🔗

@@ -69,7 +69,7 @@ func TestConnect_FailsWhenDataDirLocked(t *testing.T) {
 	require.NoError(t, err, "expected to take the data-dir lock for the first time")
 	t.Cleanup(release)
 
-	_, err = Connect(context.Background(), dataDir)
+	_, err = Connect(context.Background(), dataDir, WithDataDirLock(true))
 	require.Error(t, err, "Connect must refuse to open a locked data dir")
 	require.ErrorIs(t, err, ErrDataDirLocked)
 }
@@ -85,12 +85,12 @@ func TestConnect_SucceedsAfterContenderReleases(t *testing.T) {
 	release, err := tryFileLock(lockPath)
 	require.NoError(t, err)
 
-	_, err = Connect(context.Background(), dataDir)
+	_, err = Connect(context.Background(), dataDir, WithDataDirLock(true))
 	require.ErrorIs(t, err, ErrDataDirLocked)
 
 	release()
 
-	conn, err := Connect(context.Background(), dataDir)
+	conn, err := Connect(context.Background(), dataDir, WithDataDirLock(true))
 	require.NoError(t, err, "Connect should succeed once the contender releases the lock")
 	require.NoError(t, conn.PingContext(context.Background()))
 	require.NoError(t, Release(dataDir))
@@ -105,7 +105,7 @@ func TestConnect_LockReleasedOnFinalRelease(t *testing.T) {
 	dataDir := t.TempDir()
 	lockPath := filepath.Join(dataDir, dataDirLockFile)
 
-	conn, err := Connect(context.Background(), dataDir)
+	conn, err := Connect(context.Background(), dataDir, WithDataDirLock(true))
 	require.NoError(t, err)
 	require.NoError(t, conn.PingContext(context.Background()))
 
@@ -135,10 +135,10 @@ func TestConnect_SharedPoolDoesNotReacquireLock(t *testing.T) {
 	dataDir := t.TempDir()
 	lockPath := filepath.Join(dataDir, dataDirLockFile)
 
-	_, err := Connect(context.Background(), dataDir)
+	_, err := Connect(context.Background(), dataDir, WithDataDirLock(true))
 	require.NoError(t, err)
 
-	_, err = Connect(context.Background(), dataDir)
+	_, err = Connect(context.Background(), dataDir, WithDataDirLock(true))
 	require.NoError(t, err)
 
 	// Drop one reference; lock must still be held.
@@ -163,8 +163,47 @@ func TestConnect_SkipLockEnvBypassesAcquisition(t *testing.T) {
 
 	t.Setenv("CRUSH_SKIP_DATADIR_LOCK", "1")
 
-	conn, err := Connect(context.Background(), dataDir)
+	conn, err := Connect(context.Background(), dataDir, WithDataDirLock(true))
 	require.NoError(t, err, "skip-lock env should bypass contention")
 	require.NoError(t, conn.PingContext(context.Background()))
 	require.NoError(t, Release(dataDir))
 }
+
+// TestConnect_DefaultIgnoresContendedLock confirms that without
+// WithDataDirLock(true) the lock file is irrelevant: a contender can
+// hold tryFileLock and Connect still succeeds. This pins the
+// local-mode default to its pre-lock behavior.
+func TestConnect_DefaultIgnoresContendedLock(t *testing.T) {
+	t.Cleanup(ResetPool)
+
+	dataDir := t.TempDir()
+	lockPath := filepath.Join(dataDir, dataDirLockFile)
+
+	release, err := tryFileLock(lockPath)
+	require.NoError(t, err, "expected to take the data-dir lock for the first time")
+	t.Cleanup(release)
+
+	conn, err := Connect(context.Background(), dataDir)
+	require.NoError(t, err, "default Connect must not take the lock and must succeed under contention")
+	require.NoError(t, conn.PingContext(context.Background()))
+	require.NoError(t, Release(dataDir))
+}
+
+// TestConnect_ServerPathFailsWhenDataDirLocked is the server's
+// workspace-bootstrap analogue of TestConnect_FailsWhenDataDirLocked:
+// passing WithDataDirLock(true) must surface ErrDataDirLocked when a
+// contender already holds the lock.
+func TestConnect_ServerPathFailsWhenDataDirLocked(t *testing.T) {
+	t.Cleanup(ResetPool)
+
+	dataDir := t.TempDir()
+	lockPath := filepath.Join(dataDir, dataDirLockFile)
+
+	release, err := tryFileLock(lockPath)
+	require.NoError(t, err, "expected to take the data-dir lock for the first time")
+	t.Cleanup(release)
+
+	_, err = Connect(context.Background(), dataDir, WithDataDirLock(true))
+	require.Error(t, err, "server-path Connect must refuse to open a locked data dir")
+	require.ErrorIs(t, err, ErrDataDirLocked)
+}