From c986a35c465fdc22fbaede2609522e3e1997e632 Mon Sep 17 00:00:00 2001 From: Christian Rocha Date: Wed, 20 May 2026 07:15:21 -0400 Subject: [PATCH] fix(db): only enforce the data directory lock in client server mode The exclusive lock on a data directory was previously taken on every Crush startup, which broke the long standing local mode workflow of running two Crush instances in the same directory. The lock is now opt in via a new option, and only the shared server takes it. Local mode keeps its previous behavior, and the existing environment variable escape hatch continues to work. Co-Authored-By: Charm Crush --- internal/backend/backend.go | 2 +- internal/db/connect.go | 54 ++++++++++++++++++++++++++++++------- internal/db/connect_test.go | 53 +++++++++++++++++++++++++++++++----- 3 files changed, 92 insertions(+), 17 deletions(-) diff --git a/internal/backend/backend.go b/internal/backend/backend.go index 8b78f85fb1c06114ec4bafc55371c82f623e2bf7..55e3a2785a77de4ab849cc367eb189abae81dbdb 100644 --- a/internal/backend/backend.go +++ b/internal/backend/backend.go @@ -223,7 +223,7 @@ func (b *Backend) CreateWorkspace(args proto.Workspace) (*Workspace, proto.Works return nil, proto.Workspace{}, fmt.Errorf("failed to create data directory: %w", err) } - conn, err := db.Connect(b.ctx, cfg.Config().Options.DataDirectory) + conn, err := db.Connect(b.ctx, cfg.Config().Options.DataDirectory, db.WithDataDirLock(true)) if err != nil { return nil, proto.Workspace{}, fmt.Errorf("failed to connect to database: %w", err) } diff --git a/internal/db/connect.go b/internal/db/connect.go index 1ed0f69a45a9526f9dce25257b531a97ad73d8c6..706fdab1216beac1c92db01c3702d2366725a3ee 100644 --- a/internal/db/connect.go +++ b/internal/db/connect.go @@ -56,16 +56,39 @@ var ( poolMu sync.Mutex ) +// ConnectOption configures a Connect call. Options are applied in +// order; later options override earlier ones for the same field. +type ConnectOption func(*connectOptions) + +// connectOptions holds the resolved configuration for a Connect call. +type connectOptions struct { + lockDataDir bool +} + +// WithDataDirLock toggles acquisition of the per-data-directory lock +// for this Connect call. The lock is off by default so local-mode +// invocations do not regress today's behavior; the server's +// workspace-bootstrap path opts in. CRUSH_SKIP_DATADIR_LOCK still +// bypasses acquisition even when this option is set. +func WithDataDirLock(enable bool) ConnectOption { + return func(o *connectOptions) { o.lockDataDir = enable } +} + // Connect opens a SQLite database connection for the given data // directory and runs migrations. If a connection to the same database // file already exists, the existing connection is returned with its // reference count incremented. Callers must pair each Connect with a // [Release] when they no longer need the connection. -func Connect(ctx context.Context, dataDir string) (*sql.DB, error) { +func Connect(ctx context.Context, dataDir string, opts ...ConnectOption) (*sql.DB, error) { if dataDir == "" { return nil, fmt.Errorf("data.dir is not set") } + var cfg connectOptions + for _, opt := range opts { + opt(&cfg) + } + dbPath := filepath.Join(dataDir, "crush.db") // Resolve to an absolute path so that different relative paths to @@ -88,18 +111,25 @@ func Connect(ctx context.Context, dataDir string) (*sql.DB, error) { // crush process on the same SQLite file. The lock is released when // the matching Release call drops the refcount to zero. Ensuring // the data directory exists is required because the lock file - // lives inside it. + // lives inside it. Locking is opt-in via WithDataDirLock so that + // local-mode invocations do not refuse a second crush against the + // same data dir until client/server becomes the default. if err := os.MkdirAll(dataDir, 0o700); err != nil { return nil, fmt.Errorf("failed to create data directory %q: %w", dataDir, err) } - lock, err := acquireDataDirLock(dataDir) - if err != nil { - return nil, err + var lock *dataDirLock + if cfg.lockDataDir && !skipDataDirLock() { + lock, err = acquireDataDirLock(dataDir) + if err != nil { + return nil, err + } } conn, err := openDB(dbPath) if err != nil { - lock.release() + if lock != nil { + lock.release() + } return nil, err } @@ -110,22 +140,28 @@ func Connect(ctx context.Context, dataDir string) (*sql.DB, error) { // resulting in SQLITE_NOTADB (26) on the next open. conn.SetMaxOpenConns(1) + releaseLock := func() { + if lock != nil { + lock.release() + } + } + if err = conn.PingContext(ctx); err != nil { conn.Close() - lock.release() + releaseLock() return nil, fmt.Errorf("failed to connect to database: %w", err) } if err := initGoose(); err != nil { conn.Close() - lock.release() + releaseLock() slog.Error("Failed to initialize goose", "error", err) return nil, fmt.Errorf("failed to initialize goose: %w", err) } if err := goose.Up(conn, "migrations"); err != nil { conn.Close() - lock.release() + releaseLock() slog.Error("Failed to apply migrations", "error", err) return nil, fmt.Errorf("failed to apply migrations: %w", err) } diff --git a/internal/db/connect_test.go b/internal/db/connect_test.go index d3b7fc86351d2cf43300d683fc9df0ad77b639b6..45e39758924a9351b07bdb5956ddbf1ae85d1b02 100644 --- a/internal/db/connect_test.go +++ b/internal/db/connect_test.go @@ -69,7 +69,7 @@ func TestConnect_FailsWhenDataDirLocked(t *testing.T) { require.NoError(t, err, "expected to take the data-dir lock for the first time") t.Cleanup(release) - _, err = Connect(context.Background(), dataDir) + _, err = Connect(context.Background(), dataDir, WithDataDirLock(true)) require.Error(t, err, "Connect must refuse to open a locked data dir") require.ErrorIs(t, err, ErrDataDirLocked) } @@ -85,12 +85,12 @@ func TestConnect_SucceedsAfterContenderReleases(t *testing.T) { release, err := tryFileLock(lockPath) require.NoError(t, err) - _, err = Connect(context.Background(), dataDir) + _, err = Connect(context.Background(), dataDir, WithDataDirLock(true)) require.ErrorIs(t, err, ErrDataDirLocked) release() - conn, err := Connect(context.Background(), dataDir) + conn, err := Connect(context.Background(), dataDir, WithDataDirLock(true)) require.NoError(t, err, "Connect should succeed once the contender releases the lock") require.NoError(t, conn.PingContext(context.Background())) require.NoError(t, Release(dataDir)) @@ -105,7 +105,7 @@ func TestConnect_LockReleasedOnFinalRelease(t *testing.T) { dataDir := t.TempDir() lockPath := filepath.Join(dataDir, dataDirLockFile) - conn, err := Connect(context.Background(), dataDir) + conn, err := Connect(context.Background(), dataDir, WithDataDirLock(true)) require.NoError(t, err) require.NoError(t, conn.PingContext(context.Background())) @@ -135,10 +135,10 @@ func TestConnect_SharedPoolDoesNotReacquireLock(t *testing.T) { dataDir := t.TempDir() lockPath := filepath.Join(dataDir, dataDirLockFile) - _, err := Connect(context.Background(), dataDir) + _, err := Connect(context.Background(), dataDir, WithDataDirLock(true)) require.NoError(t, err) - _, err = Connect(context.Background(), dataDir) + _, err = Connect(context.Background(), dataDir, WithDataDirLock(true)) require.NoError(t, err) // Drop one reference; lock must still be held. @@ -163,8 +163,47 @@ func TestConnect_SkipLockEnvBypassesAcquisition(t *testing.T) { t.Setenv("CRUSH_SKIP_DATADIR_LOCK", "1") - conn, err := Connect(context.Background(), dataDir) + conn, err := Connect(context.Background(), dataDir, WithDataDirLock(true)) require.NoError(t, err, "skip-lock env should bypass contention") require.NoError(t, conn.PingContext(context.Background())) require.NoError(t, Release(dataDir)) } + +// TestConnect_DefaultIgnoresContendedLock confirms that without +// WithDataDirLock(true) the lock file is irrelevant: a contender can +// hold tryFileLock and Connect still succeeds. This pins the +// local-mode default to its pre-lock behavior. +func TestConnect_DefaultIgnoresContendedLock(t *testing.T) { + t.Cleanup(ResetPool) + + dataDir := t.TempDir() + lockPath := filepath.Join(dataDir, dataDirLockFile) + + release, err := tryFileLock(lockPath) + require.NoError(t, err, "expected to take the data-dir lock for the first time") + t.Cleanup(release) + + conn, err := Connect(context.Background(), dataDir) + require.NoError(t, err, "default Connect must not take the lock and must succeed under contention") + require.NoError(t, conn.PingContext(context.Background())) + require.NoError(t, Release(dataDir)) +} + +// TestConnect_ServerPathFailsWhenDataDirLocked is the server's +// workspace-bootstrap analogue of TestConnect_FailsWhenDataDirLocked: +// passing WithDataDirLock(true) must surface ErrDataDirLocked when a +// contender already holds the lock. +func TestConnect_ServerPathFailsWhenDataDirLocked(t *testing.T) { + t.Cleanup(ResetPool) + + dataDir := t.TempDir() + lockPath := filepath.Join(dataDir, dataDirLockFile) + + release, err := tryFileLock(lockPath) + require.NoError(t, err, "expected to take the data-dir lock for the first time") + t.Cleanup(release) + + _, err = Connect(context.Background(), dataDir, WithDataDirLock(true)) + require.Error(t, err, "server-path Connect must refuse to open a locked data dir") + require.ErrorIs(t, err, ErrDataDirLocked) +}