From 401084133df3e7b1895de169c4b67aea0f72b572 Mon Sep 17 00:00:00 2001 From: Greg Slepak Date: Mon, 4 May 2026 07:22:09 -0700 Subject: [PATCH] fix(db): prevent SQLITE_NOTADB corruption under concurrent sub-agents (#2690) Crush could fail to start with "file is not a database (26)" after a session ran many sub-agents in parallel. The shared *sql.DB was opened with Go's default unlimited connection pool, so each concurrent sub-agent (session/message/cost writes via coordinator.runSubAgent) could open its own SQLite handle. Under load, WAL frames from different handles interleaved and auto-checkpoints could race; if the process was cancelled or killed mid-checkpoint, the main DB header and WAL desynced, producing SQLITE_NOTADB (26) on the next open. Two changes address this: - Set db.SetMaxOpenConns(1). SQLite serializes writes at the file level anyway, so extra pool connections add no throughput, only race surface. Forcing a single pooled connection makes database/sql queue all callers through one SQLite handle, eliminating cross-handle WAL/checkpoint races. Sub-agents still function; their writes just serialize (which SQLite was going to do regardless). - Add _txlock=immediate to both driver DSNs (modernc.org/sqlite and ncruces/go-sqlite3). BEGIN IMMEDIATE acquires the reserved-writer lock up front so concurrent writers queue cleanly on busy_timeout instead of racing the deferred-to-writer upgrade (the pattern behind prior SQLITE_BUSY reports, e.g. #2129, #2576). Defense in depth alongside the pool limit. For the ncruces driver the DSN must use the file: URI prefix for query params to be parsed; the plain path form silently ignores _txlock. --- internal/db/connect.go | 7 +++++++ internal/db/connect_modernc.go | 3 +++ internal/db/connect_ncruces.go | 6 +++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/internal/db/connect.go b/internal/db/connect.go index e31d220c7454a3ccc4241b336bedbc7bf30d5fba..ef800c716efc44b137c163a188f599366f1c66e3 100644 --- a/internal/db/connect.go +++ b/internal/db/connect.go @@ -50,6 +50,13 @@ func Connect(ctx context.Context, dataDir string) (*sql.DB, error) { return nil, err } + // Serialize all access through a single connection. SQLite serializes + // writes at the file level anyway, and allowing multiple pool + // connections to interleave writes/checkpoints (especially under + // concurrent sub-agents) has caused WAL/header desync resulting in + // SQLITE_NOTADB (26) on the next open. + db.SetMaxOpenConns(1) + if err = db.PingContext(ctx); err != nil { db.Close() return nil, fmt.Errorf("failed to connect to database: %w", err) diff --git a/internal/db/connect_modernc.go b/internal/db/connect_modernc.go index 39c7faa42516297d4df497821baa0be56835be15..2e9676d6420c6fbda69cb924595a6e8a55740a5c 100644 --- a/internal/db/connect_modernc.go +++ b/internal/db/connect_modernc.go @@ -17,6 +17,9 @@ func openDB(dbPath string) (*sql.DB, error) { for name, value := range pragmas { params.Add("_pragma", fmt.Sprintf("%s(%s)", name, value)) } + // Use BEGIN IMMEDIATE so writers acquire the reserved lock up front, + // preventing deferred-to-writer upgrade deadlocks. + params.Set("_txlock", "immediate") dsn := fmt.Sprintf("file:%s?%s", dbPath, params.Encode()) db, err := sql.Open("sqlite", dsn) diff --git a/internal/db/connect_ncruces.go b/internal/db/connect_ncruces.go index 12946e29439862653e0253adc9eb6e2869bea559..d90833350030f08be364f940d483e0afb0b63eb5 100644 --- a/internal/db/connect_ncruces.go +++ b/internal/db/connect_ncruces.go @@ -11,7 +11,11 @@ import ( ) func openDB(dbPath string) (*sql.DB, error) { - db, err := driver.Open(dbPath, func(c *sqlite3.Conn) error { + // Use BEGIN IMMEDIATE so writers acquire the reserved lock up front, + // preventing deferred-to-writer upgrade deadlocks. The "file:" prefix + // is required for the ncruces driver to parse query parameters. + dsn := fmt.Sprintf("file:%s?_txlock=immediate", dbPath) + db, err := driver.Open(dsn, func(c *sqlite3.Conn) error { // Set pragmas for better performance via _pragma query params. // Format: PRAGMA name = value; for name, value := range pragmas {