feat(tools/crush_info): staleness detection and auto-reload

Christian Rocha created

Ensures crush_info reflects fresh config after edits and detects
external/manual config file modifications.

Change summary

internal/config/load.go       |  30 +++++---
internal/config/load_test.go  |  49 ++++++++++++
internal/config/store.go      |  73 ++++++++++++++++++-
internal/config/store_test.go | 135 +++++++++++++++++++++++++++++++++++++
4 files changed, 269 insertions(+), 18 deletions(-)

Detailed changes

internal/config/load.go 🔗

@@ -91,6 +91,12 @@ func Load(workingDir, dataDir string, debug bool) (*ConfigStore, error) {
 	// Configure providers
 	valueResolver := NewShellVariableResolver(env)
 	store.resolver = valueResolver
+
+	// Disable auto-reload during initial load to prevent nested calls from
+	// config-modifying operations inside configureProviders.
+	store.autoReloadDisabled = true
+	defer func() { store.autoReloadDisabled = false }()
+
 	if err := cfg.configureProviders(store, env, valueResolver, store.knownProviders); err != nil {
 		return nil, fmt.Errorf("failed to configure providers: %w", err)
 	}
@@ -100,7 +106,7 @@ func Load(workingDir, dataDir string, debug bool) (*ConfigStore, error) {
 		return store, nil
 	}
 
-	if err := configureSelectedModels(store, store.knownProviders); err != nil {
+	if err := configureSelectedModels(store, store.knownProviders, true); err != nil {
 		return nil, fmt.Errorf("failed to configure selected models: %w", err)
 	}
 	store.SetupAgents()
@@ -236,7 +242,9 @@ func (c *Config) configureProviders(store *ConfigStore, env env.Env, resolver Va
 		switch {
 		case p.ID == catwalk.InferenceProviderAnthropic && config.OAuthToken != nil:
 			// Claude Code subscription is not supported anymore. Remove to show onboarding.
-			store.RemoveConfigField(ScopeGlobal, "providers.anthropic")
+			if !store.reloadInProgress {
+				store.RemoveConfigField(ScopeGlobal, "providers.anthropic")
+			}
 			c.Providers.Del(string(p.ID))
 			continue
 		case p.ID == catwalk.InferenceProviderCopilot && config.OAuthToken != nil:
@@ -553,7 +561,7 @@ func (c *Config) defaultModelSelection(knownProviders []catwalk.Provider) (large
 	return largeModel, smallModel, err
 }
 
-func configureSelectedModels(store *ConfigStore, knownProviders []catwalk.Provider) error {
+func configureSelectedModels(store *ConfigStore, knownProviders []catwalk.Provider, persist bool) error {
 	c := store.config
 	defaultLarge, defaultSmall, err := c.defaultModelSelection(knownProviders)
 	if err != nil {
@@ -572,10 +580,10 @@ func configureSelectedModels(store *ConfigStore, knownProviders []catwalk.Provid
 		model := c.GetModel(large.Provider, large.Model)
 		if model == nil {
 			large = defaultLarge
-			// override the model type to large
-			err := store.UpdatePreferredModel(ScopeGlobal, SelectedModelTypeLarge, large)
-			if err != nil {
-				return fmt.Errorf("failed to update preferred large model: %w", err)
+			if persist {
+				if err := store.UpdatePreferredModel(ScopeGlobal, SelectedModelTypeLarge, large); err != nil {
+					return fmt.Errorf("failed to update preferred large model: %w", err)
+				}
 			}
 		} else {
 			if largeModelSelected.MaxTokens > 0 {
@@ -616,10 +624,10 @@ func configureSelectedModels(store *ConfigStore, knownProviders []catwalk.Provid
 		model := c.GetModel(small.Provider, small.Model)
 		if model == nil {
 			small = defaultSmall
-			// override the model type to small
-			err := store.UpdatePreferredModel(ScopeGlobal, SelectedModelTypeSmall, small)
-			if err != nil {
-				return fmt.Errorf("failed to update preferred small model: %w", err)
+			if persist {
+				if err := store.UpdatePreferredModel(ScopeGlobal, SelectedModelTypeSmall, small); err != nil {
+					return fmt.Errorf("failed to update preferred small model: %w", err)
+				}
 			}
 		} else {
 			if smallModelSelected.MaxTokens > 0 {

internal/config/load_test.go 🔗

@@ -1310,6 +1310,49 @@ func TestConfig_setDefaultsDisableDefaultProvidersEnvVar(t *testing.T) {
 }
 
 func TestConfig_configureSelectedModels(t *testing.T) {
+	t.Run("reload mode should not persist fallback defaults", func(t *testing.T) {
+		dir := t.TempDir()
+		globalPath := filepath.Join(dir, "crush.json")
+		require.NoError(t, os.WriteFile(globalPath, []byte(`{"models":{"large":{"provider":"ghost","model":"missing"}}}`), 0o600))
+
+		knownProviders := []catwalk.Provider{
+			{
+				ID:                  "openai",
+				APIKey:              "abc",
+				DefaultLargeModelID: "large-model",
+				DefaultSmallModelID: "small-model",
+				Models: []catwalk.Model{
+					{ID: "large-model", DefaultMaxTokens: 1000},
+					{ID: "small-model", DefaultMaxTokens: 500},
+				},
+			},
+		}
+
+		cfg := &Config{
+			Models: map[SelectedModelType]SelectedModel{
+				SelectedModelTypeLarge: {Provider: "ghost", Model: "missing"},
+			},
+		}
+		cfg.setDefaults(dir, "")
+		store := &ConfigStore{config: cfg, globalDataPath: globalPath}
+		env := env.NewFromMap(map[string]string{})
+		resolver := NewEnvironmentVariableResolver(env)
+		err := cfg.configureProviders(store, env, resolver, knownProviders)
+		require.NoError(t, err)
+
+		err = configureSelectedModels(store, knownProviders, false)
+		require.NoError(t, err)
+
+		// In-memory falls back to default.
+		require.Equal(t, "openai", cfg.Models[SelectedModelTypeLarge].Provider)
+		require.Equal(t, "large-model", cfg.Models[SelectedModelTypeLarge].Model)
+
+		// Disk remains unchanged in reload mode.
+		data, readErr := os.ReadFile(globalPath)
+		require.NoError(t, readErr)
+		require.Contains(t, string(data), `"provider":"ghost"`)
+		require.Contains(t, string(data), `"model":"missing"`)
+	})
 	t.Run("should override defaults", func(t *testing.T) {
 		knownProviders := []catwalk.Provider{
 			{
@@ -1347,7 +1390,7 @@ func TestConfig_configureSelectedModels(t *testing.T) {
 		err := cfg.configureProviders(testStore(cfg), env, resolver, knownProviders)
 		require.NoError(t, err)
 
-		err = configureSelectedModels(testStore(cfg), knownProviders)
+		err = configureSelectedModels(testStore(cfg), knownProviders, true)
 		require.NoError(t, err)
 		large := cfg.Models[SelectedModelTypeLarge]
 		small := cfg.Models[SelectedModelTypeSmall]
@@ -1409,7 +1452,7 @@ func TestConfig_configureSelectedModels(t *testing.T) {
 		err := cfg.configureProviders(testStore(cfg), env, resolver, knownProviders)
 		require.NoError(t, err)
 
-		err = configureSelectedModels(testStore(cfg), knownProviders)
+		err = configureSelectedModels(testStore(cfg), knownProviders, true)
 		require.NoError(t, err)
 		large := cfg.Models[SelectedModelTypeLarge]
 		small := cfg.Models[SelectedModelTypeSmall]
@@ -1454,7 +1497,7 @@ func TestConfig_configureSelectedModels(t *testing.T) {
 		err := cfg.configureProviders(testStore(cfg), env, resolver, knownProviders)
 		require.NoError(t, err)
 
-		err = configureSelectedModels(testStore(cfg), knownProviders)
+		err = configureSelectedModels(testStore(cfg), knownProviders, true)
 		require.NoError(t, err)
 		large := cfg.Models[SelectedModelTypeLarge]
 		require.Equal(t, "large-model", large.Model)

internal/config/store.go 🔗

@@ -48,6 +48,8 @@ type ConfigStore struct {
 	overrides          RuntimeOverrides
 	trackedConfigPaths []string                // unique, normalized config file paths
 	snapshots          map[string]fileSnapshot // path -> snapshot at last capture
+	autoReloadDisabled bool                    // set during load/reload to prevent re-entrancy
+	reloadInProgress   bool                    // set during reload to avoid disk writes mid-reload
 }
 
 // Config returns the pure-data config struct (read-only after load).
@@ -121,6 +123,8 @@ func (s *ConfigStore) HasConfigField(scope Scope, key string) bool {
 }
 
 // SetConfigField sets a key/value pair in the config file for the given scope.
+// After a successful write, it automatically reloads config to keep in-memory
+// state fresh.
 func (s *ConfigStore) SetConfigField(scope Scope, key string, value any) error {
 	path, err := s.configPath(scope)
 	if err != nil {
@@ -145,10 +149,21 @@ func (s *ConfigStore) SetConfigField(scope Scope, key string, value any) error {
 	if err := os.WriteFile(path, []byte(newValue), 0o600); err != nil {
 		return fmt.Errorf("failed to write config file: %w", err)
 	}
+
+	// Auto-reload to keep in-memory state fresh after config edits.
+	// We use context.Background() since this is an internal operation that
+	// shouldn't be cancelled by user context.
+	if err := s.autoReload(context.Background()); err != nil {
+		// Log warning but don't fail the write - disk is already updated.
+		slog.Warn("Config file updated but failed to reload in-memory state", "error", err)
+	}
+
 	return nil
 }
 
 // RemoveConfigField removes a key from the config file for the given scope.
+// After a successful write, it automatically reloads config to keep in-memory
+// state fresh.
 func (s *ConfigStore) RemoveConfigField(scope Scope, key string) error {
 	path, err := s.configPath(scope)
 	if err != nil {
@@ -169,6 +184,12 @@ func (s *ConfigStore) RemoveConfigField(scope Scope, key string) error {
 	if err := os.WriteFile(path, []byte(newValue), 0o600); err != nil {
 		return fmt.Errorf("failed to write config file: %w", err)
 	}
+
+	// Auto-reload to keep in-memory state fresh after config edits.
+	if err := s.autoReload(context.Background()); err != nil {
+		slog.Warn("Config file updated but failed to reload in-memory state", "error", err)
+	}
+
 	return nil
 }
 
@@ -529,12 +550,21 @@ func (s *ConfigStore) captureStalenessSnapshot(paths []string) {
 }
 
 // ReloadFromDisk re-runs the config load/merge flow and updates the in-memory
-// config safely. It rebuilds the staleness snapshot after successful reload.
+// config atomically. It rebuilds the staleness snapshot after successful reload.
+// On failure, the store state is rolled back to its previous state.
 func (s *ConfigStore) ReloadFromDisk(ctx context.Context) error {
 	if s.workingDir == "" {
 		return fmt.Errorf("cannot reload: working directory not set")
 	}
 
+	// Disable auto-reload during reload to prevent nested/re-entrant calls.
+	s.autoReloadDisabled = true
+	s.reloadInProgress = true
+	defer func() {
+		s.autoReloadDisabled = false
+		s.reloadInProgress = false
+	}()
+
 	configPaths := lookupConfigs(s.workingDir)
 	cfg, loadedPaths, err := loadFromConfigPaths(configPaths)
 	if err != nil {
@@ -575,6 +605,14 @@ func (s *ConfigStore) ReloadFromDisk(ctx context.Context) error {
 		return fmt.Errorf("failed to configure providers during reload: %w", err)
 	}
 
+	// Save current state for potential rollback
+	oldConfig := s.config
+	oldLoadedPaths := s.loadedPaths
+	oldResolver := s.resolver
+	oldKnownProviders := s.knownProviders
+	oldOverrides := s.overrides
+	oldWorkspacePath := s.workspacePath
+
 	// Update store state BEFORE running model/agent setup (so they see new config)
 	s.config = cfg
 	s.loadedPaths = loadedPaths
@@ -584,13 +622,26 @@ func (s *ConfigStore) ReloadFromDisk(ctx context.Context) error {
 	s.workspacePath = workspacePath
 
 	// Mirror startup flow: setup models and agents against NEW config
+	var setupErr error
 	if !cfg.IsConfigured() {
 		slog.Warn("No providers configured after reload")
 	} else {
-		if err := configureSelectedModels(s, providers); err != nil {
-			return fmt.Errorf("failed to configure selected models during reload: %w", err)
+		if err := configureSelectedModels(s, providers, false); err != nil {
+			setupErr = fmt.Errorf("failed to configure selected models during reload: %w", err)
+		} else {
+			s.SetupAgents()
 		}
-		s.SetupAgents()
+	}
+
+	// Rollback on setup failure
+	if setupErr != nil {
+		s.config = oldConfig
+		s.loadedPaths = oldLoadedPaths
+		s.resolver = oldResolver
+		s.knownProviders = oldKnownProviders
+		s.overrides = oldOverrides
+		s.workspacePath = oldWorkspacePath
+		return setupErr
 	}
 
 	// Rebuild staleness tracking
@@ -598,3 +649,17 @@ func (s *ConfigStore) ReloadFromDisk(ctx context.Context) error {
 
 	return nil
 }
+
+// autoReload conditionally reloads config from disk after writes.
+// It returns nil (no error) for expected skip cases: when auto-reload is
+// disabled during load/reload flows, or when working directory is not set
+// (e.g., during testing). Only actual reload failures return an error.
+func (s *ConfigStore) autoReload(ctx context.Context) error {
+	if s.autoReloadDisabled {
+		return nil // Expected skip: already in load/reload flow
+	}
+	if s.workingDir == "" {
+		return nil // Expected skip: working directory not set
+	}
+	return s.ReloadFromDisk(ctx)
+}

internal/config/store_test.go 🔗

@@ -375,3 +375,138 @@ func TestReloadFromDisk_UsesNewConfigValues(t *testing.T) {
 	require.Equal(t, "anthropic", store.config.Models[SelectedModelTypeLarge].Provider)
 	require.Equal(t, "claude-3", store.config.Models[SelectedModelTypeLarge].Model)
 }
+
+// TestSetConfigField_AutoReloads verifies that SetConfigField automatically
+// reloads config into memory after writing, so subsequent reads see the new value.
+func TestSetConfigField_AutoReloads(t *testing.T) {
+	t.Parallel()
+
+	dir := t.TempDir()
+	configPath := filepath.Join(dir, "crush.json")
+
+	// Create initial config file with debug = false
+	initialConfig := `{"options": {"debug": false}}`
+	require.NoError(t, os.WriteFile(configPath, []byte(initialConfig), 0o600))
+
+	// Load initial config
+	store, err := Load(dir, dir, false)
+	require.NoError(t, err)
+
+	// Verify initial state
+	require.False(t, store.config.Options.Debug)
+
+	// Set globalDataPath and capture snapshot for staleness tracking
+	store.globalDataPath = configPath
+	store.CaptureStalenessSnapshot([]string{configPath})
+
+	// Use SetConfigField to change debug to true
+	err = store.SetConfigField(ScopeGlobal, "options.debug", true)
+	require.NoError(t, err)
+
+	// Verify in-memory state was automatically reloaded and reflects the change
+	require.True(t, store.config.Options.Debug, "Expected config to auto-reload and show debug = true")
+
+	// Verify staleness is clean after the reload
+	staleness := store.ConfigStaleness()
+	require.False(t, staleness.Dirty, "Expected staleness to be clean after auto-reload")
+}
+
+// TestRemoveConfigField_AutoReloads verifies that RemoveConfigField automatically
+// reloads config into memory after writing.
+func TestRemoveConfigField_AutoReloads(t *testing.T) {
+	t.Parallel()
+
+	dir := t.TempDir()
+	configPath := filepath.Join(dir, "crush.json")
+
+	// Create initial config file with a custom option
+	initialConfig := `{"options": {"debug": true, "custom_field": "value"}}`
+	require.NoError(t, os.WriteFile(configPath, []byte(initialConfig), 0o600))
+
+	// Load initial config
+	store, err := Load(dir, dir, false)
+	require.NoError(t, err)
+
+	// Set globalDataPath and capture snapshot
+	store.globalDataPath = configPath
+	store.CaptureStalenessSnapshot([]string{configPath})
+
+	// Verify the field exists initially (indirectly - store loaded successfully)
+	require.True(t, store.config.Options.Debug)
+
+	// Remove the debug field
+	err = store.RemoveConfigField(ScopeGlobal, "options.debug")
+	require.NoError(t, err)
+
+	// Verify auto-reload occurred and stale state is clean
+	staleness := store.ConfigStaleness()
+	require.False(t, staleness.Dirty, "Expected staleness to be clean after auto-reload from RemoveConfigField")
+}
+
+// TestSetConfigField_AutoReloadSkipsWhenNoWorkingDir verifies that auto-reload
+// gracefully skips when working directory is not set (e.g., during testing).
+func TestSetConfigField_AutoReloadSkipsWhenNoWorkingDir(t *testing.T) {
+	t.Parallel()
+
+	dir := t.TempDir()
+	configPath := filepath.Join(dir, "crush.json")
+
+	// Create a store without working directory (like some test setups)
+	store := &ConfigStore{
+		config:         &Config{},
+		globalDataPath: configPath,
+		// workingDir is empty
+	}
+
+	// SetConfigField should succeed even without workingDir (auto-reload skips)
+	err := store.SetConfigField(ScopeGlobal, "foo", "bar")
+	require.NoError(t, err)
+
+	// Verify file was still written
+	data, err := os.ReadFile(configPath)
+	require.NoError(t, err)
+	require.Contains(t, string(data), "foo")
+}
+
+// TestAutoReloadDisabledDuringReload verifies that auto-reload is suppressed
+// during ReloadFromDisk to prevent re-entrant/nested reload calls.
+func TestAutoReloadDisabledDuringReload(t *testing.T) {
+	t.Parallel()
+
+	dir := t.TempDir()
+	configPath := filepath.Join(dir, "crush.json")
+
+	// Create initial config with a provider that will trigger config modification during reload
+	// (simulating the anthropic OAuth token removal case)
+	initialConfig := `{
+		"providers": {
+			"anthropic": {
+				"api_key": "test-key",
+				"oauth": {"access_token": "token", "refresh_token": "refresh"}
+			}
+		}
+	}`
+	require.NoError(t, os.WriteFile(configPath, []byte(initialConfig), 0o600))
+
+	// Load will trigger configureProviders which removes anthropic OAuth config
+	// This should NOT cause infinite recursion thanks to autoReloadDisabled guard
+	store, err := Load(dir, dir, false)
+	require.NoError(t, err)
+
+	// Verify the store loaded successfully and autoReloadDisabled was unset
+	require.False(t, store.autoReloadDisabled)
+
+	// Capture snapshot and verify reload also works without recursion
+	store.globalDataPath = configPath
+	store.CaptureStalenessSnapshot([]string{configPath})
+
+	// Modify file and reload - this should work without re-entrancy issues
+	time.Sleep(10 * time.Millisecond)
+	require.NoError(t, os.WriteFile(configPath, []byte(`{"options": {"debug": true}}`), 0o600))
+
+	err = store.ReloadFromDisk(context.Background())
+	require.NoError(t, err)
+
+	// Verify reload completed successfully
+	require.False(t, store.autoReloadDisabled, "autoReloadDisabled should be false after ReloadFromDisk")
+}