From 72c0c0f3223fc881565d5a92054862205e06f990 Mon Sep 17 00:00:00 2001 From: eliran-mic Date: Thu, 30 Apr 2026 14:15:23 +0300 Subject: [PATCH 1/4] fix(contexts): refresh registry against disk on each list call (SKY-834) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug 52: junk / stale clusters lingering in the cluster dropdown. Root cause: in multi-file (isolated-load) mode, contextRegistry + perFileConfigs were built ONCE in setupIsolatedLoad and never refreshed. Single-file mode reloads on every GetAvailableContexts call (it walks RawConfig() through DeferredLoadingClientConfig), but the multi-file path served whatever was in memory. So the dropdown kept showing entries for: - kubeconfig files removed from a watched directory, - kubeconfig files rewritten by 'kubectl config delete-context', - kubeconfig files written then deleted by CAPI when a managed cluster was destroyed. All three look identical to the user: a row that's there, but errors out when they try to switch to it. Fix: introduce refreshContextRegistry — surgical, mtime-driven reconciliation of registry + perFileConfigs against what's on disk now. - File missing on disk → drop every registry entry pointing at it AND its cached config + mtime. (CAPI-destroyed case.) - File mtime moved → re-parse THAT file only. Drop entries for contexts removed from the file; add entries for newly-added contexts. (kubectl config delete-context / set-context case.) - File mtime unchanged → no-op (avoids re-parsing N files on every dropdown open). - File present but parse-fails → keep prior entries. The user might be mid-edit; silently pruning the dropdown while they save would be more confusing than a momentarily stale entry. Wire it into GetAvailableContexts (multi-file branch only) under clientMu write lock. Same per-file isolation guarantees as buildContextRegistry — colliding user/cluster names across files remain structurally impossible. Out of scope (deliberately deferred): - Re-scanning kubeconfig DIRECTORIES for newly-added files. Today, a freshly-created kubeconfig won't appear without a Radar restart. Same workaround as before (restart). Worth a follow-up but adds a syscall budget question (every dropdown open shouldn't ReadDir all watched dirs). - fsnotify-based eager refresh. Same data, more moving parts; on-demand mtime check on dropdown open is enough to clear the junk-cluster complaint. Tests pin: removed-file drop, in-place context delete, in-place context add, no-op stability, and the parse-failure defensive case (keep stale, don't prune). Made-with: Cursor --- internal/k8s/client.go | 22 ++- internal/k8s/context_registry.go | 120 +++++++++++++++ internal/k8s/context_registry_test.go | 213 ++++++++++++++++++++++++++ 3 files changed, 352 insertions(+), 3 deletions(-) diff --git a/internal/k8s/client.go b/internal/k8s/client.go index 59ef5aafb..4fe6ebad7 100644 --- a/internal/k8s/client.go +++ b/internal/k8s/client.go @@ -10,6 +10,7 @@ import ( "sort" "strings" "sync" + "time" "k8s.io/client-go/discovery" "k8s.io/client-go/dynamic" @@ -43,7 +44,14 @@ var ( contextRegistry map[string]contextEntry // perFileConfigs caches each file's parsed api.Config so GetAvailableContexts // doesn't re-read N files on every call. Keyed by absolute file path. - perFileConfigs map[string]*clientcmdapi.Config + perFileConfigs map[string]*clientcmdapi.Config + // perFileMtimes lets refreshContextRegistry detect rewritten or + // removed kubeconfig files between calls. 
Without this the + // registry is built once at startup and never refreshes, so + // destroyed clusters / removed contexts linger in the dropdown + // (they only error out when the user tries to switch to them). + // Same lifecycle / lock as perFileConfigs. + perFileMtimes map[string]time.Time contextName string clusterName string contextNamespace string // Default namespace from kubeconfig context @@ -692,11 +700,19 @@ func GetAvailableContexts() ([]ContextInfo, error) { }, nil } - clientMu.RLock() + // Reconcile registry against disk before reading. This is the + // only refresh point in multi-file (isolated-load) mode — without + // it, kubeconfigs that were rewritten or deleted on disk after + // startup keep showing up in the dropdown until the user + // restarts Radar (the "junk clusters" complaint). + clientMu.Lock() + if contextRegistry != nil { + refreshContextRegistry(contextRegistry, perFileConfigs, perFileMtimes) + } registry := contextRegistry fileConfigs := perFileConfigs currentCtx := contextName - clientMu.RUnlock() + clientMu.Unlock() if registry != nil { // Isolated-load mode: enumerate every registered context, pulling diff --git a/internal/k8s/context_registry.go b/internal/k8s/context_registry.go index bd7a9ac90..4e032b144 100644 --- a/internal/k8s/context_registry.go +++ b/internal/k8s/context_registry.go @@ -3,9 +3,11 @@ package k8s import ( "fmt" "log" + "os" "path/filepath" "sort" "strings" + "time" "k8s.io/client-go/tools/clientcmd" clientcmdapi "k8s.io/client-go/tools/clientcmd/api" @@ -38,6 +40,12 @@ func setupIsolatedLoad(paths []string) ( } contextRegistry = registry perFileConfigs = fileConfigs + perFileMtimes = make(map[string]time.Time, len(paths)) + for _, p := range paths { + if info, err := os.Stat(p); err == nil { + perFileMtimes[p] = info.ModTime() + } + } contextName = qName return &clientcmd.ClientConfigLoadingRules{ExplicitPath: entry.SourceFile}, &clientcmd.ConfigOverrides{CurrentContext: entry.InFileName}, @@ -166,6 +174,118 @@ func pickInitialContext( return "", contextEntry{}, false } +// refreshContextRegistry reconciles the in-memory contextRegistry + +// perFileConfigs against what's actually on disk RIGHT NOW. Returns +// whether anything changed (so the caller can decide whether to log +// or invalidate downstream caches). +// +// The original registry is built ONCE in setupIsolatedLoad and was +// never refreshed in multi-file mode. That left the cluster +// dropdown showing entries for: +// +// - kubeconfig files the user removed from a watched directory, +// - kubeconfig files rewritten by `kubectl config delete-context`, +// - kubeconfig files written then deleted by CAPI when a managed +// cluster was destroyed. +// +// All three look like "junk clusters" to the user (the entry is +// there but selecting it errors out). This helper is the surgical +// fix: same per-file isolation guarantees as buildContextRegistry, +// just incremental — it ONLY touches files whose mtime moved or +// whose path no longer exists on disk. +// +// Concurrency: caller MUST hold clientMu (write lock). The helper +// rebuilds shared map values, so it's not safe to interleave with +// GetAvailableContexts readers. +func refreshContextRegistry( + registry map[string]contextEntry, + fileConfigs map[string]*clientcmdapi.Config, + fileMtimes map[string]time.Time, +) bool { + if registry == nil { + return false + } + changed := false + // Group registry entries by source file so we can decide + // per-file: keep, re-parse, or drop everything pointing at it. 
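+	// Illustrative shape only (path and names hypothetical): a file
+	// /kubeconfigs/capi-foo.yaml that registered two contexts yields
+	//   byFile["/kubeconfigs/capi-foo.yaml"] = [<its two qualified names>]
+	// so the keep / re-parse / drop decision below runs once per file.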
+ byFile := make(map[string][]string) + for qName, entry := range registry { + byFile[entry.SourceFile] = append(byFile[entry.SourceFile], qName) + } + for path, qNames := range byFile { + info, statErr := os.Stat(path) + if statErr != nil { + // File is gone (or unreadable). Drop every registry + // entry pointing at it AND its cached config. This + // is the CAPI-cluster-destroyed and + // "user removed file from kubeconfig dir" cases. + for _, qName := range qNames { + delete(registry, qName) + } + delete(fileConfigs, path) + delete(fileMtimes, path) + changed = true + continue + } + mtime := info.ModTime() + if cached, ok := fileMtimes[path]; ok && cached.Equal(mtime) { + // Unchanged — keep the cached parse + entries. + continue + } + // File is new to us OR has been rewritten on disk. Re-parse + // and rebuild ONLY this file's entries. + cfg, err := clientcmd.LoadFromFile(path) + if err != nil { + // Couldn't parse the rewritten file. Don't drop the + // existing entries — the user may be mid-edit, and + // silently pruning the dropdown while they save would + // be more confusing than a stale entry. Log and skip. + log.Printf("[k8s-init] refresh: skipping kubeconfig %q (parse failed): %v", + filepath.Base(path), err) + errorlog.Record("k8s-init", "warning", + "refresh: kubeconfig %q failed to load: %s", + filepath.Base(path), scrubPathError(err)) + continue + } + fileConfigs[path] = cfg + fileMtimes[path] = mtime + // Replace this file's entries in the registry. Names that + // are no longer in the file get dropped; new ones are added. + liveNames := make(map[string]struct{}, len(cfg.Contexts)) + for name := range cfg.Contexts { + liveNames[name] = struct{}{} + } + for _, qName := range qNames { + if _, alive := liveNames[registry[qName].InFileName]; !alive { + delete(registry, qName) + changed = true + } + } + // Add any contexts that are new in this file. We deliberately + // re-use qualifyContextName to keep the cross-file collision + // behaviour consistent with the initial build. + for name := range cfg.Contexts { + already := false + for _, qName := range qNames { + if e, ok := registry[qName]; ok && e.SourceFile == path && e.InFileName == name { + already = true + break + } + } + if already { + continue + } + qName := qualifyContextName(registry, name, path) + registry[qName] = contextEntry{ + SourceFile: path, + InFileName: name, + } + changed = true + } + } + return changed +} + // aggregateExecPluginCommands walks every context across every per-file // config and returns the unique sorted set of exec-plugin basenames plus // the list of AuthInfos that reference exec blocks with empty Commands. diff --git a/internal/k8s/context_registry_test.go b/internal/k8s/context_registry_test.go index 25b7193f9..37b0a771b 100644 --- a/internal/k8s/context_registry_test.go +++ b/internal/k8s/context_registry_test.go @@ -5,6 +5,7 @@ import ( "path/filepath" "sort" "testing" + "time" "k8s.io/client-go/tools/clientcmd" clientcmdapi "k8s.io/client-go/tools/clientcmd/api" @@ -389,6 +390,218 @@ func injectEmptyExec(t *testing.T, path, userName string) { } } +// SKY-834 bug 52: kubeconfig files rewritten or deleted on disk +// after Radar startup kept showing their old contexts in the +// cluster dropdown — the in-memory registry was built once in +// setupIsolatedLoad and never refreshed in multi-file mode. The +// user saw "junk clusters" that errored out on switch. 
+// +// refreshContextRegistry is the surgical fix: same per-file +// isolation as buildContextRegistry, but driven by mtime so it +// only re-parses files that actually changed. + +func loadFixture(t *testing.T, paths []string) ( + map[string]contextEntry, + map[string]*clientcmdapi.Config, + map[string]time.Time, +) { + t.Helper() + registry, fileConfigs := buildContextRegistry(paths) + mtimes := make(map[string]time.Time, len(paths)) + for _, p := range paths { + info, err := os.Stat(p) + if err != nil { + t.Fatalf("stat fixture %s: %v", p, err) + } + mtimes[p] = info.ModTime() + } + return registry, fileConfigs, mtimes +} + +func TestRefreshContextRegistry_DropsRemovedFile(t *testing.T) { + // CAPI scenario: a file that was watched at startup is removed + // from disk (the cluster was destroyed and the controller + // cleaned up). All registry entries pointing at that file MUST + // disappear from the dropdown on the next refresh, otherwise + // the user sees a junk row that errors on switch. + dir := t.TempDir() + f1 := writeKubeconfig(t, dir, "alive.yaml", "ctx-alive", []kubeEntry{ + {ctxName: "ctx-alive", userName: "u", clusterName: "c1"}, + }) + f2 := writeKubeconfig(t, dir, "doomed.yaml", "ctx-doomed", []kubeEntry{ + {ctxName: "ctx-doomed", userName: "u", clusterName: "c2"}, + }) + + registry, fileConfigs, mtimes := loadFixture(t, []string{f1, f2}) + if _, ok := registry["ctx-doomed"]; !ok { + t.Fatalf("setup: expected ctx-doomed in registry, got %v", keysOf(registry)) + } + + if err := os.Remove(f2); err != nil { + t.Fatalf("remove fixture: %v", err) + } + + changed := refreshContextRegistry(registry, fileConfigs, mtimes) + if !changed { + t.Errorf("expected refresh to report a change after deleting %s", filepath.Base(f2)) + } + if _, ok := registry["ctx-doomed"]; ok { + t.Errorf("ctx-doomed still in registry after file removed: %v", keysOf(registry)) + } + if _, ok := registry["ctx-alive"]; !ok { + t.Errorf("ctx-alive should still be in registry: %v", keysOf(registry)) + } + if _, ok := fileConfigs[f2]; ok { + t.Errorf("perFileConfigs still has entry for removed file %s", filepath.Base(f2)) + } + if _, ok := mtimes[f2]; ok { + t.Errorf("perFileMtimes still has entry for removed file %s", filepath.Base(f2)) + } +} + +func TestRefreshContextRegistry_DropsContextRemovedFromFile(t *testing.T) { + // `kubectl config delete-context` rewrites the kubeconfig in + // place: same file, different mtime, fewer contexts. The + // removed context MUST disappear from the dropdown. 
+ dir := t.TempDir() + f1 := writeKubeconfig(t, dir, "two.yaml", "ctx-keep", []kubeEntry{ + {ctxName: "ctx-keep", userName: "u", clusterName: "c1"}, + {ctxName: "ctx-delete", userName: "u", clusterName: "c2"}, + }) + + registry, fileConfigs, mtimes := loadFixture(t, []string{f1}) + if _, ok := registry["ctx-delete"]; !ok { + t.Fatalf("setup: expected ctx-delete in registry") + } + + rewriteKubeconfig(t, f1, []kubeEntry{ + {ctxName: "ctx-keep", userName: "u", clusterName: "c1"}, + }) + + changed := refreshContextRegistry(registry, fileConfigs, mtimes) + if !changed { + t.Errorf("expected refresh to report a change after rewriting %s", filepath.Base(f1)) + } + if _, ok := registry["ctx-delete"]; ok { + t.Errorf("ctx-delete still in registry after rewrite: %v", keysOf(registry)) + } + if _, ok := registry["ctx-keep"]; !ok { + t.Errorf("ctx-keep should still be in registry: %v", keysOf(registry)) + } +} + +func TestRefreshContextRegistry_PicksUpNewContextInSameFile(t *testing.T) { + // `kubectl config set-context foo` adds a new entry to an + // existing file. The new context should appear after refresh + // without needing a Radar restart. + dir := t.TempDir() + f1 := writeKubeconfig(t, dir, "one.yaml", "ctx-original", []kubeEntry{ + {ctxName: "ctx-original", userName: "u", clusterName: "c1"}, + }) + registry, fileConfigs, mtimes := loadFixture(t, []string{f1}) + + rewriteKubeconfig(t, f1, []kubeEntry{ + {ctxName: "ctx-original", userName: "u", clusterName: "c1"}, + {ctxName: "ctx-new", userName: "u", clusterName: "c2"}, + }) + + changed := refreshContextRegistry(registry, fileConfigs, mtimes) + if !changed { + t.Errorf("expected refresh to report a change after add") + } + if _, ok := registry["ctx-new"]; !ok { + t.Errorf("ctx-new not picked up after refresh: %v", keysOf(registry)) + } + if _, ok := registry["ctx-original"]; !ok { + t.Errorf("ctx-original disappeared from registry: %v", keysOf(registry)) + } +} + +func TestRefreshContextRegistry_NoOpWhenNothingChanged(t *testing.T) { + dir := t.TempDir() + f1 := writeKubeconfig(t, dir, "stable.yaml", "ctx-a", []kubeEntry{ + {ctxName: "ctx-a", userName: "u", clusterName: "c1"}, + }) + registry, fileConfigs, mtimes := loadFixture(t, []string{f1}) + before := keysOf(registry) + + changed := refreshContextRegistry(registry, fileConfigs, mtimes) + if changed { + t.Errorf("expected no change on stable disk state, got changed=true") + } + after := keysOf(registry) + sort.Strings(before) + sort.Strings(after) + if len(before) != len(after) { + t.Errorf("registry shape changed during no-op refresh: %v vs %v", before, after) + } +} + +func TestRefreshContextRegistry_BadParseDoesNotDropExisting(t *testing.T) { + // Defensive case: user is mid-edit and saved a syntactically + // broken kubeconfig (mtime moved, parse fails). We deliberately + // keep the previous registry entries — silently pruning the + // dropdown while the user saves would be more confusing than a + // momentarily stale entry. 
+ dir := t.TempDir() + f1 := writeKubeconfig(t, dir, "broken.yaml", "ctx-a", []kubeEntry{ + {ctxName: "ctx-a", userName: "u", clusterName: "c1"}, + }) + registry, fileConfigs, mtimes := loadFixture(t, []string{f1}) + + if err := os.WriteFile(f1, []byte("not: valid: yaml: at: all"), 0o600); err != nil { + t.Fatalf("rewrite: %v", err) + } + + refreshContextRegistry(registry, fileConfigs, mtimes) + if _, ok := registry["ctx-a"]; !ok { + t.Errorf("ctx-a was dropped on parse failure; expected to keep it: %v", keysOf(registry)) + } +} + +func keysOf(m map[string]contextEntry) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + return out +} + +func rewriteKubeconfig(t *testing.T, path string, entries []kubeEntry) { + t.Helper() + cfg := clientcmdapi.NewConfig() + for _, e := range entries { + cfg.Contexts[e.ctxName] = &clientcmdapi.Context{ + Cluster: e.clusterName, + AuthInfo: e.userName, + Namespace: e.namespace, + } + if _, ok := cfg.Clusters[e.clusterName]; !ok { + cfg.Clusters[e.clusterName] = &clientcmdapi.Cluster{ + Server: "https://" + e.clusterName, + InsecureSkipTLSVerify: true, + } + } + if _, ok := cfg.AuthInfos[e.userName]; !ok { + cfg.AuthInfos[e.userName] = &clientcmdapi.AuthInfo{Token: "fake-token-for-" + e.userName} + } + } + data, err := clientcmd.Write(*cfg) + if err != nil { + t.Fatalf("rewrite serialize: %v", err) + } + // Force a different mtime even if the test writes within the + // same filesystem-resolution tick (HFS+ is 1s). + time.Sleep(15 * time.Millisecond) + if err := os.WriteFile(path, data, 0o600); err != nil { + t.Fatalf("rewrite %s: %v", path, err) + } + now := time.Now() + if err := os.Chtimes(path, now, now); err != nil { + t.Fatalf("chtimes %s: %v", path, err) + } +} + func TestAggregateExecPluginCommands_UniqueAcrossFiles(t *testing.T) { dir := t.TempDir() // File 1: user 'oidc' with kubectl exec plugin. From d9fcfde313b98ec1454370f62d7a9f93cfc06406 Mon Sep 17 00:00:00 2001 From: eliran-mic Date: Thu, 30 Apr 2026 14:57:52 +0300 Subject: [PATCH 2/4] fix(contexts): hold lock through iteration; seed mtimes on CAPI promote MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bugbot HIGH x2 on PR #595. 1. Concurrent map read/write panic. v1 released clientMu before iterating the registry. The local variables 'registry' and 'fileConfigs' still pointed at the LIVE maps, so a second concurrent GetAvailableContexts call would acquire the lock and mutate them via refreshContextRegistry while the first caller iterated. Two HTTP /contexts requests in flight at once = 'fatal error: concurrent map read and map write'. Hold the lock through the iteration. The contexts slice is private to the caller, so we can release just before the return. 2. Nil perFileMtimes after CAPI promotion. MergeAndSwitchContext promotes single-file → isolated-load mode by writing contextRegistry and perFileConfigs but never touched perFileMtimes. After a CAPI cluster was registered, perFileMtimes stayed nil and the next refresh would write to it ('assignment to entry in nil map' panic). Seed perFileMtimes alongside the other globals on both promo paths. Add a defensive lazy-init in GetAvailableContexts so a future caller that forgets to seed it doesn't crash either. Test pins the contract that refresh populates an empty mtime map without panicking and writes the mtime back. 
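In sketch form (buildContextInfos is a hypothetical stand-in for the inline
loop in the diff below; the other identifiers match the code in this patch),
the multi-file branch now holds the write lock from the refresh through the
snapshot it returns:

    clientMu.Lock()
    if contextRegistry != nil {
        // A CAPI promotion can leave the mtime cache nil; seeding it here
        // is safe because the write lock is held.
        if perFileMtimes == nil {
            perFileMtimes = make(map[string]time.Time, len(perFileConfigs))
        }
        refreshContextRegistry(contextRegistry, perFileConfigs, perFileMtimes)
        // buildContextInfos is shorthand for the inline loop that turns
        // contextRegistry + perFileConfigs into the []ContextInfo slice.
        contexts := buildContextInfos(contextRegistry, perFileConfigs, contextName)
        clientMu.Unlock()
        return contexts, nil
    }
    clientMu.Unlock()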
Made-with: Cursor --- internal/k8s/client.go | 49 +++++++++++++++++++++++---- internal/k8s/context_registry_test.go | 35 +++++++++++++++++++ 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/internal/k8s/client.go b/internal/k8s/client.go index 4fe6ebad7..90d377357 100644 --- a/internal/k8s/client.go +++ b/internal/k8s/client.go @@ -705,22 +705,34 @@ func GetAvailableContexts() ([]ContextInfo, error) { // it, kubeconfigs that were rewritten or deleted on disk after // startup keep showing up in the dropdown until the user // restarts Radar (the "junk clusters" complaint). + // + // CRITICAL: refresh, snapshot, AND iterate must all happen under + // the same lock. The previous shape released the lock and then + // iterated, but `registry` and `fileConfigs` still pointed at the + // LIVE maps that a second concurrent caller could be mutating + // via refreshContextRegistry — Go's "fatal error: concurrent map + // read and map write". Hold the write lock through the snapshot, + // then iterate over a private []ContextInfo we've already built. clientMu.Lock() if contextRegistry != nil { + // Lazy init: MergeAndSwitchContext promotes single-file mode + // to isolated-load mode without touching perFileMtimes, so a + // CAPI promotion can leave it nil. Seeding it here is safe + // because we always hold the write lock. + if perFileMtimes == nil { + perFileMtimes = make(map[string]time.Time, len(perFileConfigs)) + } refreshContextRegistry(contextRegistry, perFileConfigs, perFileMtimes) } - registry := contextRegistry - fileConfigs := perFileConfigs currentCtx := contextName - clientMu.Unlock() - if registry != nil { + if contextRegistry != nil { // Isolated-load mode: enumerate every registered context, pulling // cluster/user/namespace from the file it originally lives in. // No merge happens — shared names across files stay distinct. - contexts := make([]ContextInfo, 0, len(registry)) - for qName, entry := range registry { - cfg, ok := fileConfigs[entry.SourceFile] + contexts := make([]ContextInfo, 0, len(contextRegistry)) + for qName, entry := range contextRegistry { + cfg, ok := perFileConfigs[entry.SourceFile] if !ok { continue } @@ -736,8 +748,10 @@ func GetAvailableContexts() ([]ContextInfo, error) { IsCurrent: qName == currentCtx, }) } + clientMu.Unlock() return contexts, nil } + clientMu.Unlock() // Single-file fallback: load the one file and enumerate its contexts. kubeconfig := kubeconfigPath @@ -964,6 +978,7 @@ func MergeAndSwitchContext(kubeconfigData []byte, contextName string) (string, s // remove the temp file and leave the globals untouched — no half-state. var newRegistry map[string]contextEntry var newFileConfigs map[string]*clientcmdapi.Config + var newFileMtimes map[string]time.Time var newPaths []string if contextRegistry == nil { @@ -980,6 +995,18 @@ func MergeAndSwitchContext(kubeconfigData []byte, contextName string) (string, s } newRegistry = registry newFileConfigs = fileConfigs + // Seed the mtime cache for the same set of files. Without + // this, the next refresh would write to a nil map + // (perFileMtimes is package-level and stays nil through the + // promotion). Refresh's nil-map guard would also catch this, + // but seeding here keeps the invariant "perFileMtimes is + // non-nil whenever contextRegistry is non-nil". 
+ newFileMtimes = make(map[string]time.Time, len(seedPaths)) + for _, p := range seedPaths { + if info, err := os.Stat(p); err == nil { + newFileMtimes[p] = info.ModTime() + } + } newPaths = seedPaths } else { cfg, err := clientcmd.LoadFromFile(tmpPath) @@ -998,6 +1025,13 @@ func MergeAndSwitchContext(kubeconfigData []byte, contextName string) (string, s newFileConfigs[k] = v } newFileConfigs[tmpPath] = cfg + newFileMtimes = make(map[string]time.Time, len(perFileMtimes)+1) + for k, v := range perFileMtimes { + newFileMtimes[k] = v + } + if info, err := os.Stat(tmpPath); err == nil { + newFileMtimes[tmpPath] = info.ModTime() + } newPaths = append(append([]string(nil), kubeconfigPaths...), tmpPath) for name := range cfg.Contexts { qName := qualifyContextName(newRegistry, name, tmpPath) @@ -1017,6 +1051,7 @@ func MergeAndSwitchContext(kubeconfigData []byte, contextName string) (string, s // Commit. All globals updated atomically under the single Lock held above. contextRegistry = newRegistry perFileConfigs = newFileConfigs + perFileMtimes = newFileMtimes kubeconfigPaths = newPaths capiKubeconfigs[contextName] = tmpPath diff --git a/internal/k8s/context_registry_test.go b/internal/k8s/context_registry_test.go index 37b0a771b..9dc773754 100644 --- a/internal/k8s/context_registry_test.go +++ b/internal/k8s/context_registry_test.go @@ -537,6 +537,41 @@ func TestRefreshContextRegistry_NoOpWhenNothingChanged(t *testing.T) { } } +func TestRefreshContextRegistry_NilMtimeMapNoOp(t *testing.T) { + // Defensive: if perFileMtimes is nil (e.g. a code path forgot + // to initialise it after MergeAndSwitchContext promoted + // single-file → isolated-load), refresh must not panic. The + // original v1 of this PR wrote `fileMtimes[path] = mtime` and + // would have hit "assignment to entry in nil map". + dir := t.TempDir() + f1 := writeKubeconfig(t, dir, "a.yaml", "ctx-a", []kubeEntry{ + {ctxName: "ctx-a", userName: "u", clusterName: "c1"}, + }) + registry, fileConfigs := buildContextRegistry([]string{f1}) + defer func() { + if r := recover(); r != nil { + t.Fatalf("refresh panicked on nil mtimes: %v", r) + } + }() + // Pass nil — refresh must initialise it OR no-op cleanly. The + // PRODUCTION code seeds it before calling refresh; this test + // is the belt-and-braces guard that the helper itself doesn't + // crash if a future caller forgets. + var nilMtimes map[string]time.Time + if nilMtimes != nil { + t.Fatal("setup: nilMtimes should be nil") + } + // Production wraps refresh in a nil-init, but if a future + // refactor stops doing that we want refresh itself to survive. + // We simulate that by passing an empty map (refresh always + // writes, never reads as a precondition). + emptyMtimes := make(map[string]time.Time) + refreshContextRegistry(registry, fileConfigs, emptyMtimes) + if _, ok := emptyMtimes[f1]; !ok { + t.Errorf("refresh should have populated emptyMtimes for %s", filepath.Base(f1)) + } +} + func TestRefreshContextRegistry_BadParseDoesNotDropExisting(t *testing.T) { // Defensive case: user is mid-edit and saved a syntactically // broken kubeconfig (mtime moved, parse fails). We deliberately From 5f2654db5a5e6dcbf58dc5c6e82fec90ed9e22ee Mon Sep 17 00:00:00 2001 From: eliran-mic Date: Thu, 30 Apr 2026 15:13:25 +0300 Subject: [PATCH 3/4] fix(contexts): seed byFile from mtimes; nil-safe refresh; CAPI fast-path mtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bugbot v3 follow-ups on PR #595. 1. Refresh loses track of files with all contexts removed. 
byFile was built solely from registry entries. If a previous refresh dropped every context for a file (e.g. user did 'kubectl config delete-context' for each one), the file's path stayed in fileMtimes/fileConfigs but vanished from byFile on subsequent calls. Any new contexts later added to that file were never discovered until restart. Seed byFile from BOTH registry AND fileMtimes keys so an emptied-but-still-tracked file gets re-stat'd on every refresh. 2. CAPI fast path omits perFileMtimes update. The MergeAndSwitchContext re-use branch overwrote the existing temp file (changing its mtime) and refreshed perFileConfigs, but didn't bump perFileMtimes. The next refresh would see a stale mtime and uselessly re-parse a file we'd just parsed in the same critical section. Bump perFileMtimes alongside perFileConfigs in the fast path. 3. Nil-mtime test was hand-waving. v3 declared nilMtimes, asserted it was nil, but then passed emptyMtimes (a non-nil map) to refresh. The function was NOT actually nil-safe — production seeded the map before calling. Make the function itself nil-safe (returns false / no-op) so callers that forget can't panic, and rewrite the test to pass an actual nil. Add a second regression that exercises the byFile-from-mtimes seed: emptied registry + rewritten file must surface the new contexts. Made-with: Cursor --- internal/k8s/client.go | 10 +++- internal/k8s/context_registry.go | 19 ++++++++ internal/k8s/context_registry_test.go | 70 +++++++++++++++++++-------- 3 files changed, 79 insertions(+), 20 deletions(-) diff --git a/internal/k8s/client.go b/internal/k8s/client.go index 90d377357..8114f2d0c 100644 --- a/internal/k8s/client.go +++ b/internal/k8s/client.go @@ -943,9 +943,17 @@ func MergeAndSwitchContext(kubeconfigData []byte, contextName string) (string, s if existingPath, ok := capiKubeconfigs[contextName]; ok { if err := clientcmd.WriteToFile(*newConfig, existingPath); err == nil { // Refresh the cached parsed config so subsequent GetAvailableContexts - // calls reflect any changes in the incoming YAML. + // calls reflect any changes in the incoming YAML. Also bump the + // cached mtime so the next refresh doesn't see a stale value + // (the WriteToFile above just changed the file's mtime) and + // uselessly re-parse a file we've already re-parsed here. if parsed, perr := clientcmd.LoadFromFile(existingPath); perr == nil { perFileConfigs[existingPath] = parsed + if perFileMtimes != nil { + if info, serr := os.Stat(existingPath); serr == nil { + perFileMtimes[existingPath] = info.ModTime() + } + } } qName := findQualifiedNameForPath(contextRegistry, existingPath, contextName) if qName == "" { diff --git a/internal/k8s/context_registry.go b/internal/k8s/context_registry.go index 4e032b144..05970d8c1 100644 --- a/internal/k8s/context_registry.go +++ b/internal/k8s/context_registry.go @@ -205,13 +205,32 @@ func refreshContextRegistry( if registry == nil { return false } + // Defensive nil-check on fileMtimes: callers (GetAvailableContexts) + // already lazy-init this, but the helper is exported and a future + // caller that forgets would otherwise panic on the first + // `fileMtimes[path] = mtime` write below. + if fileMtimes == nil { + return false + } changed := false // Group registry entries by source file so we can decide // per-file: keep, re-parse, or drop everything pointing at it. + // Seed byFile from BOTH the registry AND fileMtimes — if a + // previous refresh dropped every context for a file (e.g. 
user + // removed all kubectl-config-delete-context'd from a single + // file), the file's path stays in fileMtimes but no longer + // appears in registry. Without seeding from fileMtimes, that + // file would never be re-stat'd and any newly-added contexts + // to it would be invisible until the user restarted Radar. byFile := make(map[string][]string) for qName, entry := range registry { byFile[entry.SourceFile] = append(byFile[entry.SourceFile], qName) } + for path := range fileMtimes { + if _, ok := byFile[path]; !ok { + byFile[path] = nil + } + } for path, qNames := range byFile { info, statErr := os.Stat(path) if statErr != nil { diff --git a/internal/k8s/context_registry_test.go b/internal/k8s/context_registry_test.go index 9dc773754..5f325096d 100644 --- a/internal/k8s/context_registry_test.go +++ b/internal/k8s/context_registry_test.go @@ -538,11 +538,11 @@ func TestRefreshContextRegistry_NoOpWhenNothingChanged(t *testing.T) { } func TestRefreshContextRegistry_NilMtimeMapNoOp(t *testing.T) { - // Defensive: if perFileMtimes is nil (e.g. a code path forgot - // to initialise it after MergeAndSwitchContext promoted + // Defensive: if perFileMtimes is nil (e.g. a future code path + // forgets to initialise it after MergeAndSwitchContext promoted // single-file → isolated-load), refresh must not panic. The - // original v1 of this PR wrote `fileMtimes[path] = mtime` and - // would have hit "assignment to entry in nil map". + // production path already nil-inits before calling, but the + // helper is exported and we want depth. dir := t.TempDir() f1 := writeKubeconfig(t, dir, "a.yaml", "ctx-a", []kubeEntry{ {ctxName: "ctx-a", userName: "u", clusterName: "c1"}, @@ -553,22 +553,54 @@ func TestRefreshContextRegistry_NilMtimeMapNoOp(t *testing.T) { t.Fatalf("refresh panicked on nil mtimes: %v", r) } }() - // Pass nil — refresh must initialise it OR no-op cleanly. The - // PRODUCTION code seeds it before calling refresh; this test - // is the belt-and-braces guard that the helper itself doesn't - // crash if a future caller forgets. var nilMtimes map[string]time.Time - if nilMtimes != nil { - t.Fatal("setup: nilMtimes should be nil") - } - // Production wraps refresh in a nil-init, but if a future - // refactor stops doing that we want refresh itself to survive. - // We simulate that by passing an empty map (refresh always - // writes, never reads as a precondition). - emptyMtimes := make(map[string]time.Time) - refreshContextRegistry(registry, fileConfigs, emptyMtimes) - if _, ok := emptyMtimes[f1]; !ok { - t.Errorf("refresh should have populated emptyMtimes for %s", filepath.Base(f1)) + changed := refreshContextRegistry(registry, fileConfigs, nilMtimes) + if changed { + t.Errorf("refresh on nil mtimes should report no-op (changed=false)") + } + // Registry must be untouched on the nil-mtimes no-op path. + if _, ok := registry["ctx-a"]; !ok { + t.Errorf("nil-mtimes refresh should not have modified the registry") + } +} + +func TestRefreshContextRegistry_SeedsByFileFromMtimesEvenWhenRegistryEmptyForFile(t *testing.T) { + // Regression: if every context in a file got removed by a + // previous refresh, the file path stayed in fileMtimes but + // wasn't in the registry — so the next refresh's byFile only + // included paths still represented in the registry, and the + // emptied file would never be re-stat'd. Any new contexts + // later added to that file would be invisible until restart. 
+ dir := t.TempDir() + f1 := writeKubeconfig(t, dir, "a.yaml", "ctx-a", []kubeEntry{ + {ctxName: "ctx-a", userName: "u", clusterName: "c1"}, + }) + registry, fileConfigs := buildContextRegistry([]string{f1}) + mtimes := map[string]time.Time{} + for _, p := range []string{f1} { + if info, err := os.Stat(p); err == nil { + mtimes[p] = info.ModTime() + } + } + // Simulate "all contexts in f1 were removed by a prior refresh" + // while leaving the mtime cache intact. + delete(registry, "ctx-a") + if len(registry) != 0 { + t.Fatalf("setup: expected empty registry, got %v", registry) + } + // Wait, then rewrite f1 to add a brand-new context. The mtime + // cache will still hold the OLD timestamp, so refresh should + // see the file as changed and rebuild it. + rewriteKubeconfig(t, f1, []kubeEntry{ + {ctxName: "ctx-a-fresh", userName: "u", clusterName: "c1"}, + {ctxName: "ctx-b-fresh", userName: "u", clusterName: "c1"}, + }) + refreshContextRegistry(registry, fileConfigs, mtimes) + if _, ok := registry["ctx-a-fresh"]; !ok { + t.Errorf("refresh should have picked up ctx-a-fresh; got %v", registry) + } + if _, ok := registry["ctx-b-fresh"]; !ok { + t.Errorf("refresh should have picked up ctx-b-fresh; got %v", registry) } } From 43c2cf58c6591605fd17e03bfba014eba443cf1a Mon Sep 17 00:00:00 2001 From: eliran-mic Date: Sat, 2 May 2026 23:09:20 +0300 Subject: [PATCH 4/4] fix(contexts): atomic map swap in refresh to fix snapshot-reader race refreshContextRegistry mutated contextRegistry, perFileConfigs, and perFileMtimes in place. SwitchContext and WriteKubeconfigForCurrentContext take bare references under RLock and iterate after the unlock; a concurrent GetAvailableContexts caller running refresh under the write lock would race with that iteration and trigger Go's "concurrent map read and map write" fatal error. Refactor refresh to build new maps and return them. Callers (GetAvailableContexts) atomically swap the package globals when something changed. The maps the snapshot readers captured stay frozen, restoring the post-init "publish once, never modify" invariant. Add a regression test that runs writers triggering refresh through GetAvailableContexts alongside snapshotters mimicking SwitchContext's RLock + post-unlock iterate pattern; -race must stay clean. Addresses Cursor bugbot findings #1 and #6 on PR #595. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/k8s/client.go | 46 ++++--- internal/k8s/context_registry.go | 90 ++++++++---- internal/k8s/context_registry_test.go | 189 ++++++++++++++++++++++---- 3 files changed, 251 insertions(+), 74 deletions(-) diff --git a/internal/k8s/client.go b/internal/k8s/client.go index 8114f2d0c..3000ea845 100644 --- a/internal/k8s/client.go +++ b/internal/k8s/client.go @@ -706,33 +706,45 @@ func GetAvailableContexts() ([]ContextInfo, error) { // startup keep showing up in the dropdown until the user // restarts Radar (the "junk clusters" complaint). // - // CRITICAL: refresh, snapshot, AND iterate must all happen under - // the same lock. The previous shape released the lock and then - // iterated, but `registry` and `fileConfigs` still pointed at the - // LIVE maps that a second concurrent caller could be mutating - // via refreshContextRegistry — Go's "fatal error: concurrent map - // read and map write". Hold the write lock through the snapshot, - // then iterate over a private []ContextInfo we've already built. + // refreshContextRegistry returns NEW maps when anything changes, + // so we publish them atomically under the write lock. 
Snapshot + // readers (SwitchContext, WriteKubeconfigForCurrentContext) take + // bare references under RLock and use them after the unlock — that + // pattern is only safe as long as the maps they captured are never + // mutated. Returning fresh maps preserves that invariant. clientMu.Lock() if contextRegistry != nil { - // Lazy init: MergeAndSwitchContext promotes single-file mode - // to isolated-load mode without touching perFileMtimes, so a - // CAPI promotion can leave it nil. Seeding it here is safe - // because we always hold the write lock. + // Lazy init: a future code path that promotes single-file mode + // to isolated-load without touching perFileMtimes would leave + // it nil. Seeding it here is safe because we always hold the + // write lock and refresh's nil guard catches it too. if perFileMtimes == nil { perFileMtimes = make(map[string]time.Time, len(perFileConfigs)) } - refreshContextRegistry(contextRegistry, perFileConfigs, perFileMtimes) + newRegistry, newFileConfigs, newFileMtimes, changed := refreshContextRegistry( + contextRegistry, perFileConfigs, perFileMtimes, + ) + if changed { + contextRegistry = newRegistry + perFileConfigs = newFileConfigs + perFileMtimes = newFileMtimes + } } + registry := contextRegistry + fileConfigs := perFileConfigs currentCtx := contextName + clientMu.Unlock() - if contextRegistry != nil { + if registry != nil { // Isolated-load mode: enumerate every registered context, pulling // cluster/user/namespace from the file it originally lives in. // No merge happens — shared names across files stay distinct. - contexts := make([]ContextInfo, 0, len(contextRegistry)) - for qName, entry := range contextRegistry { - cfg, ok := perFileConfigs[entry.SourceFile] + // Iterating outside the lock is safe because refresh publishes + // fresh maps on change rather than mutating in place, so the + // snapshot we captured is frozen. + contexts := make([]ContextInfo, 0, len(registry)) + for qName, entry := range registry { + cfg, ok := fileConfigs[entry.SourceFile] if !ok { continue } @@ -748,10 +760,8 @@ func GetAvailableContexts() ([]ContextInfo, error) { IsCurrent: qName == currentCtx, }) } - clientMu.Unlock() return contexts, nil } - clientMu.Unlock() // Single-file fallback: load the one file and enumerate its contexts. kubeconfig := kubeconfigPath diff --git a/internal/k8s/context_registry.go b/internal/k8s/context_registry.go index 05970d8c1..1ee639acb 100644 --- a/internal/k8s/context_registry.go +++ b/internal/k8s/context_registry.go @@ -176,8 +176,9 @@ func pickInitialContext( // refreshContextRegistry reconciles the in-memory contextRegistry + // perFileConfigs against what's actually on disk RIGHT NOW. Returns -// whether anything changed (so the caller can decide whether to log -// or invalidate downstream caches). +// new map values (registry, fileConfigs, fileMtimes) plus a `changed` +// flag. When `changed` is false the returned maps are guaranteed to +// equal the inputs and callers can keep the originals. // // The original registry is built ONCE in setupIsolatedLoad and was // never refreshed in multi-file mode. That left the cluster @@ -194,25 +195,32 @@ func pickInitialContext( // just incremental — it ONLY touches files whose mtime moved or // whose path no longer exists on disk. // -// Concurrency: caller MUST hold clientMu (write lock). The helper -// rebuilds shared map values, so it's not safe to interleave with -// GetAvailableContexts readers. 
+// Concurrency: returns NEW maps instead of mutating in place so +// callers (GetAvailableContexts under Lock, plus snapshot-style +// readers like SwitchContext under RLock) can atomically swap the +// package globals. The maps the inputs point at are never mutated, +// preserving the post-init "publish once, never modify" invariant +// that the snapshot-then-read pattern relies on. func refreshContextRegistry( registry map[string]contextEntry, fileConfigs map[string]*clientcmdapi.Config, fileMtimes map[string]time.Time, -) bool { +) ( + map[string]contextEntry, + map[string]*clientcmdapi.Config, + map[string]time.Time, + bool, +) { if registry == nil { - return false + return registry, fileConfigs, fileMtimes, false } - // Defensive nil-check on fileMtimes: callers (GetAvailableContexts) - // already lazy-init this, but the helper is exported and a future - // caller that forgets would otherwise panic on the first - // `fileMtimes[path] = mtime` write below. + // Defensive nil-check on fileMtimes: callers + // (GetAvailableContexts) already lazy-init this, but the helper + // is exported and a future caller that forgets would otherwise + // panic on the first `fileMtimes[path] = mtime` write below. if fileMtimes == nil { - return false + return registry, fileConfigs, fileMtimes, false } - changed := false // Group registry entries by source file so we can decide // per-file: keep, re-parse, or drop everything pointing at it. // Seed byFile from BOTH the registry AND fileMtimes — if a @@ -231,6 +239,35 @@ func refreshContextRegistry( byFile[path] = nil } } + // Walk each file once, deciding whether to keep / drop / re-parse. + // We start from a lazy "no changes" state and only allocate fresh + // maps when something actually changes. This keeps the steady-state + // cost (most calls find nothing to do) at a few stat()s. + newRegistry := registry + newFileConfigs := fileConfigs + newFileMtimes := fileMtimes + changed := false + cloneOnce := func() { + if changed { + return + } + changed = true + nr := make(map[string]contextEntry, len(registry)) + for k, v := range registry { + nr[k] = v + } + nfc := make(map[string]*clientcmdapi.Config, len(fileConfigs)) + for k, v := range fileConfigs { + nfc[k] = v + } + nm := make(map[string]time.Time, len(fileMtimes)) + for k, v := range fileMtimes { + nm[k] = v + } + newRegistry = nr + newFileConfigs = nfc + newFileMtimes = nm + } for path, qNames := range byFile { info, statErr := os.Stat(path) if statErr != nil { @@ -238,12 +275,12 @@ func refreshContextRegistry( // entry pointing at it AND its cached config. This // is the CAPI-cluster-destroyed and // "user removed file from kubeconfig dir" cases. + cloneOnce() for _, qName := range qNames { - delete(registry, qName) + delete(newRegistry, qName) } - delete(fileConfigs, path) - delete(fileMtimes, path) - changed = true + delete(newFileConfigs, path) + delete(newFileMtimes, path) continue } mtime := info.ModTime() @@ -266,8 +303,9 @@ func refreshContextRegistry( filepath.Base(path), scrubPathError(err)) continue } - fileConfigs[path] = cfg - fileMtimes[path] = mtime + cloneOnce() + newFileConfigs[path] = cfg + newFileMtimes[path] = mtime // Replace this file's entries in the registry. Names that // are no longer in the file get dropped; new ones are added. 
liveNames := make(map[string]struct{}, len(cfg.Contexts)) @@ -275,9 +313,8 @@ func refreshContextRegistry( liveNames[name] = struct{}{} } for _, qName := range qNames { - if _, alive := liveNames[registry[qName].InFileName]; !alive { - delete(registry, qName) - changed = true + if _, alive := liveNames[newRegistry[qName].InFileName]; !alive { + delete(newRegistry, qName) } } // Add any contexts that are new in this file. We deliberately @@ -286,7 +323,7 @@ func refreshContextRegistry( for name := range cfg.Contexts { already := false for _, qName := range qNames { - if e, ok := registry[qName]; ok && e.SourceFile == path && e.InFileName == name { + if e, ok := newRegistry[qName]; ok && e.SourceFile == path && e.InFileName == name { already = true break } @@ -294,15 +331,14 @@ func refreshContextRegistry( if already { continue } - qName := qualifyContextName(registry, name, path) - registry[qName] = contextEntry{ + qName := qualifyContextName(newRegistry, name, path) + newRegistry[qName] = contextEntry{ SourceFile: path, InFileName: name, } - changed = true } } - return changed + return newRegistry, newFileConfigs, newFileMtimes, changed } // aggregateExecPluginCommands walks every context across every per-file diff --git a/internal/k8s/context_registry_test.go b/internal/k8s/context_registry_test.go index 5f325096d..85d5179f8 100644 --- a/internal/k8s/context_registry_test.go +++ b/internal/k8s/context_registry_test.go @@ -4,6 +4,8 @@ import ( "os" "path/filepath" "sort" + "sync" + "sync/atomic" "testing" "time" @@ -441,22 +443,34 @@ func TestRefreshContextRegistry_DropsRemovedFile(t *testing.T) { t.Fatalf("remove fixture: %v", err) } - changed := refreshContextRegistry(registry, fileConfigs, mtimes) + newRegistry, newFileConfigs, newMtimes, changed := refreshContextRegistry(registry, fileConfigs, mtimes) if !changed { t.Errorf("expected refresh to report a change after deleting %s", filepath.Base(f2)) } - if _, ok := registry["ctx-doomed"]; ok { - t.Errorf("ctx-doomed still in registry after file removed: %v", keysOf(registry)) + if _, ok := newRegistry["ctx-doomed"]; ok { + t.Errorf("ctx-doomed still in registry after file removed: %v", keysOf(newRegistry)) } - if _, ok := registry["ctx-alive"]; !ok { - t.Errorf("ctx-alive should still be in registry: %v", keysOf(registry)) + if _, ok := newRegistry["ctx-alive"]; !ok { + t.Errorf("ctx-alive should still be in registry: %v", keysOf(newRegistry)) } - if _, ok := fileConfigs[f2]; ok { + if _, ok := newFileConfigs[f2]; ok { t.Errorf("perFileConfigs still has entry for removed file %s", filepath.Base(f2)) } - if _, ok := mtimes[f2]; ok { + if _, ok := newMtimes[f2]; ok { t.Errorf("perFileMtimes still has entry for removed file %s", filepath.Base(f2)) } + // Original maps must be untouched — refresh returns fresh maps so + // snapshot readers (SwitchContext, WriteKubeconfigForCurrentContext) + // can iterate the captured maps without locking. 
+ if _, ok := registry["ctx-doomed"]; !ok { + t.Errorf("input registry was mutated; expected immutability") + } + if _, ok := fileConfigs[f2]; !ok { + t.Errorf("input fileConfigs was mutated; expected immutability") + } + if _, ok := mtimes[f2]; !ok { + t.Errorf("input mtimes was mutated; expected immutability") + } } func TestRefreshContextRegistry_DropsContextRemovedFromFile(t *testing.T) { @@ -478,15 +492,18 @@ func TestRefreshContextRegistry_DropsContextRemovedFromFile(t *testing.T) { {ctxName: "ctx-keep", userName: "u", clusterName: "c1"}, }) - changed := refreshContextRegistry(registry, fileConfigs, mtimes) + newRegistry, _, _, changed := refreshContextRegistry(registry, fileConfigs, mtimes) if !changed { t.Errorf("expected refresh to report a change after rewriting %s", filepath.Base(f1)) } - if _, ok := registry["ctx-delete"]; ok { - t.Errorf("ctx-delete still in registry after rewrite: %v", keysOf(registry)) + if _, ok := newRegistry["ctx-delete"]; ok { + t.Errorf("ctx-delete still in registry after rewrite: %v", keysOf(newRegistry)) } - if _, ok := registry["ctx-keep"]; !ok { - t.Errorf("ctx-keep should still be in registry: %v", keysOf(registry)) + if _, ok := newRegistry["ctx-keep"]; !ok { + t.Errorf("ctx-keep should still be in registry: %v", keysOf(newRegistry)) + } + if _, ok := registry["ctx-delete"]; !ok { + t.Errorf("input registry was mutated; expected immutability") } } @@ -505,15 +522,15 @@ func TestRefreshContextRegistry_PicksUpNewContextInSameFile(t *testing.T) { {ctxName: "ctx-new", userName: "u", clusterName: "c2"}, }) - changed := refreshContextRegistry(registry, fileConfigs, mtimes) + newRegistry, _, _, changed := refreshContextRegistry(registry, fileConfigs, mtimes) if !changed { t.Errorf("expected refresh to report a change after add") } - if _, ok := registry["ctx-new"]; !ok { - t.Errorf("ctx-new not picked up after refresh: %v", keysOf(registry)) + if _, ok := newRegistry["ctx-new"]; !ok { + t.Errorf("ctx-new not picked up after refresh: %v", keysOf(newRegistry)) } - if _, ok := registry["ctx-original"]; !ok { - t.Errorf("ctx-original disappeared from registry: %v", keysOf(registry)) + if _, ok := newRegistry["ctx-original"]; !ok { + t.Errorf("ctx-original disappeared from registry: %v", keysOf(newRegistry)) } } @@ -525,11 +542,11 @@ func TestRefreshContextRegistry_NoOpWhenNothingChanged(t *testing.T) { registry, fileConfigs, mtimes := loadFixture(t, []string{f1}) before := keysOf(registry) - changed := refreshContextRegistry(registry, fileConfigs, mtimes) + newRegistry, _, _, changed := refreshContextRegistry(registry, fileConfigs, mtimes) if changed { t.Errorf("expected no change on stable disk state, got changed=true") } - after := keysOf(registry) + after := keysOf(newRegistry) sort.Strings(before) sort.Strings(after) if len(before) != len(after) { @@ -554,12 +571,12 @@ func TestRefreshContextRegistry_NilMtimeMapNoOp(t *testing.T) { } }() var nilMtimes map[string]time.Time - changed := refreshContextRegistry(registry, fileConfigs, nilMtimes) + newRegistry, _, _, changed := refreshContextRegistry(registry, fileConfigs, nilMtimes) if changed { t.Errorf("refresh on nil mtimes should report no-op (changed=false)") } // Registry must be untouched on the nil-mtimes no-op path. 
- if _, ok := registry["ctx-a"]; !ok { + if _, ok := newRegistry["ctx-a"]; !ok { t.Errorf("nil-mtimes refresh should not have modified the registry") } } @@ -595,12 +612,12 @@ func TestRefreshContextRegistry_SeedsByFileFromMtimesEvenWhenRegistryEmptyForFil {ctxName: "ctx-a-fresh", userName: "u", clusterName: "c1"}, {ctxName: "ctx-b-fresh", userName: "u", clusterName: "c1"}, }) - refreshContextRegistry(registry, fileConfigs, mtimes) - if _, ok := registry["ctx-a-fresh"]; !ok { - t.Errorf("refresh should have picked up ctx-a-fresh; got %v", registry) + newRegistry, _, _, _ := refreshContextRegistry(registry, fileConfigs, mtimes) + if _, ok := newRegistry["ctx-a-fresh"]; !ok { + t.Errorf("refresh should have picked up ctx-a-fresh; got %v", newRegistry) } - if _, ok := registry["ctx-b-fresh"]; !ok { - t.Errorf("refresh should have picked up ctx-b-fresh; got %v", registry) + if _, ok := newRegistry["ctx-b-fresh"]; !ok { + t.Errorf("refresh should have picked up ctx-b-fresh; got %v", newRegistry) } } @@ -620,9 +637,9 @@ func TestRefreshContextRegistry_BadParseDoesNotDropExisting(t *testing.T) { t.Fatalf("rewrite: %v", err) } - refreshContextRegistry(registry, fileConfigs, mtimes) - if _, ok := registry["ctx-a"]; !ok { - t.Errorf("ctx-a was dropped on parse failure; expected to keep it: %v", keysOf(registry)) + newRegistry, _, _, _ := refreshContextRegistry(registry, fileConfigs, mtimes) + if _, ok := newRegistry["ctx-a"]; !ok { + t.Errorf("ctx-a was dropped on parse failure; expected to keep it: %v", keysOf(newRegistry)) } } @@ -669,6 +686,120 @@ func rewriteKubeconfig(t *testing.T, path string, entries []kubeEntry) { } } +// Regression: GetAvailableContexts triggers refreshContextRegistry under +// the write lock; concurrent callers that snapshot the live maps under +// RLock and iterate after unlocking must not race with the refresh. The +// previous shape mutated maps in place, so a refresh during another +// caller's post-RLock iteration triggered Go's "concurrent map read and +// map write" panic. The fix swaps maps atomically rather than mutating — +// this test runs both patterns concurrently with the race detector. +func TestGetAvailableContexts_ConcurrentRefreshAndSnapshotIterate(t *testing.T) { + dir := t.TempDir() + f1 := writeKubeconfig(t, dir, "a.yaml", "ctx-a", []kubeEntry{ + {ctxName: "ctx-a", userName: "u", clusterName: "c1"}, + }) + f2 := writeKubeconfig(t, dir, "b.yaml", "ctx-b", []kubeEntry{ + {ctxName: "ctx-b", userName: "u", clusterName: "c2"}, + }) + + // Stand up the package globals to look like multi-file isolated-load + // mode. Restore them after the test so other tests in this package + // don't see a polluted state. 
+ clientMu.Lock() + prevRegistry := contextRegistry + prevConfigs := perFileConfigs + prevMtimes := perFileMtimes + prevPaths := kubeconfigPaths + prevName := contextName + registry, fileConfigs := buildContextRegistry([]string{f1, f2}) + mtimes := make(map[string]time.Time, 2) + for _, p := range []string{f1, f2} { + if info, err := os.Stat(p); err == nil { + mtimes[p] = info.ModTime() + } + } + contextRegistry = registry + perFileConfigs = fileConfigs + perFileMtimes = mtimes + kubeconfigPaths = []string{f1, f2} + contextName = "ctx-a" + clientMu.Unlock() + t.Cleanup(func() { + clientMu.Lock() + contextRegistry = prevRegistry + perFileConfigs = prevConfigs + perFileMtimes = prevMtimes + kubeconfigPaths = prevPaths + contextName = prevName + clientMu.Unlock() + }) + + const iterations = 200 + const writers = 4 + const snapshotters = 4 + var wg sync.WaitGroup + var stop atomic.Bool + + // Writer goroutines: rewrite kubeconfig files on disk so the next + // GetAvailableContexts call observes a changed mtime and re-parses, + // exercising the refresh path that previously mutated maps in place. + for i := 0; i < writers; i++ { + wg.Add(1) + go func(idx int) { + defer wg.Done() + for j := 0; j < iterations && !stop.Load(); j++ { + target := f1 + if j%2 == 1 { + target = f2 + } + ctxBase := "ctx-a" + if target == f2 { + ctxBase = "ctx-b" + } + rewriteKubeconfig(t, target, []kubeEntry{ + {ctxName: ctxBase, userName: "u", clusterName: "c1"}, + }) + if _, err := GetAvailableContexts(); err != nil { + t.Errorf("GetAvailableContexts: %v", err) + stop.Store(true) + return + } + } + }(i) + } + + // Snapshotter goroutines: replicate SwitchContext / + // WriteKubeconfigForCurrentContext's bare-reference snapshot pattern, + // then iterate after releasing the lock. Without map immutability + // this races with refresh and panics. + for i := 0; i < snapshotters; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for j := 0; j < iterations && !stop.Load(); j++ { + clientMu.RLock() + snapReg := contextRegistry + snapConfigs := perFileConfigs + clientMu.RUnlock() + + // Iterate outside the lock — same shape as SwitchContext. + for qName, entry := range snapReg { + _ = qName + cfg, ok := snapConfigs[entry.SourceFile] + if !ok { + continue + } + for name := range cfg.Contexts { + _ = name + } + } + } + }() + } + + wg.Wait() +} + func TestAggregateExecPluginCommands_UniqueAcrossFiles(t *testing.T) { dir := t.TempDir() // File 1: user 'oidc' with kubectl exec plugin.