From 78e965cfc4b56c021ce8032819bd35d39fb6195d Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 27 Apr 2026 01:04:00 +0300 Subject: [PATCH 1/4] cache: patience-window + minimal-set first paint, no hard timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the 60s blanket SyncTimeout for the connecting screen with a softer two-stage gate: 1. Patience window (8s) — wait for ALL critical informers. On the common path (fast cluster) nothing else triggers; first paint is complete and coherent. 2. Minimal-set fallback — once the patience window elapses, return as soon as pods/namespaces/nodes/services/deployments are synced. Any other critical informer still loading (ingresses, jobs, etc.) is promoted to deferred and joins the topology when ready. No hard timeout: deferred informers retry indefinitely. The user-facing "give up" semantic that the 60s cap implied was always misleading — informers stayed subscribed regardless. The patience window is a "render with what you have" boundary, not a failure cliff. Cache progress is wired into the connection's progressMessage so the connecting screen shows "Loading cluster data… X of Y ready" instead of a static "Loading workloads…". After first paint, slower informers arrive coalesced into 5s windows (was 3s) to keep the topology graph from jittering, and the home view shows a small inline indicator listing the kinds still loading. Mechanism lives in pkg/k8score (CacheConfig.PatienceWindow, CacheConfig.MinimalSet, CacheConfig.SyncProgress); skyhook-connector's legacy SyncTimeout path is unchanged. --- internal/k8s/cache.go | 57 +++++++++- internal/server/sse.go | 12 +- pkg/k8score/cache.go | 164 ++++++++++++++++++++++----- pkg/k8score/cache_test.go | 112 ++++++++++++++++++ pkg/k8score/types.go | 30 ++++- web/src/components/home/HomeView.tsx | 14 ++- 6 files changed, 353 insertions(+), 36 deletions(-) diff --git a/internal/k8s/cache.go b/internal/k8s/cache.go index 18186f391..314389a2f 100644 --- a/internal/k8s/cache.go +++ b/internal/k8s/cache.go @@ -65,6 +65,31 @@ var deferredResources = map[string]bool{ "limitranges": true, // audit inheritance lookups, not first-render } +// minimalFirstPaintSet is the irreducible subset of critical informers the +// home dashboard needs to feel coherent. Once these are synced (and the +// patience window has elapsed), the cache returns and slower critical +// informers (ingresses, jobs, cronjobs, etc.) continue loading in the +// background, joining the topology when they arrive. +// +// Pods are deliberately included even though they are typically the +// largest kind — without pods the topology and resource counts are +// essentially empty. The patience window absorbs pod-sync latency on +// most clusters; only when it doesn't do we render with what we have. +var minimalFirstPaintSet = map[string]bool{ + "pods": true, + "namespaces": true, + "nodes": true, + "services": true, + "deployments": true, +} + +// firstPaintPatience is how long we wait for ALL critical informers before +// falling back to the minimal set. On most clusters the full critical set +// syncs well inside this window, so first paint is complete and there is +// no progressive fill-in. Slow clusters fall through to the minimal-set +// gate and render with whatever is ready then. +const firstPaintPatience = 8 * time.Second + // ResourceChange is a type alias for the canonical definition in pkg/k8score. 
type ResourceChange = k8score.ResourceChange @@ -139,7 +164,9 @@ func InitResourceCache(ctx context.Context) error { Namespace: permResult.Namespace, DebugEvents: DebugEvents, TimingLogger: logTiming, - SyncTimeout: 60 * time.Second, + PatienceWindow: firstPaintPatience, + MinimalSet: minimalFirstPaintSet, + SyncProgress: emitSyncProgress, DeferredSyncTimeout: 3 * time.Minute, OnReceived: func(kind string) { @@ -264,6 +291,34 @@ func recordK8sEventToTimeline(obj any) { } } +// emitSyncProgress is the SyncProgress callback wired into the resource +// cache. It keeps the connection's progressMessage in step with the live +// informer-sync count so the connecting screen shows "Loading cluster +// data… X of Y resource types ready" instead of a frozen "Loading +// workloads…". After first paint (minimalReady=true), the cache returns +// and the connection state flips to "connected" — further progress lives +// in the deferred-loading indicator on the home dashboard. +func emitSyncProgress(synced, total int, minimalReady bool) { + if total == 0 { + return + } + // Only update while we're still in the connecting phase. Once + // connected, the connecting screen is gone and the message is moot. + if GetConnectionStatus().State != StateConnecting { + return + } + var msg string + switch { + case synced == total: + msg = "Finalizing…" + case minimalReady: + msg = fmt.Sprintf("Loading cluster data… %d of %d ready (showing partial)", synced, total) + default: + msg = fmt.Sprintf("Loading cluster data… %d of %d ready", synced, total) + } + UpdateConnectionProgress(msg) +} + // isNoisyResource returns true if this resource generates constant updates that aren't interesting func isNoisyResource(kind, name, op string) bool { if op != "update" { diff --git a/internal/server/sse.go b/internal/server/sse.go index 1be039f30..5ab544e4c 100644 --- a/internal/server/sse.go +++ b/internal/server/sse.go @@ -405,12 +405,16 @@ func (b *SSEBroadcaster) watchResourceChanges() { } // Debounce strategy: - // - During warmup (initial sync + CRD discovery): 3s to avoid constant - // topology rebuilds from dynamic informer syncs. The UI shows a connecting - // spinner anyway, so the delay is invisible. + // - During warmup: critical informers that didn't make the patience + // window (e.g. ingresses, jobs, replicasets) plus deferred informers + // plus dynamic CRD informers all stream in over the next 10–60s. We + // want the topology graph to settle in a few coherent paints, not + // jump on every arrival, so we coalesce into 5s windows. The UI is + // already on the home view by this point with a "loading more" hint; + // the slight delay is preferable to a fidgety graph. // - After warmup: re-evaluate based on cluster size. Large clusters (>5000 // resources) use 5s; smaller clusters use 500ms. - const warmupDebounce = 3 * time.Second + const warmupDebounce = 5 * time.Second debounceDuration := warmupDebounce b.watchMu.Lock() warmupCh := b.warmupDone // local copy under lock; nil-ed after firing to avoid closed-channel spin diff --git a/pkg/k8score/cache.go b/pkg/k8score/cache.go index a7b2dc273..309d6db11 100644 --- a/pkg/k8score/cache.go +++ b/pkg/k8score/cache.go @@ -85,9 +85,12 @@ type informerSetup struct { // NewResourceCache creates and starts a ResourceCache from the given config. // Startup has three tiers: -// - Critical informers block until synced (or SyncTimeout, then promoted to deferred) -// - Deferred informers sync in the background after critical completes -// - Background informers (e.g. 
Events) sync independently on their own goroutine +// - Critical informers block until synced. If both PatienceWindow and +// MinimalSet are set, the cache returns as soon as the minimal set is +// ready *after* the patience window elapses (rest of critical promoted +// to deferred). Otherwise, falls back to the legacy SyncTimeout path. +// - Deferred informers sync in the background after critical completes. +// - Background informers (e.g. Events) sync independently on their own goroutine. func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { if cfg.Client == nil { return nil, fmt.Errorf("CacheConfig.Client must not be nil") @@ -273,22 +276,87 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { }() } - // Phase 1: Wait for critical informers with periodic progress logging. - // Log every 5s so stuck syncs are visible in logs (previously silent). + // Phase 1: Wait for critical informers. + // + // Two modes: + // - Patience+MinimalSet (preferred): wait for ALL critical until the + // patience window elapses. Once elapsed, return as soon as the + // minimal set is ready; promote the rest of critical to deferred. + // Keep waiting indefinitely until MinimalSet is satisfied — there + // is no hard timeout, the loading UI handles the wait. + // - SyncTimeout (legacy): wait for ALL critical, hard-cap at the + // timeout, promote everything still pending. + useMinimalSet := cfg.PatienceWindow > 0 && len(cfg.MinimalSet) > 0 timedOut := false + patienceElapsed := false if len(criticalSyncFuncs) > 0 { - progressTicker := time.NewTicker(5 * time.Second) + // Build the index of "must-sync" entries for the minimal-set check. + // Filter to entries that are critical AND in MinimalSet (deferred + // types or unknown keys don't gate first paint). + var minimalEntries []informerEntry + if useMinimalSet { + for _, e := range allEntries { + if e.deferred { + continue + } + if cfg.MinimalSet[e.key] { + minimalEntries = append(minimalEntries, e) + } + } + } + + progressTicker := time.NewTicker(1 * time.Second) defer progressTicker.Stop() + var patienceCh <-chan time.Time + if useMinimalSet { + patienceTimer := time.NewTimer(cfg.PatienceWindow) + defer patienceTimer.Stop() + patienceCh = patienceTimer.C + } + var deadlineCh <-chan time.Time - if cfg.SyncTimeout > 0 { + if !useMinimalSet && cfg.SyncTimeout > 0 { deadline := time.NewTimer(cfg.SyncTimeout) defer deadline.Stop() deadlineCh = deadline.C } + // emitProgress reports current sync progress to the application + // callback (typically wired to the connection's progressMessage). + emitProgress := func(minimalReady bool) { + if cfg.SyncProgress == nil { + return + } + rc.informerMu.RLock() + var synced, total int + for _, s := range rc.informerStatuses { + if s.Deferred { + continue + } + total++ + if s.Synced { + synced++ + } + } + rc.informerMu.RUnlock() + cfg.SyncProgress(synced, total, minimalReady) + } + + minimalReady := func() bool { + for _, e := range minimalEntries { + if !e.synced() { + return false + } + } + return true + } + + // Initial progress emission so UI shows "0/N" right away. + emitProgress(false) + for { - // Check if all critical informers are synced + // Check if all critical informers are synced — preferred exit. 
allSynced := true for _, fn := range criticalSyncFuncs { if !fn() { @@ -300,14 +368,23 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { break } - // Wait for either: stopCh, deadline, or next progress tick + // Once the patience window has elapsed, exit as soon as the + // minimal set is ready. Until then, keep waiting for everything. + if patienceElapsed && useMinimalSet && minimalReady() { + break + } + select { case <-stopCh: return nil, fmt.Errorf("failed to sync critical resource caches") + case <-patienceCh: + patienceElapsed = true + patienceCh = nil // disable further receives on closed timer case <-deadlineCh: timedOut = true case <-progressTicker.C: - // Log which informers are still pending, with item counts + // Log + emit progress callback. Log every tick; the application + // callback decides whether to surface anything (e.g. via SSE). counts := rc.GetKindObjectCounts() rc.informerMu.RLock() var synced, pendingParts []string @@ -323,8 +400,11 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { } } rc.informerMu.RUnlock() - stdlog.Printf("Critical sync progress: %d/%d synced (%.0fs elapsed) — pending: %s", - len(synced), len(synced)+len(pendingParts), time.Since(syncStart).Seconds(), strings.Join(pendingParts, ", ")) + if len(pendingParts) > 0 { + stdlog.Printf("Critical sync progress: %d/%d synced (%.0fs elapsed) — pending: %s", + len(synced), len(synced)+len(pendingParts), time.Since(syncStart).Seconds(), strings.Join(pendingParts, ", ")) + } + emitProgress(useMinimalSet && patienceElapsed && minimalReady()) default: time.Sleep(100 * time.Millisecond) } @@ -334,34 +414,60 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { } } - if timedOut { - // Promote unsynced critical informers to deferred so they continue - // syncing in the background. The UI renders with partial data. - var promoted []string - rc.informerMu.Lock() - for i, e := range allEntries { - if e.deferred || e.synced() { - continue - } - promoted = append(promoted, e.kind) - // Add to deferred tracking (the informer is already running from Phase 1) - deferredSyncFuncs = append(deferredSyncFuncs, e.synced) - deferredKeys = append(deferredKeys, e.key) - // Mark as deferred in diagnostics - rc.informerStatuses[i].Deferred = true + // Promotion: any critical informer not yet synced gets reclassified as + // deferred so the cache can return. Applies to both the SyncTimeout + // (legacy) path and the MinimalSet (new) path. When everything synced + // in time, this is a no-op. 
+ var promoted []string + rc.informerMu.Lock() + for i, e := range allEntries { + if e.deferred || e.synced() { + continue } - rc.informerMu.Unlock() + promoted = append(promoted, e.kind) + deferredSyncFuncs = append(deferredSyncFuncs, e.synced) + deferredKeys = append(deferredKeys, e.key) + rc.informerStatuses[i].Deferred = true + } + rc.informerMu.Unlock() + + switch { + case timedOut: stdlog.Printf("WARNING: Critical sync timed out after %v — promoting %d informers to deferred: %s", cfg.SyncTimeout, len(promoted), strings.Join(promoted, ", ")) stdlog.Printf("UI will render with partial data; promoted informers continue syncing in background") logf(" Phase 1 sync TIMED OUT (%d critical, %d promoted to deferred): %v", len(criticalSyncFuncs), len(promoted), time.Since(syncStart)) rc.promotedKinds = promoted - } else { + case patienceElapsed && len(promoted) > 0: + stdlog.Printf("First-paint ready after %v: minimal set synced; %d slower informers continue in background: %s", + time.Since(syncStart), len(promoted), strings.Join(promoted, ", ")) + logf(" Phase 1 minimal-set sync (%d/%d critical, %d still loading): %v", + len(criticalSyncFuncs)-len(promoted), len(criticalSyncFuncs), len(promoted), time.Since(syncStart)) + rc.promotedKinds = promoted + default: logf(" Phase 1 sync (%d critical informers): %v", len(criticalSyncFuncs), time.Since(syncStart)) stdlog.Printf("Critical resource caches synced in %v — UI can render", time.Since(syncStart)) } + // Final progress emission so the UI's "X of Y ready" indicator settles + // to the correct state immediately upon return. + if cfg.SyncProgress != nil { + rc.informerMu.RLock() + var synced, total int + for _, s := range rc.informerStatuses { + if s.Deferred { + continue + } + total++ + if s.Synced { + synced++ + } + } + rc.informerMu.RUnlock() + cfg.SyncProgress(synced, total, true) + } + // Log per-type resource counts for startup diagnostics if counts := rc.GetKindObjectCounts(); len(counts) > 0 { total := 0 diff --git a/pkg/k8score/cache_test.go b/pkg/k8score/cache_test.go index 17e20621d..0699da2ca 100644 --- a/pkg/k8score/cache_test.go +++ b/pkg/k8score/cache_test.go @@ -176,6 +176,118 @@ func TestNewResourceCache_DeferredSync_PartialFailure(t *testing.T) { } } +// TestNewResourceCache_MinimalSet_AllFast verifies that when the patience +// window is set but every informer syncs quickly, NewResourceCache returns +// via the all-critical-synced path (not the minimal-set fallback). No +// informers should be promoted, syncProgress should fire to completion. +func TestNewResourceCache_MinimalSet_AllFast(t *testing.T) { + client := fake.NewSimpleClientset() + + var lastSynced, lastTotal int + var lastReady bool + var progMu sync.Mutex + + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + Services: true, + Deployments: true, + Ingresses: true, + }, + PatienceWindow: 2 * time.Second, + MinimalSet: map[string]bool{ + Pods: true, + Services: true, + }, + SyncProgress: func(synced, total int, ready bool) { + progMu.Lock() + lastSynced, lastTotal, lastReady = synced, total, ready + progMu.Unlock() + }, + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + // All four critical informers should be synced — no promotion. 
+ if got := rc.PromotedKinds(); len(got) != 0 { + t.Errorf("expected no promoted kinds when all critical synced fast, got %v", got) + } + + progMu.Lock() + defer progMu.Unlock() + if lastTotal != 4 { + t.Errorf("expected SyncProgress.total=4, got %d", lastTotal) + } + if lastSynced != 4 { + t.Errorf("expected SyncProgress.synced=4, got %d", lastSynced) + } + if !lastReady { + t.Error("expected final SyncProgress to report minimalReady=true") + } +} + +// TestNewResourceCache_MinimalSet_Promotion verifies the slow-cluster path: +// when a non-minimal critical informer can't sync within the patience +// window but minimal-set members are ready, NewResourceCache returns and +// promotes the slow informer to deferred. The promoted informer must +// continue running and eventually become available. +func TestNewResourceCache_MinimalSet_Promotion(t *testing.T) { + client := fake.NewSimpleClientset() + + // Make ingresses LIST fail. The reflector retries with backoff, so + // HasSynced() stays false, but the failure returns immediately — + // no shared tracker lock held, pods/services LIST proceed normally. + // Ingress is critical (not in MinimalSet) so it should be the one + // that gets promoted. + client.PrependReactor("list", "ingresses", func(action k8stesting.Action) (bool, runtime.Object, error) { + return true, nil, fmt.Errorf("simulated slow API: ingresses unavailable") + }) + + start := time.Now() + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + Services: true, + Ingresses: true, + }, + PatienceWindow: 200 * time.Millisecond, + MinimalSet: map[string]bool{ + Pods: true, + Services: true, + }, + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + elapsed := time.Since(start) + if elapsed < 200*time.Millisecond { + t.Errorf("expected return after at least the patience window (200ms), got %v", elapsed) + } + // Cap upper bound generously: patience window + a few jitter ticks. + if elapsed > 3*time.Second { + t.Errorf("returned far later than patience window — minimal-set fallback didn't trigger? elapsed=%v", elapsed) + } + + promoted := rc.PromotedKinds() + if len(promoted) != 1 || promoted[0] != "Ingress" { + t.Errorf("expected Ingress to be promoted, got %v", promoted) + } + + // Minimal-set listers should be available immediately. + if rc.Pods() == nil { + t.Error("expected Pods() lister available after first paint") + } + if rc.Services() == nil { + t.Error("expected Services() lister available after first paint") + } +} + func TestNewResourceCache_Callbacks(t *testing.T) { // Pre-create a pod so the informer fires an add event pod := &corev1.Pod{ diff --git a/pkg/k8score/types.go b/pkg/k8score/types.go index 928aa92bc..154c91787 100644 --- a/pkg/k8score/types.go +++ b/pkg/k8score/types.go @@ -144,9 +144,37 @@ type CacheConfig struct { // SyncTimeout is the maximum time to wait for critical informers to sync // before proceeding with partial data. Unsynced critical informers are // promoted to deferred and continue syncing in the background. - // Zero means wait indefinitely (original behavior). + // Zero means wait indefinitely (original behavior). Prefer PatienceWindow + // + MinimalSet for application-driven first-paint, this remains for + // backwards compat (skyhook-connector still uses it). SyncTimeout time.Duration + // PatienceWindow is the soft deadline after which the cache returns as + // soon as MinimalSet (below) is fully synced. 
Critical informers not in + // MinimalSet that are still syncing are promoted to deferred so the + // caller can render a useful first paint while the rest stream in. + // + // When 0 or when MinimalSet is empty, this falls back to the legacy + // behavior controlled by SyncTimeout. + PatienceWindow time.Duration + + // MinimalSet is the subset of critical resource types (keyed by + // ResourceType, e.g. Pods, Services) that the application considers + // the irreducible minimum for a useful first render. Once these are + // synced AND the patience window has elapsed, NewResourceCache + // returns even if other critical informers are still syncing. + // + // Keys must reference types also present in ResourceTypes. Members + // that are deferred or absent are ignored. + MinimalSet map[string]bool + + // SyncProgress is invoked roughly every second during the critical + // sync phase with the current progress. It is also invoked once when + // first paint becomes ready (allCritical=true OR minimalReady=true + // after the patience window elapsed). Application code uses this to + // surface a "loading X of Y resource types" indicator. + SyncProgress func(synced, total int, minimalReady bool) + // DeferredSyncTimeout caps how long we wait for deferred informers to // finish syncing before giving up. When the deadline fires, deferredDone // is closed and deferredFailed is set so any still-unsynced types return diff --git a/web/src/components/home/HomeView.tsx b/web/src/components/home/HomeView.tsx index 7da0f5928..c43c5f811 100644 --- a/web/src/components/home/HomeView.tsx +++ b/web/src/components/home/HomeView.tsx @@ -11,7 +11,7 @@ import { NetworkPolicyCoverageCard } from './NetworkPolicyCoverageCard' import { CostCard } from './CostCard' import { AuditCard, PaneLoader, StatusDot, mapHealthToTone } from '@skyhook-io/k8s-ui' import { ClusterHealthCard } from './ClusterHealthCard' -import { AlertTriangle, Shield } from 'lucide-react' +import { AlertTriangle, Loader2, Shield } from 'lucide-react' import { clsx } from 'clsx' interface HomeViewProps { @@ -58,9 +58,21 @@ export function HomeView({ namespaces, topology, onNavigateToView, onNavigateToR const hasProblems = data.problems && data.problems.length > 0 + const stillLoading = data.deferredLoading || (data.partialData && data.partialData.length > 0) + return (
+      {stillLoading && (
+        <div className="…">
+          <Loader2 className="…" />
+          <span>
+            {data.partialData && data.partialData.length > 0
+              ? `Still loading: ${data.partialData.join(', ')}`
+              : 'Loading remaining resources…'}
+          </span>
+        </div>
+ )} {/* Row 1: Cluster Health Card (combined health + resource counts) */} Date: Mon, 27 Apr 2026 02:10:10 +0300 Subject: [PATCH 2/4] review fixes: minimal-set typo guard, live partial-data, comment cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Guard MinimalSet against typos / unmatched keys: log a warning at cache construction listing keys that don't correspond to any enabled critical informer, plus a separate warning when the effective minimal set is empty (would otherwise return at PatienceWindow with nothing meaningfully gated). - Add PendingPromotedKinds() — live-filtered view of promoted-at-first- paint informers that still aren't synced. Dashboard banner switches to this so the "still loading" pill drains as kinds arrive instead of showing the snapshot from connect time. - After patience elapses, log distinguishes "first-paint blocked" (only minimal-set kinds matter) from generic "critical sync progress" so operators can see the actual blocker. - New tests: legacy SyncTimeout promotion, MinimalSet typo, PatienceWindow without MinimalSet, PendingPromotedKinds live-filter contract. - Comment cleanup: drop commit-flavored "(legacy)/(new)" parentheticals, fix SyncProgress godoc parameter names, trim restating-what comments. --- internal/k8s/cache.go | 25 +++--- internal/server/dashboard.go | 10 ++- pkg/k8score/cache.go | 112 +++++++++++++++++++------- pkg/k8score/cache_test.go | 151 +++++++++++++++++++++++++++++++++++ pkg/k8score/types.go | 19 +++-- web/src/api/client.ts | 2 +- 6 files changed, 261 insertions(+), 58 deletions(-) diff --git a/internal/k8s/cache.go b/internal/k8s/cache.go index 314389a2f..0cc7aa22a 100644 --- a/internal/k8s/cache.go +++ b/internal/k8s/cache.go @@ -65,16 +65,12 @@ var deferredResources = map[string]bool{ "limitranges": true, // audit inheritance lookups, not first-render } -// minimalFirstPaintSet is the irreducible subset of critical informers the -// home dashboard needs to feel coherent. Once these are synced (and the -// patience window has elapsed), the cache returns and slower critical -// informers (ingresses, jobs, cronjobs, etc.) continue loading in the -// background, joining the topology when they arrive. -// -// Pods are deliberately included even though they are typically the -// largest kind — without pods the topology and resource counts are -// essentially empty. The patience window absorbs pod-sync latency on -// most clusters; only when it doesn't do we render with what we have. +// minimalFirstPaintSet is the subset of critical informers the home +// dashboard needs to feel coherent. Pods are included despite being +// typically the largest kind — without pods the topology graph and +// resource counts are empty. The patience window absorbs pod-sync +// latency on healthy clusters; on slow ones, the user sees a working +// home view sooner with a "still loading" hint for the rest. var minimalFirstPaintSet = map[string]bool{ "pods": true, "namespaces": true, @@ -293,11 +289,10 @@ func recordK8sEventToTimeline(obj any) { // emitSyncProgress is the SyncProgress callback wired into the resource // cache. It keeps the connection's progressMessage in step with the live -// informer-sync count so the connecting screen shows "Loading cluster -// data… X of Y resource types ready" instead of a frozen "Loading -// workloads…". 
After first paint (minimalReady=true), the cache returns -// and the connection state flips to "connected" — further progress lives -// in the deferred-loading indicator on the home dashboard. +// informer-sync count so the connecting screen ticks up instead of +// sitting on a static message during a 30–60s sync. Once the cache +// returns and connection state flips to "connected", further progress +// lives in the home dashboard's deferred-loading indicator. func emitSyncProgress(synced, total int, minimalReady bool) { if total == 0 { return diff --git a/internal/server/dashboard.go b/internal/server/dashboard.go index 2e3da8618..924496368 100644 --- a/internal/server/dashboard.go +++ b/internal/server/dashboard.go @@ -249,10 +249,12 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) { // may be incomplete because deferred informers are still syncing. resp.DeferredLoading = !cache.IsDeferredSynced() - // If critical sync timed out, tell the frontend which resource kinds - // may be missing so it can show a banner. - if promoted := cache.PromotedKinds(); len(promoted) > 0 && resp.DeferredLoading { - resp.PartialData = promoted + // If critical informers were promoted at first paint, tell the + // frontend which kinds are STILL loading (live-filtered, not the + // snapshot from connect time) so the banner doesn't list kinds that + // have since populated. + if pending := cache.PendingPromotedKinds(); len(pending) > 0 { + resp.PartialData = pending } // --- Slow network calls: run in parallel --- diff --git a/pkg/k8score/cache.go b/pkg/k8score/cache.go index 309d6db11..2753641f9 100644 --- a/pkg/k8score/cache.go +++ b/pkg/k8score/cache.go @@ -85,10 +85,10 @@ type informerSetup struct { // NewResourceCache creates and starts a ResourceCache from the given config. // Startup has three tiers: -// - Critical informers block until synced. If both PatienceWindow and -// MinimalSet are set, the cache returns as soon as the minimal set is -// ready *after* the patience window elapses (rest of critical promoted -// to deferred). Otherwise, falls back to the legacy SyncTimeout path. +// - Critical informers block until synced. With PatienceWindow + MinimalSet, +// the cache returns as soon as the minimal set is ready *after* the +// patience window elapses (rest of critical promoted to deferred). With +// SyncTimeout, the cache returns at most after the timeout. // - Deferred informers sync in the background after critical completes. // - Background informers (e.g. Events) sync independently on their own goroutine. func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { @@ -279,13 +279,13 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { // Phase 1: Wait for critical informers. // // Two modes: - // - Patience+MinimalSet (preferred): wait for ALL critical until the - // patience window elapses. Once elapsed, return as soon as the - // minimal set is ready; promote the rest of critical to deferred. - // Keep waiting indefinitely until MinimalSet is satisfied — there - // is no hard timeout, the loading UI handles the wait. - // - SyncTimeout (legacy): wait for ALL critical, hard-cap at the - // timeout, promote everything still pending. + // - Patience+MinimalSet: wait for ALL critical until the patience + // window elapses. Once elapsed, return as soon as the minimal set + // is ready; promote the rest of critical to deferred. 
Keep waiting + // indefinitely until MinimalSet is satisfied — there is no hard + // timeout, the loading UI handles the wait. + // - SyncTimeout: wait for ALL critical, hard-cap at the timeout, + // promote everything still pending. useMinimalSet := cfg.PatienceWindow > 0 && len(cfg.MinimalSet) > 0 timedOut := false patienceElapsed := false @@ -295,7 +295,11 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { // types or unknown keys don't gate first paint). var minimalEntries []informerEntry if useMinimalSet { + knownCritical := make(map[string]bool, len(allEntries)) for _, e := range allEntries { + if !e.deferred { + knownCritical[e.key] = true + } if e.deferred { continue } @@ -303,6 +307,23 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { minimalEntries = append(minimalEntries, e) } } + // Validate MinimalSet keys: typos or kinds not enabled produce a + // silently-empty minimalEntries → cache returns ~PatienceWindow + // later with nothing meaningful synced. Surface that loud. + var unknown []string + for k := range cfg.MinimalSet { + if !knownCritical[k] { + unknown = append(unknown, k) + } + } + if len(unknown) > 0 { + sort.Strings(unknown) + stdlog.Printf("WARNING: MinimalSet keys not registered as critical informers (typo or RBAC-denied?): %s", + strings.Join(unknown, ", ")) + } + if len(minimalEntries) == 0 { + stdlog.Printf("WARNING: MinimalSet matched no enabled critical informers; first paint will fire as soon as PatienceWindow elapses regardless of sync state") + } } progressTicker := time.NewTicker(1 * time.Second) @@ -352,11 +373,15 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { return true } - // Initial progress emission so UI shows "0/N" right away. emitProgress(false) + // Set of minimal-set keys for the post-patience log discrimination. + minimalKindSet := make(map[string]bool, len(minimalEntries)) + for _, e := range minimalEntries { + minimalKindSet[e.kind] = true + } + for { - // Check if all critical informers are synced — preferred exit. allSynced := true for _, fn := range criticalSyncFuncs { if !fn() { @@ -368,8 +393,6 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { break } - // Once the patience window has elapsed, exit as soon as the - // minimal set is ready. Until then, keep waiting for everything. if patienceElapsed && useMinimalSet && minimalReady() { break } @@ -379,15 +402,13 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { return nil, fmt.Errorf("failed to sync critical resource caches") case <-patienceCh: patienceElapsed = true - patienceCh = nil // disable further receives on closed timer + patienceCh = nil // disable further receives on the drained timer channel case <-deadlineCh: timedOut = true case <-progressTicker.C: - // Log + emit progress callback. Log every tick; the application - // callback decides whether to surface anything (e.g. via SSE). 
counts := rc.GetKindObjectCounts() rc.informerMu.RLock() - var synced, pendingParts []string + var synced, pendingParts, minimalPending []string for _, s := range rc.informerStatuses { if s.Deferred { continue @@ -397,12 +418,23 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { } else { n := counts[s.Kind] pendingParts = append(pendingParts, fmt.Sprintf("%s(%d)", s.Kind, n)) + if minimalKindSet[s.Kind] { + minimalPending = append(minimalPending, s.Kind) + } } } rc.informerMu.RUnlock() if len(pendingParts) > 0 { - stdlog.Printf("Critical sync progress: %d/%d synced (%.0fs elapsed) — pending: %s", - len(synced), len(synced)+len(pendingParts), time.Since(syncStart).Seconds(), strings.Join(pendingParts, ", ")) + // After patience elapses, the actual blocker for first paint + // is the minimal-set subset, not all of critical — surface it + // distinctly so operators know which kind is holding render. + if patienceElapsed && len(minimalPending) > 0 { + stdlog.Printf("First-paint blocked: %d/%d minimal-set kinds still syncing (%.0fs elapsed) — pending: %s", + len(minimalPending), len(minimalEntries), time.Since(syncStart).Seconds(), strings.Join(minimalPending, ", ")) + } else { + stdlog.Printf("Critical sync progress: %d/%d synced (%.0fs elapsed) — pending: %s", + len(synced), len(synced)+len(pendingParts), time.Since(syncStart).Seconds(), strings.Join(pendingParts, ", ")) + } } emitProgress(useMinimalSet && patienceElapsed && minimalReady()) default: @@ -414,10 +446,8 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { } } - // Promotion: any critical informer not yet synced gets reclassified as - // deferred so the cache can return. Applies to both the SyncTimeout - // (legacy) path and the MinimalSet (new) path. When everything synced - // in time, this is a no-op. + // Reclassify any critical informer still pending as deferred so the + // cache can return. No-op on the all-synced path. var promoted []string rc.informerMu.Lock() for i, e := range allEntries { @@ -450,8 +480,6 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { stdlog.Printf("Critical resource caches synced in %v — UI can render", time.Since(syncStart)) } - // Final progress emission so the UI's "X of Y ready" indicator settles - // to the correct state immediately upon return. if cfg.SyncProgress != nil { rc.informerMu.RLock() var synced, total int @@ -867,7 +895,10 @@ func (rc *ResourceCache) ChangesRaw() chan ResourceChange { } // PromotedKinds returns the list of resource kinds that were promoted from -// critical to deferred due to SyncTimeout. Empty if sync completed normally. +// critical to deferred at first paint. Empty if sync completed normally. +// This is a snapshot from cache construction and does NOT shrink as +// promoted informers eventually sync — use PendingPromotedKinds for the +// live "still loading" view (e.g. for UI banners). func (rc *ResourceCache) PromotedKinds() []string { if rc == nil { return nil @@ -875,6 +906,31 @@ func (rc *ResourceCache) PromotedKinds() []string { return rc.promotedKinds } +// PendingPromotedKinds returns the subset of PromotedKinds whose informers +// have not yet completed their initial sync. Drains to empty as the +// background informers finish, so a UI bound to this method shows a +// truthful "still loading" indicator. 
+func (rc *ResourceCache) PendingPromotedKinds() []string { + if rc == nil || len(rc.promotedKinds) == 0 { + return nil + } + rc.informerMu.RLock() + defer rc.informerMu.RUnlock() + syncedByKind := make(map[string]bool, len(rc.informerStatuses)) + for _, s := range rc.informerStatuses { + if s.Synced { + syncedByKind[s.Kind] = true + } + } + var pending []string + for _, k := range rc.promotedKinds { + if !syncedByKind[k] { + pending = append(pending, k) + } + } + return pending +} + // IsSyncComplete returns true after the initial critical informer sync. func (rc *ResourceCache) IsSyncComplete() bool { if rc == nil { diff --git a/pkg/k8score/cache_test.go b/pkg/k8score/cache_test.go index 0699da2ca..75bfaa322 100644 --- a/pkg/k8score/cache_test.go +++ b/pkg/k8score/cache_test.go @@ -288,6 +288,157 @@ func TestNewResourceCache_MinimalSet_Promotion(t *testing.T) { } } +// TestNewResourceCache_SyncTimeout_Promotion covers the legacy hard-cap +// path used by skyhook-connector. Without PatienceWindow/MinimalSet, a +// stuck critical informer must promote at SyncTimeout and the cache +// must return. +func TestNewResourceCache_SyncTimeout_Promotion(t *testing.T) { + client := fake.NewSimpleClientset() + client.PrependReactor("list", "ingresses", func(action k8stesting.Action) (bool, runtime.Object, error) { + return true, nil, fmt.Errorf("simulated stuck API") + }) + + start := time.Now() + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + Ingresses: true, + }, + SyncTimeout: 200 * time.Millisecond, + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + elapsed := time.Since(start) + if elapsed < 200*time.Millisecond { + t.Errorf("expected return after SyncTimeout (200ms), got %v", elapsed) + } + if elapsed > 3*time.Second { + t.Errorf("returned far later than SyncTimeout — promotion didn't fire? elapsed=%v", elapsed) + } + + promoted := rc.PromotedKinds() + if len(promoted) != 1 || promoted[0] != "Ingress" { + t.Errorf("expected Ingress to be promoted, got %v", promoted) + } +} + +// TestNewResourceCache_MinimalSet_UnknownKey verifies the validation +// log path: a typo or kind not enabled in ResourceTypes results in an +// empty effective minimal set; the cache returns at PatienceWindow with +// nothing meaningfully gating first paint. This is intentionally +// permissive (we don't fail construction) but the operator must see a +// warning. We don't capture log output here — just verify the cache +// still returns and shape is consistent. +func TestNewResourceCache_MinimalSet_UnknownKey(t *testing.T) { + client := fake.NewSimpleClientset() + + start := time.Now() + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + }, + PatienceWindow: 200 * time.Millisecond, + MinimalSet: map[string]bool{ + "pod": true, // typo — should be "pods" + }, + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + // Pods syncs instantly on fake client, so allCritical fires before + // patience even elapses — promoted should be empty. 
+ if got := rc.PromotedKinds(); len(got) != 0 { + t.Errorf("expected no promoted kinds (everything synced), got %v", got) + } + if rc.Pods() == nil { + t.Error("expected Pods() lister to be available") + } + // Sanity: returned in reasonable time despite the bogus minimal-set key + if elapsed := time.Since(start); elapsed > 3*time.Second { + t.Errorf("returned far too late with bogus MinimalSet key: %v", elapsed) + } +} + +// TestNewResourceCache_PatienceWindow_WithoutMinimalSet verifies the +// edge case where only PatienceWindow is set: useMinimalSet is false, +// so behavior degrades to "wait indefinitely for all critical" with no +// hard cap. With a fake client that syncs instantly, this should just +// return on the all-synced path. +func TestNewResourceCache_PatienceWindow_WithoutMinimalSet(t *testing.T) { + client := fake.NewSimpleClientset() + + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + }, + PatienceWindow: 100 * time.Millisecond, + // MinimalSet intentionally nil + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + if got := rc.PromotedKinds(); len(got) != 0 { + t.Errorf("expected no promotion without MinimalSet, got %v", got) + } + if rc.Pods() == nil { + t.Error("expected Pods() lister to be available") + } +} + +// TestPendingPromotedKinds verifies the live-filtered accessor that the +// dashboard banner relies on: starts equal to PromotedKinds, and shrinks +// as informers eventually sync. +func TestPendingPromotedKinds(t *testing.T) { + client := fake.NewSimpleClientset() + client.PrependReactor("list", "ingresses", func(action k8stesting.Action) (bool, runtime.Object, error) { + return true, nil, fmt.Errorf("simulated slow API") + }) + + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + Services: true, + Ingresses: true, + }, + PatienceWindow: 200 * time.Millisecond, + MinimalSet: map[string]bool{ + Pods: true, + Services: true, + }, + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + // At first paint, Ingress is both Promoted and Pending. + if got := rc.PromotedKinds(); len(got) != 1 || got[0] != "Ingress" { + t.Fatalf("PromotedKinds: expected [Ingress], got %v", got) + } + if got := rc.PendingPromotedKinds(); len(got) != 1 || got[0] != "Ingress" { + t.Errorf("PendingPromotedKinds: expected [Ingress], got %v", got) + } + // PromotedKinds is a stable historical snapshot, PendingPromotedKinds + // is the live view — verify they are distinct concepts and don't + // share backing storage. + promoted := rc.PromotedKinds() + pending := rc.PendingPromotedKinds() + if len(promoted) > 0 && len(pending) > 0 && &promoted[0] == &pending[0] { + t.Error("PromotedKinds and PendingPromotedKinds must not share backing array") + } +} + func TestNewResourceCache_Callbacks(t *testing.T) { // Pre-create a pod so the informer fires an add event pod := &corev1.Pod{ diff --git a/pkg/k8score/types.go b/pkg/k8score/types.go index 154c91787..853ef8f30 100644 --- a/pkg/k8score/types.go +++ b/pkg/k8score/types.go @@ -141,12 +141,10 @@ type CacheConfig struct { // In connector mode this is true. SuppressInitialAdds bool - // SyncTimeout is the maximum time to wait for critical informers to sync - // before proceeding with partial data. Unsynced critical informers are - // promoted to deferred and continue syncing in the background. - // Zero means wait indefinitely (original behavior). 
Prefer PatienceWindow - // + MinimalSet for application-driven first-paint, this remains for - // backwards compat (skyhook-connector still uses it). + // SyncTimeout is a hard cap on the critical-informer wait. When it + // fires, unsynced critical informers are promoted to deferred and the + // cache returns. Used when PatienceWindow is zero. Zero means wait + // indefinitely. SyncTimeout time.Duration // PatienceWindow is the soft deadline after which the cache returns as @@ -169,10 +167,11 @@ type CacheConfig struct { MinimalSet map[string]bool // SyncProgress is invoked roughly every second during the critical - // sync phase with the current progress. It is also invoked once when - // first paint becomes ready (allCritical=true OR minimalReady=true - // after the patience window elapsed). Application code uses this to - // surface a "loading X of Y resource types" indicator. + // sync phase, and once more when first paint is ready. `synced` and + // `total` count critical informers; `minimalReady` is true when the + // minimal set is satisfied AND the patience window has elapsed (i.e. + // the cache is about to return on the partial-paint path). + // Application code uses this to drive a "loading X of Y" indicator. SyncProgress func(synced, total int, minimalReady bool) // DeferredSyncTimeout caps how long we wait for deferred informers to diff --git a/web/src/api/client.ts b/web/src/api/client.ts index ebff45692..31c6afd1a 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -272,7 +272,7 @@ export interface DashboardResponse { audit: DashboardAudit | null nodeVersionSkew: { versions: Record; minVersion: string; maxVersion: string } | null deferredLoading?: boolean // True while deferred informers (secrets, events, etc.) are still syncing - partialData?: string[] // Resource kinds still loading after first paint (slow-cluster fallback) + partialData?: string[] // Critical kinds promoted at first paint that haven't yet finished syncing (live-filtered) accessRestricted?: boolean // True when user has no namespace access (RBAC) } From f1d53e6d5bb126327a5a735c3fb311882cbabb3b Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 3 May 2026 01:55:01 +0300 Subject: [PATCH 3/4] review followups: 5min sync backstop + map clone + dedup helper + drain test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three review items from the diagnostics PR (#603) and self-review of #544: - SyncTimeout now applies on the patience+minimal-set path too, as a hard backstop. Without it, a permanently-stuck minimal-set informer (e.g. severely throttled API server, the #598 scenario) would trap the caller on the connecting screen forever. New default in Radar is 5 min — generous enough that honest-slow clusters complete naturally, short enough that a broken cluster doesn't make Radar feel hung. When the backstop fires the existing timedOut path handles promotion, so the user falls back to today's experience (zeros + "Still loading" hint) instead of an indefinite wait. - Clone CacheConfig.MinimalSet defensively, matching the pattern for ResourceTypes / DeferredTypes. No behavior change for current callers (Radar passes a top-level var) but consistent. - New test TestNewResourceCache_MinimalSet_BackstopFires covers the worst case: a kind in the minimal set never syncs, SyncTimeout fires, cache returns with that kind promoted. 
- New test TestPendingPromotedKinds_Drains exercises the live-filtering claim — flips a failing reactor to success and verifies the pending list shrinks. - DiagnosticsOverlay: extract getPendingInformers() helper. The Set+ filter+sort logic was duplicated verbatim between PendingInformers (UI) and formatForGitHub (markdown export); future contract changes could silently desync the two. --- internal/k8s/cache.go | 10 ++ pkg/k8score/cache.go | 15 +-- pkg/k8score/cache_test.go | 116 +++++++++++++++++++ pkg/k8score/types.go | 5 +- web/src/components/ui/DiagnosticsOverlay.tsx | 24 ++-- 5 files changed, 149 insertions(+), 21 deletions(-) diff --git a/internal/k8s/cache.go b/internal/k8s/cache.go index 0cc7aa22a..e345f643f 100644 --- a/internal/k8s/cache.go +++ b/internal/k8s/cache.go @@ -86,6 +86,15 @@ var minimalFirstPaintSet = map[string]bool{ // gate and render with whatever is ready then. const firstPaintPatience = 8 * time.Second +// firstPaintBackstop is the hard upper bound on the critical-sync wait. +// If the minimal set still hasn't synced after this long, give up and +// render with whatever's available — the user gets the same partial-data +// experience they'd see today (zeros + "Still loading: …" hint) instead +// of being trapped on the connecting screen indefinitely. Picked to be +// much longer than a healthy cluster's sync time but short enough that +// a permanently-throttled API server doesn't make Radar feel broken. +const firstPaintBackstop = 5 * time.Minute + // ResourceChange is a type alias for the canonical definition in pkg/k8score. type ResourceChange = k8score.ResourceChange @@ -162,6 +171,7 @@ func InitResourceCache(ctx context.Context) error { TimingLogger: logTiming, PatienceWindow: firstPaintPatience, MinimalSet: minimalFirstPaintSet, + SyncTimeout: firstPaintBackstop, SyncProgress: emitSyncProgress, DeferredSyncTimeout: 3 * time.Minute, diff --git a/pkg/k8score/cache.go b/pkg/k8score/cache.go index 2753641f9..60692ea69 100644 --- a/pkg/k8score/cache.go +++ b/pkg/k8score/cache.go @@ -117,6 +117,7 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { // Clone caller-owned maps to prevent mutation after construction. cfg.ResourceTypes = maps.Clone(cfg.ResourceTypes) cfg.DeferredTypes = maps.Clone(cfg.DeferredTypes) + cfg.MinimalSet = maps.Clone(cfg.MinimalSet) stopCh := make(chan struct{}) changes := make(chan ResourceChange, channelSize) @@ -278,14 +279,14 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { // Phase 1: Wait for critical informers. // - // Two modes: + // Two modes (combinable): // - Patience+MinimalSet: wait for ALL critical until the patience // window elapses. Once elapsed, return as soon as the minimal set - // is ready; promote the rest of critical to deferred. Keep waiting - // indefinitely until MinimalSet is satisfied — there is no hard - // timeout, the loading UI handles the wait. - // - SyncTimeout: wait for ALL critical, hard-cap at the timeout, - // promote everything still pending. + // is ready; promote the rest of critical to deferred. + // - SyncTimeout: hard upper bound. When it fires, promote everything + // still pending and return. Acts as a backstop on the minimal-set + // path so a permanently-stuck informer doesn't trap the caller + // forever (caller still gets to render with whatever synced). 
useMinimalSet := cfg.PatienceWindow > 0 && len(cfg.MinimalSet) > 0 timedOut := false patienceElapsed := false @@ -337,7 +338,7 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { } var deadlineCh <-chan time.Time - if !useMinimalSet && cfg.SyncTimeout > 0 { + if cfg.SyncTimeout > 0 { deadline := time.NewTimer(cfg.SyncTimeout) defer deadline.Stop() deadlineCh = deadline.C diff --git a/pkg/k8score/cache_test.go b/pkg/k8score/cache_test.go index 75bfaa322..edb4b2e3f 100644 --- a/pkg/k8score/cache_test.go +++ b/pkg/k8score/cache_test.go @@ -3,6 +3,7 @@ package k8score import ( "fmt" "sync" + "sync/atomic" "testing" "time" @@ -439,6 +440,121 @@ func TestPendingPromotedKinds(t *testing.T) { } } +// TestPendingPromotedKinds_Drains verifies the live-filtering claim: as +// a promoted informer eventually catches up and reports HasSynced=true, +// it leaves PendingPromotedKinds. PromotedKinds (the snapshot) does not +// change. Without this, a UI banner would list kinds forever even after +// they finished loading. +func TestPendingPromotedKinds_Drains(t *testing.T) { + client := fake.NewSimpleClientset() + + // Toggle: fail Ingress LIST until we flip the flag, then succeed. + // The reflector retries on backoff so HasSynced flips when LIST + // stops failing. + var failIngress atomic.Bool + failIngress.Store(true) + client.PrependReactor("list", "ingresses", func(action k8stesting.Action) (bool, runtime.Object, error) { + if failIngress.Load() { + return true, nil, fmt.Errorf("simulated transient failure") + } + return false, nil, nil // pass through to default tracker + }) + + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + Services: true, + Ingresses: true, + }, + PatienceWindow: 200 * time.Millisecond, + MinimalSet: map[string]bool{ + Pods: true, + Services: true, + }, + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + // Sanity: at construction, Ingress is pending. + if got := rc.PendingPromotedKinds(); len(got) != 1 || got[0] != "Ingress" { + t.Fatalf("PendingPromotedKinds at start: expected [Ingress], got %v", got) + } + + // Flip the reactor to succeed, then poll for the live view to drain. + failIngress.Store(false) + + deadline := time.Now().Add(15 * time.Second) + for time.Now().Before(deadline) { + if len(rc.PendingPromotedKinds()) == 0 { + break + } + time.Sleep(50 * time.Millisecond) + } + if got := rc.PendingPromotedKinds(); len(got) != 0 { + t.Errorf("PendingPromotedKinds did not drain after Ingress LIST began succeeding; still pending: %v", got) + } + + // PromotedKinds is the snapshot — must NOT shrink. + if got := rc.PromotedKinds(); len(got) != 1 || got[0] != "Ingress" { + t.Errorf("PromotedKinds (snapshot) should not shrink; expected [Ingress], got %v", got) + } +} + +// TestNewResourceCache_MinimalSet_BackstopFires covers the worst case on +// the patience+minimal-set path: a kind that's IN the minimal set never +// syncs. Without a backstop the cache would block in NewResourceCache +// forever, trapping the caller on a connecting screen. SyncTimeout is +// the safety net — it must promote everything still pending (including +// minimal-set members) and let the cache return. +func TestNewResourceCache_MinimalSet_BackstopFires(t *testing.T) { + client := fake.NewSimpleClientset() + // Pods is in MinimalSet — make its LIST fail forever. 
+ client.PrependReactor("list", "pods", func(action k8stesting.Action) (bool, runtime.Object, error) { + return true, nil, fmt.Errorf("simulated permanently-stuck API") + }) + + start := time.Now() + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + Services: true, + }, + PatienceWindow: 100 * time.Millisecond, + MinimalSet: map[string]bool{ + Pods: true, + Services: true, + }, + SyncTimeout: 500 * time.Millisecond, + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + elapsed := time.Since(start) + if elapsed < 500*time.Millisecond { + t.Errorf("expected return after at least SyncTimeout (500ms), got %v", elapsed) + } + if elapsed > 3*time.Second { + t.Errorf("returned far later than SyncTimeout — backstop didn't fire? elapsed=%v", elapsed) + } + + // Pods stuck → must be promoted; Services synced → must not be promoted. + promoted := rc.PromotedKinds() + if len(promoted) != 1 || promoted[0] != "Pod" { + t.Errorf("expected only Pod to be promoted by backstop, got %v", promoted) + } + + // Lister still available (it's just empty). + if rc.Pods() == nil { + t.Error("expected Pods() lister to be available even after backstop promotion") + } +} + func TestNewResourceCache_Callbacks(t *testing.T) { // Pre-create a pod so the informer fires an add event pod := &corev1.Pod{ diff --git a/pkg/k8score/types.go b/pkg/k8score/types.go index 853ef8f30..4f37ecdc0 100644 --- a/pkg/k8score/types.go +++ b/pkg/k8score/types.go @@ -143,8 +143,9 @@ type CacheConfig struct { // SyncTimeout is a hard cap on the critical-informer wait. When it // fires, unsynced critical informers are promoted to deferred and the - // cache returns. Used when PatienceWindow is zero. Zero means wait - // indefinitely. + // cache returns. Always applies, including when PatienceWindow is set + // — in that case it acts as a backstop so a permanently-stuck informer + // doesn't trap the caller forever. Zero means wait indefinitely. SyncTimeout time.Duration // PatienceWindow is the soft deadline after which the cache returns as diff --git a/web/src/components/ui/DiagnosticsOverlay.tsx b/web/src/components/ui/DiagnosticsOverlay.tsx index 088059751..7f7bf49fb 100644 --- a/web/src/components/ui/DiagnosticsOverlay.tsx +++ b/web/src/components/ui/DiagnosticsOverlay.tsx @@ -358,13 +358,8 @@ function InformersSection({ data }: { data: DiagnosticsSnapshot }) { } function PendingInformers({ sync }: { sync: DiagCacheSyncStatus }) { - const pendingNames = new Set([ - ...(sync.pendingCritical ?? []), - ...(sync.pendingDeferred ?? []), - ]) - const pending = sync.informers.filter((i) => pendingNames.has(i.kind)) + const pending = getPendingInformers(sync) if (pending.length === 0) return null - pending.sort((a, b) => Number(a.deferred) - Number(b.deferred) || a.kind.localeCompare(b.kind)) return (
Pending Informers ({pending.length}) @@ -380,6 +375,16 @@ function PendingInformers({ sync }: { sync: DiagCacheSyncStatus }) { ) } +function getPendingInformers(sync: DiagCacheSyncStatus): DiagInformerSyncStatus[] { + const pendingNames = new Set([ + ...(sync.pendingCritical ?? []), + ...(sync.pendingDeferred ?? []), + ]) + return sync.informers + .filter((i) => pendingNames.has(i.kind)) + .sort((a, b) => Number(a.deferred) - Number(b.deferred) || a.kind.localeCompare(b.kind)) +} + function formatSyncPhase(phase: DiagSyncPhase): string { switch (phase) { case 'not_started': return 'not started' @@ -592,13 +597,8 @@ function formatForGitHub(data: DiagnosticsSnapshot, includeRawJson = true): stri if (sync.promotedKinds && sync.promotedKinds.length > 0) { lines.push(`- **Promoted to Deferred:** ${sync.promotedKinds.join(', ')}`) } - const pendingNames = new Set([ - ...(sync.pendingCritical ?? []), - ...(sync.pendingDeferred ?? []), - ]) - const pending = sync.informers.filter((i) => pendingNames.has(i.kind)) + const pending = getPendingInformers(sync) if (pending.length > 0) { - pending.sort((a, b) => Number(a.deferred) - Number(b.deferred) || a.kind.localeCompare(b.kind)) const parts = pending.map((i) => `${i.kind}(${i.deferred ? 'deferred' : 'critical'},${i.items.toLocaleString()} items)`) lines.push(`- **Pending:** ${parts.join(', ')}`) } From f31b5413d7c6eee9e7faaf6972cb8486fdeafd86 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 3 May 2026 02:18:07 +0300 Subject: [PATCH 4/4] fix(cache): final SyncProgress emit uses HasSynced, not lagging Synced flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Phase 1 loop exits on direct HasSynced() calls but the final SyncProgress emit was counting from informerStatuses[].Synced — which is set asynchronously by per-informer tracking goroutines polling at 10ms. On the all-synced happy path there's a sub-100ms window where the loop has exited but the trackers haven't caught up, so synced < total here even though everything is actually synced. Combined with hardcoded minimalReady=true, Radar's connecting screen would briefly show 'Loading cluster data… 9 of 10 ready (showing partial)' before transitioning to connected, instead of 'Finalizing…'. Read informerStatuses[].Deferred under lock (post-promotion classification) but use e.synced() (HasSynced) for the actual sync check — same source the loop used, so the count is consistent with the exit condition. --- pkg/k8score/cache.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pkg/k8score/cache.go b/pkg/k8score/cache.go index 60692ea69..dae4f2e1b 100644 --- a/pkg/k8score/cache.go +++ b/pkg/k8score/cache.go @@ -482,14 +482,20 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { } if cfg.SyncProgress != nil { + // Count via e.synced() (same source as the Phase 1 loop) rather than + // informerStatuses[].Synced, which is set asynchronously by per-informer + // tracking goroutines and can lag by ~10ms after HasSynced() flips. On + // the all-synced happy path the lag would otherwise produce synced