From 78e965cfc4b56c021ce8032819bd35d39fb6195d Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 27 Apr 2026 01:04:00 +0300 Subject: [PATCH 1/4] cache: patience-window + minimal-set first paint, no hard timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the 60s blanket SyncTimeout for the connecting screen with a softer two-stage gate: 1. Patience window (8s) — wait for ALL critical informers. On the common path (fast cluster) nothing else triggers; first paint is complete and coherent. 2. Minimal-set fallback — once the patience window elapses, return as soon as pods/namespaces/nodes/services/deployments are synced. Any other critical informer still loading (ingresses, jobs, etc.) is promoted to deferred and joins the topology when ready. No hard timeout: deferred informers retry indefinitely. The user-facing "give up" semantic that the 60s cap implied was always misleading — informers stayed subscribed regardless. The patience window is a "render with what you have" boundary, not a failure cliff. Cache progress is wired into the connection's progressMessage so the connecting screen shows "Loading cluster data… X of Y ready" instead of a static "Loading workloads…". After first paint, slower informers arrive coalesced into 5s windows (was 3s) to keep the topology graph from jittering, and the home view shows a small inline indicator listing the kinds still loading. Mechanism lives in pkg/k8score (CacheConfig.PatienceWindow, CacheConfig.MinimalSet, CacheConfig.SyncProgress); skyhook-connector's legacy SyncTimeout path is unchanged. --- internal/k8s/cache.go | 57 +++++++++- internal/server/sse.go | 12 +- pkg/k8score/cache.go | 164 ++++++++++++++++++++++----- pkg/k8score/cache_test.go | 112 ++++++++++++++++++ pkg/k8score/types.go | 30 ++++- web/src/components/home/HomeView.tsx | 14 ++- 6 files changed, 353 insertions(+), 36 deletions(-) diff --git a/internal/k8s/cache.go b/internal/k8s/cache.go index 18186f391..314389a2f 100644 --- a/internal/k8s/cache.go +++ b/internal/k8s/cache.go @@ -65,6 +65,31 @@ var deferredResources = map[string]bool{ "limitranges": true, // audit inheritance lookups, not first-render } +// minimalFirstPaintSet is the irreducible subset of critical informers the +// home dashboard needs to feel coherent. Once these are synced (and the +// patience window has elapsed), the cache returns and slower critical +// informers (ingresses, jobs, cronjobs, etc.) continue loading in the +// background, joining the topology when they arrive. +// +// Pods are deliberately included even though they are typically the +// largest kind — without pods the topology and resource counts are +// essentially empty. The patience window absorbs pod-sync latency on +// most clusters; only when it doesn't do we render with what we have. +var minimalFirstPaintSet = map[string]bool{ + "pods": true, + "namespaces": true, + "nodes": true, + "services": true, + "deployments": true, +} + +// firstPaintPatience is how long we wait for ALL critical informers before +// falling back to the minimal set. On most clusters the full critical set +// syncs well inside this window, so first paint is complete and there is +// no progressive fill-in. Slow clusters fall through to the minimal-set +// gate and render with whatever is ready then. +const firstPaintPatience = 8 * time.Second + // ResourceChange is a type alias for the canonical definition in pkg/k8score. 
type ResourceChange = k8score.ResourceChange @@ -139,7 +164,9 @@ func InitResourceCache(ctx context.Context) error { Namespace: permResult.Namespace, DebugEvents: DebugEvents, TimingLogger: logTiming, - SyncTimeout: 60 * time.Second, + PatienceWindow: firstPaintPatience, + MinimalSet: minimalFirstPaintSet, + SyncProgress: emitSyncProgress, DeferredSyncTimeout: 3 * time.Minute, OnReceived: func(kind string) { @@ -264,6 +291,34 @@ func recordK8sEventToTimeline(obj any) { } } +// emitSyncProgress is the SyncProgress callback wired into the resource +// cache. It keeps the connection's progressMessage in step with the live +// informer-sync count so the connecting screen shows "Loading cluster +// data… X of Y resource types ready" instead of a frozen "Loading +// workloads…". After first paint (minimalReady=true), the cache returns +// and the connection state flips to "connected" — further progress lives +// in the deferred-loading indicator on the home dashboard. +func emitSyncProgress(synced, total int, minimalReady bool) { + if total == 0 { + return + } + // Only update while we're still in the connecting phase. Once + // connected, the connecting screen is gone and the message is moot. + if GetConnectionStatus().State != StateConnecting { + return + } + var msg string + switch { + case synced == total: + msg = "Finalizing…" + case minimalReady: + msg = fmt.Sprintf("Loading cluster data… %d of %d ready (showing partial)", synced, total) + default: + msg = fmt.Sprintf("Loading cluster data… %d of %d ready", synced, total) + } + UpdateConnectionProgress(msg) +} + // isNoisyResource returns true if this resource generates constant updates that aren't interesting func isNoisyResource(kind, name, op string) bool { if op != "update" { diff --git a/internal/server/sse.go b/internal/server/sse.go index 1be039f30..5ab544e4c 100644 --- a/internal/server/sse.go +++ b/internal/server/sse.go @@ -405,12 +405,16 @@ func (b *SSEBroadcaster) watchResourceChanges() { } // Debounce strategy: - // - During warmup (initial sync + CRD discovery): 3s to avoid constant - // topology rebuilds from dynamic informer syncs. The UI shows a connecting - // spinner anyway, so the delay is invisible. + // - During warmup: critical informers that didn't make the patience + // window (e.g. ingresses, jobs, replicasets) plus deferred informers + // plus dynamic CRD informers all stream in over the next 10–60s. We + // want the topology graph to settle in a few coherent paints, not + // jump on every arrival, so we coalesce into 5s windows. The UI is + // already on the home view by this point with a "loading more" hint; + // the slight delay is preferable to a fidgety graph. // - After warmup: re-evaluate based on cluster size. Large clusters (>5000 // resources) use 5s; smaller clusters use 500ms. - const warmupDebounce = 3 * time.Second + const warmupDebounce = 5 * time.Second debounceDuration := warmupDebounce b.watchMu.Lock() warmupCh := b.warmupDone // local copy under lock; nil-ed after firing to avoid closed-channel spin diff --git a/pkg/k8score/cache.go b/pkg/k8score/cache.go index a7b2dc273..309d6db11 100644 --- a/pkg/k8score/cache.go +++ b/pkg/k8score/cache.go @@ -85,9 +85,12 @@ type informerSetup struct { // NewResourceCache creates and starts a ResourceCache from the given config. // Startup has three tiers: -// - Critical informers block until synced (or SyncTimeout, then promoted to deferred) -// - Deferred informers sync in the background after critical completes -// - Background informers (e.g. 
Events) sync independently on their own goroutine +// - Critical informers block until synced. If both PatienceWindow and +// MinimalSet are set, the cache returns as soon as the minimal set is +// ready *after* the patience window elapses (rest of critical promoted +// to deferred). Otherwise, falls back to the legacy SyncTimeout path. +// - Deferred informers sync in the background after critical completes. +// - Background informers (e.g. Events) sync independently on their own goroutine. func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { if cfg.Client == nil { return nil, fmt.Errorf("CacheConfig.Client must not be nil") @@ -273,22 +276,87 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { }() } - // Phase 1: Wait for critical informers with periodic progress logging. - // Log every 5s so stuck syncs are visible in logs (previously silent). + // Phase 1: Wait for critical informers. + // + // Two modes: + // - Patience+MinimalSet (preferred): wait for ALL critical until the + // patience window elapses. Once elapsed, return as soon as the + // minimal set is ready; promote the rest of critical to deferred. + // Keep waiting indefinitely until MinimalSet is satisfied — there + // is no hard timeout, the loading UI handles the wait. + // - SyncTimeout (legacy): wait for ALL critical, hard-cap at the + // timeout, promote everything still pending. + useMinimalSet := cfg.PatienceWindow > 0 && len(cfg.MinimalSet) > 0 timedOut := false + patienceElapsed := false if len(criticalSyncFuncs) > 0 { - progressTicker := time.NewTicker(5 * time.Second) + // Build the index of "must-sync" entries for the minimal-set check. + // Filter to entries that are critical AND in MinimalSet (deferred + // types or unknown keys don't gate first paint). + var minimalEntries []informerEntry + if useMinimalSet { + for _, e := range allEntries { + if e.deferred { + continue + } + if cfg.MinimalSet[e.key] { + minimalEntries = append(minimalEntries, e) + } + } + } + + progressTicker := time.NewTicker(1 * time.Second) defer progressTicker.Stop() + var patienceCh <-chan time.Time + if useMinimalSet { + patienceTimer := time.NewTimer(cfg.PatienceWindow) + defer patienceTimer.Stop() + patienceCh = patienceTimer.C + } + var deadlineCh <-chan time.Time - if cfg.SyncTimeout > 0 { + if !useMinimalSet && cfg.SyncTimeout > 0 { deadline := time.NewTimer(cfg.SyncTimeout) defer deadline.Stop() deadlineCh = deadline.C } + // emitProgress reports current sync progress to the application + // callback (typically wired to the connection's progressMessage). + emitProgress := func(minimalReady bool) { + if cfg.SyncProgress == nil { + return + } + rc.informerMu.RLock() + var synced, total int + for _, s := range rc.informerStatuses { + if s.Deferred { + continue + } + total++ + if s.Synced { + synced++ + } + } + rc.informerMu.RUnlock() + cfg.SyncProgress(synced, total, minimalReady) + } + + minimalReady := func() bool { + for _, e := range minimalEntries { + if !e.synced() { + return false + } + } + return true + } + + // Initial progress emission so UI shows "0/N" right away. + emitProgress(false) + for { - // Check if all critical informers are synced + // Check if all critical informers are synced — preferred exit. 
allSynced := true for _, fn := range criticalSyncFuncs { if !fn() { @@ -300,14 +368,23 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { break } - // Wait for either: stopCh, deadline, or next progress tick + // Once the patience window has elapsed, exit as soon as the + // minimal set is ready. Until then, keep waiting for everything. + if patienceElapsed && useMinimalSet && minimalReady() { + break + } + select { case <-stopCh: return nil, fmt.Errorf("failed to sync critical resource caches") + case <-patienceCh: + patienceElapsed = true + patienceCh = nil // disable further receives on closed timer case <-deadlineCh: timedOut = true case <-progressTicker.C: - // Log which informers are still pending, with item counts + // Log + emit progress callback. Log every tick; the application + // callback decides whether to surface anything (e.g. via SSE). counts := rc.GetKindObjectCounts() rc.informerMu.RLock() var synced, pendingParts []string @@ -323,8 +400,11 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { } } rc.informerMu.RUnlock() - stdlog.Printf("Critical sync progress: %d/%d synced (%.0fs elapsed) — pending: %s", - len(synced), len(synced)+len(pendingParts), time.Since(syncStart).Seconds(), strings.Join(pendingParts, ", ")) + if len(pendingParts) > 0 { + stdlog.Printf("Critical sync progress: %d/%d synced (%.0fs elapsed) — pending: %s", + len(synced), len(synced)+len(pendingParts), time.Since(syncStart).Seconds(), strings.Join(pendingParts, ", ")) + } + emitProgress(useMinimalSet && patienceElapsed && minimalReady()) default: time.Sleep(100 * time.Millisecond) } @@ -334,34 +414,60 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { } } - if timedOut { - // Promote unsynced critical informers to deferred so they continue - // syncing in the background. The UI renders with partial data. - var promoted []string - rc.informerMu.Lock() - for i, e := range allEntries { - if e.deferred || e.synced() { - continue - } - promoted = append(promoted, e.kind) - // Add to deferred tracking (the informer is already running from Phase 1) - deferredSyncFuncs = append(deferredSyncFuncs, e.synced) - deferredKeys = append(deferredKeys, e.key) - // Mark as deferred in diagnostics - rc.informerStatuses[i].Deferred = true + // Promotion: any critical informer not yet synced gets reclassified as + // deferred so the cache can return. Applies to both the SyncTimeout + // (legacy) path and the MinimalSet (new) path. When everything synced + // in time, this is a no-op. 
+ var promoted []string + rc.informerMu.Lock() + for i, e := range allEntries { + if e.deferred || e.synced() { + continue } - rc.informerMu.Unlock() + promoted = append(promoted, e.kind) + deferredSyncFuncs = append(deferredSyncFuncs, e.synced) + deferredKeys = append(deferredKeys, e.key) + rc.informerStatuses[i].Deferred = true + } + rc.informerMu.Unlock() + + switch { + case timedOut: stdlog.Printf("WARNING: Critical sync timed out after %v — promoting %d informers to deferred: %s", cfg.SyncTimeout, len(promoted), strings.Join(promoted, ", ")) stdlog.Printf("UI will render with partial data; promoted informers continue syncing in background") logf(" Phase 1 sync TIMED OUT (%d critical, %d promoted to deferred): %v", len(criticalSyncFuncs), len(promoted), time.Since(syncStart)) rc.promotedKinds = promoted - } else { + case patienceElapsed && len(promoted) > 0: + stdlog.Printf("First-paint ready after %v: minimal set synced; %d slower informers continue in background: %s", + time.Since(syncStart), len(promoted), strings.Join(promoted, ", ")) + logf(" Phase 1 minimal-set sync (%d/%d critical, %d still loading): %v", + len(criticalSyncFuncs)-len(promoted), len(criticalSyncFuncs), len(promoted), time.Since(syncStart)) + rc.promotedKinds = promoted + default: logf(" Phase 1 sync (%d critical informers): %v", len(criticalSyncFuncs), time.Since(syncStart)) stdlog.Printf("Critical resource caches synced in %v — UI can render", time.Since(syncStart)) } + // Final progress emission so the UI's "X of Y ready" indicator settles + // to the correct state immediately upon return. + if cfg.SyncProgress != nil { + rc.informerMu.RLock() + var synced, total int + for _, s := range rc.informerStatuses { + if s.Deferred { + continue + } + total++ + if s.Synced { + synced++ + } + } + rc.informerMu.RUnlock() + cfg.SyncProgress(synced, total, true) + } + // Log per-type resource counts for startup diagnostics if counts := rc.GetKindObjectCounts(); len(counts) > 0 { total := 0 diff --git a/pkg/k8score/cache_test.go b/pkg/k8score/cache_test.go index 17e20621d..0699da2ca 100644 --- a/pkg/k8score/cache_test.go +++ b/pkg/k8score/cache_test.go @@ -176,6 +176,118 @@ func TestNewResourceCache_DeferredSync_PartialFailure(t *testing.T) { } } +// TestNewResourceCache_MinimalSet_AllFast verifies that when the patience +// window is set but every informer syncs quickly, NewResourceCache returns +// via the all-critical-synced path (not the minimal-set fallback). No +// informers should be promoted, syncProgress should fire to completion. +func TestNewResourceCache_MinimalSet_AllFast(t *testing.T) { + client := fake.NewSimpleClientset() + + var lastSynced, lastTotal int + var lastReady bool + var progMu sync.Mutex + + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + Services: true, + Deployments: true, + Ingresses: true, + }, + PatienceWindow: 2 * time.Second, + MinimalSet: map[string]bool{ + Pods: true, + Services: true, + }, + SyncProgress: func(synced, total int, ready bool) { + progMu.Lock() + lastSynced, lastTotal, lastReady = synced, total, ready + progMu.Unlock() + }, + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + // All four critical informers should be synced — no promotion. 
+ if got := rc.PromotedKinds(); len(got) != 0 { + t.Errorf("expected no promoted kinds when all critical synced fast, got %v", got) + } + + progMu.Lock() + defer progMu.Unlock() + if lastTotal != 4 { + t.Errorf("expected SyncProgress.total=4, got %d", lastTotal) + } + if lastSynced != 4 { + t.Errorf("expected SyncProgress.synced=4, got %d", lastSynced) + } + if !lastReady { + t.Error("expected final SyncProgress to report minimalReady=true") + } +} + +// TestNewResourceCache_MinimalSet_Promotion verifies the slow-cluster path: +// when a non-minimal critical informer can't sync within the patience +// window but minimal-set members are ready, NewResourceCache returns and +// promotes the slow informer to deferred. The promoted informer must +// continue running and eventually become available. +func TestNewResourceCache_MinimalSet_Promotion(t *testing.T) { + client := fake.NewSimpleClientset() + + // Make ingresses LIST fail. The reflector retries with backoff, so + // HasSynced() stays false, but the failure returns immediately — + // no shared tracker lock held, pods/services LIST proceed normally. + // Ingress is critical (not in MinimalSet) so it should be the one + // that gets promoted. + client.PrependReactor("list", "ingresses", func(action k8stesting.Action) (bool, runtime.Object, error) { + return true, nil, fmt.Errorf("simulated slow API: ingresses unavailable") + }) + + start := time.Now() + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + Services: true, + Ingresses: true, + }, + PatienceWindow: 200 * time.Millisecond, + MinimalSet: map[string]bool{ + Pods: true, + Services: true, + }, + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + elapsed := time.Since(start) + if elapsed < 200*time.Millisecond { + t.Errorf("expected return after at least the patience window (200ms), got %v", elapsed) + } + // Cap upper bound generously: patience window + a few jitter ticks. + if elapsed > 3*time.Second { + t.Errorf("returned far later than patience window — minimal-set fallback didn't trigger? elapsed=%v", elapsed) + } + + promoted := rc.PromotedKinds() + if len(promoted) != 1 || promoted[0] != "Ingress" { + t.Errorf("expected Ingress to be promoted, got %v", promoted) + } + + // Minimal-set listers should be available immediately. + if rc.Pods() == nil { + t.Error("expected Pods() lister available after first paint") + } + if rc.Services() == nil { + t.Error("expected Services() lister available after first paint") + } +} + func TestNewResourceCache_Callbacks(t *testing.T) { // Pre-create a pod so the informer fires an add event pod := &corev1.Pod{ diff --git a/pkg/k8score/types.go b/pkg/k8score/types.go index 928aa92bc..154c91787 100644 --- a/pkg/k8score/types.go +++ b/pkg/k8score/types.go @@ -144,9 +144,37 @@ type CacheConfig struct { // SyncTimeout is the maximum time to wait for critical informers to sync // before proceeding with partial data. Unsynced critical informers are // promoted to deferred and continue syncing in the background. - // Zero means wait indefinitely (original behavior). + // Zero means wait indefinitely (original behavior). Prefer PatienceWindow + // + MinimalSet for application-driven first-paint, this remains for + // backwards compat (skyhook-connector still uses it). SyncTimeout time.Duration + // PatienceWindow is the soft deadline after which the cache returns as + // soon as MinimalSet (below) is fully synced. 
Critical informers not in + // MinimalSet that are still syncing are promoted to deferred so the + // caller can render a useful first paint while the rest stream in. + // + // When 0 or when MinimalSet is empty, this falls back to the legacy + // behavior controlled by SyncTimeout. + PatienceWindow time.Duration + + // MinimalSet is the subset of critical resource types (keyed by + // ResourceType, e.g. Pods, Services) that the application considers + // the irreducible minimum for a useful first render. Once these are + // synced AND the patience window has elapsed, NewResourceCache + // returns even if other critical informers are still syncing. + // + // Keys must reference types also present in ResourceTypes. Members + // that are deferred or absent are ignored. + MinimalSet map[string]bool + + // SyncProgress is invoked roughly every second during the critical + // sync phase with the current progress. It is also invoked once when + // first paint becomes ready (allCritical=true OR minimalReady=true + // after the patience window elapsed). Application code uses this to + // surface a "loading X of Y resource types" indicator. + SyncProgress func(synced, total int, minimalReady bool) + // DeferredSyncTimeout caps how long we wait for deferred informers to // finish syncing before giving up. When the deadline fires, deferredDone // is closed and deferredFailed is set so any still-unsynced types return diff --git a/web/src/components/home/HomeView.tsx b/web/src/components/home/HomeView.tsx index 7da0f5928..c43c5f811 100644 --- a/web/src/components/home/HomeView.tsx +++ b/web/src/components/home/HomeView.tsx @@ -11,7 +11,7 @@ import { NetworkPolicyCoverageCard } from './NetworkPolicyCoverageCard' import { CostCard } from './CostCard' import { AuditCard, PaneLoader, StatusDot, mapHealthToTone } from '@skyhook-io/k8s-ui' import { ClusterHealthCard } from './ClusterHealthCard' -import { AlertTriangle, Shield } from 'lucide-react' +import { AlertTriangle, Loader2, Shield } from 'lucide-react' import { clsx } from 'clsx' interface HomeViewProps { @@ -58,9 +58,21 @@ export function HomeView({ namespaces, topology, onNavigateToView, onNavigateToR const hasProblems = data.problems && data.problems.length > 0 + const stillLoading = data.deferredLoading || (data.partialData && data.partialData.length > 0) + return (
+      {stillLoading && (
+        <div className="…">
+          <Loader2 className="…" />
+          <span>
+            {data.partialData && data.partialData.length > 0
+              ? `Still loading: ${data.partialData.join(', ')}`
+              : 'Loading remaining resources…'}
+          </span>
+        </div>
+ )} {/* Row 1: Cluster Health Card (combined health + resource counts) */} Date: Mon, 27 Apr 2026 02:10:10 +0300 Subject: [PATCH 2/4] review fixes: minimal-set typo guard, live partial-data, comment cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Guard MinimalSet against typos / unmatched keys: log a warning at cache construction listing keys that don't correspond to any enabled critical informer, plus a separate warning when the effective minimal set is empty (would otherwise return at PatienceWindow with nothing meaningfully gated). - Add PendingPromotedKinds() — live-filtered view of promoted-at-first- paint informers that still aren't synced. Dashboard banner switches to this so the "still loading" pill drains as kinds arrive instead of showing the snapshot from connect time. - After patience elapses, log distinguishes "first-paint blocked" (only minimal-set kinds matter) from generic "critical sync progress" so operators can see the actual blocker. - New tests: legacy SyncTimeout promotion, MinimalSet typo, PatienceWindow without MinimalSet, PendingPromotedKinds live-filter contract. - Comment cleanup: drop commit-flavored "(legacy)/(new)" parentheticals, fix SyncProgress godoc parameter names, trim restating-what comments. --- internal/k8s/cache.go | 25 +++--- internal/server/dashboard.go | 10 ++- pkg/k8score/cache.go | 112 +++++++++++++++++++------- pkg/k8score/cache_test.go | 151 +++++++++++++++++++++++++++++++++++ pkg/k8score/types.go | 19 +++-- web/src/api/client.ts | 2 +- 6 files changed, 261 insertions(+), 58 deletions(-) diff --git a/internal/k8s/cache.go b/internal/k8s/cache.go index 314389a2f..0cc7aa22a 100644 --- a/internal/k8s/cache.go +++ b/internal/k8s/cache.go @@ -65,16 +65,12 @@ var deferredResources = map[string]bool{ "limitranges": true, // audit inheritance lookups, not first-render } -// minimalFirstPaintSet is the irreducible subset of critical informers the -// home dashboard needs to feel coherent. Once these are synced (and the -// patience window has elapsed), the cache returns and slower critical -// informers (ingresses, jobs, cronjobs, etc.) continue loading in the -// background, joining the topology when they arrive. -// -// Pods are deliberately included even though they are typically the -// largest kind — without pods the topology and resource counts are -// essentially empty. The patience window absorbs pod-sync latency on -// most clusters; only when it doesn't do we render with what we have. +// minimalFirstPaintSet is the subset of critical informers the home +// dashboard needs to feel coherent. Pods are included despite being +// typically the largest kind — without pods the topology graph and +// resource counts are empty. The patience window absorbs pod-sync +// latency on healthy clusters; on slow ones, the user sees a working +// home view sooner with a "still loading" hint for the rest. var minimalFirstPaintSet = map[string]bool{ "pods": true, "namespaces": true, @@ -293,11 +289,10 @@ func recordK8sEventToTimeline(obj any) { // emitSyncProgress is the SyncProgress callback wired into the resource // cache. It keeps the connection's progressMessage in step with the live -// informer-sync count so the connecting screen shows "Loading cluster -// data… X of Y resource types ready" instead of a frozen "Loading -// workloads…". 
After first paint (minimalReady=true), the cache returns -// and the connection state flips to "connected" — further progress lives -// in the deferred-loading indicator on the home dashboard. +// informer-sync count so the connecting screen ticks up instead of +// sitting on a static message during a 30–60s sync. Once the cache +// returns and connection state flips to "connected", further progress +// lives in the home dashboard's deferred-loading indicator. func emitSyncProgress(synced, total int, minimalReady bool) { if total == 0 { return diff --git a/internal/server/dashboard.go b/internal/server/dashboard.go index 2e3da8618..924496368 100644 --- a/internal/server/dashboard.go +++ b/internal/server/dashboard.go @@ -249,10 +249,12 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) { // may be incomplete because deferred informers are still syncing. resp.DeferredLoading = !cache.IsDeferredSynced() - // If critical sync timed out, tell the frontend which resource kinds - // may be missing so it can show a banner. - if promoted := cache.PromotedKinds(); len(promoted) > 0 && resp.DeferredLoading { - resp.PartialData = promoted + // If critical informers were promoted at first paint, tell the + // frontend which kinds are STILL loading (live-filtered, not the + // snapshot from connect time) so the banner doesn't list kinds that + // have since populated. + if pending := cache.PendingPromotedKinds(); len(pending) > 0 { + resp.PartialData = pending } // --- Slow network calls: run in parallel --- diff --git a/pkg/k8score/cache.go b/pkg/k8score/cache.go index 309d6db11..2753641f9 100644 --- a/pkg/k8score/cache.go +++ b/pkg/k8score/cache.go @@ -85,10 +85,10 @@ type informerSetup struct { // NewResourceCache creates and starts a ResourceCache from the given config. // Startup has three tiers: -// - Critical informers block until synced. If both PatienceWindow and -// MinimalSet are set, the cache returns as soon as the minimal set is -// ready *after* the patience window elapses (rest of critical promoted -// to deferred). Otherwise, falls back to the legacy SyncTimeout path. +// - Critical informers block until synced. With PatienceWindow + MinimalSet, +// the cache returns as soon as the minimal set is ready *after* the +// patience window elapses (rest of critical promoted to deferred). With +// SyncTimeout, the cache returns at most after the timeout. // - Deferred informers sync in the background after critical completes. // - Background informers (e.g. Events) sync independently on their own goroutine. func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { @@ -279,13 +279,13 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { // Phase 1: Wait for critical informers. // // Two modes: - // - Patience+MinimalSet (preferred): wait for ALL critical until the - // patience window elapses. Once elapsed, return as soon as the - // minimal set is ready; promote the rest of critical to deferred. - // Keep waiting indefinitely until MinimalSet is satisfied — there - // is no hard timeout, the loading UI handles the wait. - // - SyncTimeout (legacy): wait for ALL critical, hard-cap at the - // timeout, promote everything still pending. + // - Patience+MinimalSet: wait for ALL critical until the patience + // window elapses. Once elapsed, return as soon as the minimal set + // is ready; promote the rest of critical to deferred. 
Keep waiting + // indefinitely until MinimalSet is satisfied — there is no hard + // timeout, the loading UI handles the wait. + // - SyncTimeout: wait for ALL critical, hard-cap at the timeout, + // promote everything still pending. useMinimalSet := cfg.PatienceWindow > 0 && len(cfg.MinimalSet) > 0 timedOut := false patienceElapsed := false @@ -295,7 +295,11 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { // types or unknown keys don't gate first paint). var minimalEntries []informerEntry if useMinimalSet { + knownCritical := make(map[string]bool, len(allEntries)) for _, e := range allEntries { + if !e.deferred { + knownCritical[e.key] = true + } if e.deferred { continue } @@ -303,6 +307,23 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { minimalEntries = append(minimalEntries, e) } } + // Validate MinimalSet keys: typos or kinds not enabled produce a + // silently-empty minimalEntries → cache returns ~PatienceWindow + // later with nothing meaningful synced. Surface that loud. + var unknown []string + for k := range cfg.MinimalSet { + if !knownCritical[k] { + unknown = append(unknown, k) + } + } + if len(unknown) > 0 { + sort.Strings(unknown) + stdlog.Printf("WARNING: MinimalSet keys not registered as critical informers (typo or RBAC-denied?): %s", + strings.Join(unknown, ", ")) + } + if len(minimalEntries) == 0 { + stdlog.Printf("WARNING: MinimalSet matched no enabled critical informers; first paint will fire as soon as PatienceWindow elapses regardless of sync state") + } } progressTicker := time.NewTicker(1 * time.Second) @@ -352,11 +373,15 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { return true } - // Initial progress emission so UI shows "0/N" right away. emitProgress(false) + // Set of minimal-set keys for the post-patience log discrimination. + minimalKindSet := make(map[string]bool, len(minimalEntries)) + for _, e := range minimalEntries { + minimalKindSet[e.kind] = true + } + for { - // Check if all critical informers are synced — preferred exit. allSynced := true for _, fn := range criticalSyncFuncs { if !fn() { @@ -368,8 +393,6 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { break } - // Once the patience window has elapsed, exit as soon as the - // minimal set is ready. Until then, keep waiting for everything. if patienceElapsed && useMinimalSet && minimalReady() { break } @@ -379,15 +402,13 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { return nil, fmt.Errorf("failed to sync critical resource caches") case <-patienceCh: patienceElapsed = true - patienceCh = nil // disable further receives on closed timer + patienceCh = nil // disable further receives on the drained timer channel case <-deadlineCh: timedOut = true case <-progressTicker.C: - // Log + emit progress callback. Log every tick; the application - // callback decides whether to surface anything (e.g. via SSE). 
counts := rc.GetKindObjectCounts() rc.informerMu.RLock() - var synced, pendingParts []string + var synced, pendingParts, minimalPending []string for _, s := range rc.informerStatuses { if s.Deferred { continue @@ -397,12 +418,23 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { } else { n := counts[s.Kind] pendingParts = append(pendingParts, fmt.Sprintf("%s(%d)", s.Kind, n)) + if minimalKindSet[s.Kind] { + minimalPending = append(minimalPending, s.Kind) + } } } rc.informerMu.RUnlock() if len(pendingParts) > 0 { - stdlog.Printf("Critical sync progress: %d/%d synced (%.0fs elapsed) — pending: %s", - len(synced), len(synced)+len(pendingParts), time.Since(syncStart).Seconds(), strings.Join(pendingParts, ", ")) + // After patience elapses, the actual blocker for first paint + // is the minimal-set subset, not all of critical — surface it + // distinctly so operators know which kind is holding render. + if patienceElapsed && len(minimalPending) > 0 { + stdlog.Printf("First-paint blocked: %d/%d minimal-set kinds still syncing (%.0fs elapsed) — pending: %s", + len(minimalPending), len(minimalEntries), time.Since(syncStart).Seconds(), strings.Join(minimalPending, ", ")) + } else { + stdlog.Printf("Critical sync progress: %d/%d synced (%.0fs elapsed) — pending: %s", + len(synced), len(synced)+len(pendingParts), time.Since(syncStart).Seconds(), strings.Join(pendingParts, ", ")) + } } emitProgress(useMinimalSet && patienceElapsed && minimalReady()) default: @@ -414,10 +446,8 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { } } - // Promotion: any critical informer not yet synced gets reclassified as - // deferred so the cache can return. Applies to both the SyncTimeout - // (legacy) path and the MinimalSet (new) path. When everything synced - // in time, this is a no-op. + // Reclassify any critical informer still pending as deferred so the + // cache can return. No-op on the all-synced path. var promoted []string rc.informerMu.Lock() for i, e := range allEntries { @@ -450,8 +480,6 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { stdlog.Printf("Critical resource caches synced in %v — UI can render", time.Since(syncStart)) } - // Final progress emission so the UI's "X of Y ready" indicator settles - // to the correct state immediately upon return. if cfg.SyncProgress != nil { rc.informerMu.RLock() var synced, total int @@ -867,7 +895,10 @@ func (rc *ResourceCache) ChangesRaw() chan ResourceChange { } // PromotedKinds returns the list of resource kinds that were promoted from -// critical to deferred due to SyncTimeout. Empty if sync completed normally. +// critical to deferred at first paint. Empty if sync completed normally. +// This is a snapshot from cache construction and does NOT shrink as +// promoted informers eventually sync — use PendingPromotedKinds for the +// live "still loading" view (e.g. for UI banners). func (rc *ResourceCache) PromotedKinds() []string { if rc == nil { return nil @@ -875,6 +906,31 @@ func (rc *ResourceCache) PromotedKinds() []string { return rc.promotedKinds } +// PendingPromotedKinds returns the subset of PromotedKinds whose informers +// have not yet completed their initial sync. Drains to empty as the +// background informers finish, so a UI bound to this method shows a +// truthful "still loading" indicator. 
+func (rc *ResourceCache) PendingPromotedKinds() []string { + if rc == nil || len(rc.promotedKinds) == 0 { + return nil + } + rc.informerMu.RLock() + defer rc.informerMu.RUnlock() + syncedByKind := make(map[string]bool, len(rc.informerStatuses)) + for _, s := range rc.informerStatuses { + if s.Synced { + syncedByKind[s.Kind] = true + } + } + var pending []string + for _, k := range rc.promotedKinds { + if !syncedByKind[k] { + pending = append(pending, k) + } + } + return pending +} + // IsSyncComplete returns true after the initial critical informer sync. func (rc *ResourceCache) IsSyncComplete() bool { if rc == nil { diff --git a/pkg/k8score/cache_test.go b/pkg/k8score/cache_test.go index 0699da2ca..75bfaa322 100644 --- a/pkg/k8score/cache_test.go +++ b/pkg/k8score/cache_test.go @@ -288,6 +288,157 @@ func TestNewResourceCache_MinimalSet_Promotion(t *testing.T) { } } +// TestNewResourceCache_SyncTimeout_Promotion covers the legacy hard-cap +// path used by skyhook-connector. Without PatienceWindow/MinimalSet, a +// stuck critical informer must promote at SyncTimeout and the cache +// must return. +func TestNewResourceCache_SyncTimeout_Promotion(t *testing.T) { + client := fake.NewSimpleClientset() + client.PrependReactor("list", "ingresses", func(action k8stesting.Action) (bool, runtime.Object, error) { + return true, nil, fmt.Errorf("simulated stuck API") + }) + + start := time.Now() + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + Ingresses: true, + }, + SyncTimeout: 200 * time.Millisecond, + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + elapsed := time.Since(start) + if elapsed < 200*time.Millisecond { + t.Errorf("expected return after SyncTimeout (200ms), got %v", elapsed) + } + if elapsed > 3*time.Second { + t.Errorf("returned far later than SyncTimeout — promotion didn't fire? elapsed=%v", elapsed) + } + + promoted := rc.PromotedKinds() + if len(promoted) != 1 || promoted[0] != "Ingress" { + t.Errorf("expected Ingress to be promoted, got %v", promoted) + } +} + +// TestNewResourceCache_MinimalSet_UnknownKey verifies the validation +// log path: a typo or kind not enabled in ResourceTypes results in an +// empty effective minimal set; the cache returns at PatienceWindow with +// nothing meaningfully gating first paint. This is intentionally +// permissive (we don't fail construction) but the operator must see a +// warning. We don't capture log output here — just verify the cache +// still returns and shape is consistent. +func TestNewResourceCache_MinimalSet_UnknownKey(t *testing.T) { + client := fake.NewSimpleClientset() + + start := time.Now() + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + }, + PatienceWindow: 200 * time.Millisecond, + MinimalSet: map[string]bool{ + "pod": true, // typo — should be "pods" + }, + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + // Pods syncs instantly on fake client, so allCritical fires before + // patience even elapses — promoted should be empty. 
+ if got := rc.PromotedKinds(); len(got) != 0 { + t.Errorf("expected no promoted kinds (everything synced), got %v", got) + } + if rc.Pods() == nil { + t.Error("expected Pods() lister to be available") + } + // Sanity: returned in reasonable time despite the bogus minimal-set key + if elapsed := time.Since(start); elapsed > 3*time.Second { + t.Errorf("returned far too late with bogus MinimalSet key: %v", elapsed) + } +} + +// TestNewResourceCache_PatienceWindow_WithoutMinimalSet verifies the +// edge case where only PatienceWindow is set: useMinimalSet is false, +// so behavior degrades to "wait indefinitely for all critical" with no +// hard cap. With a fake client that syncs instantly, this should just +// return on the all-synced path. +func TestNewResourceCache_PatienceWindow_WithoutMinimalSet(t *testing.T) { + client := fake.NewSimpleClientset() + + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + }, + PatienceWindow: 100 * time.Millisecond, + // MinimalSet intentionally nil + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + if got := rc.PromotedKinds(); len(got) != 0 { + t.Errorf("expected no promotion without MinimalSet, got %v", got) + } + if rc.Pods() == nil { + t.Error("expected Pods() lister to be available") + } +} + +// TestPendingPromotedKinds verifies the live-filtered accessor that the +// dashboard banner relies on: starts equal to PromotedKinds, and shrinks +// as informers eventually sync. +func TestPendingPromotedKinds(t *testing.T) { + client := fake.NewSimpleClientset() + client.PrependReactor("list", "ingresses", func(action k8stesting.Action) (bool, runtime.Object, error) { + return true, nil, fmt.Errorf("simulated slow API") + }) + + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + Services: true, + Ingresses: true, + }, + PatienceWindow: 200 * time.Millisecond, + MinimalSet: map[string]bool{ + Pods: true, + Services: true, + }, + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + // At first paint, Ingress is both Promoted and Pending. + if got := rc.PromotedKinds(); len(got) != 1 || got[0] != "Ingress" { + t.Fatalf("PromotedKinds: expected [Ingress], got %v", got) + } + if got := rc.PendingPromotedKinds(); len(got) != 1 || got[0] != "Ingress" { + t.Errorf("PendingPromotedKinds: expected [Ingress], got %v", got) + } + // PromotedKinds is a stable historical snapshot, PendingPromotedKinds + // is the live view — verify they are distinct concepts and don't + // share backing storage. + promoted := rc.PromotedKinds() + pending := rc.PendingPromotedKinds() + if len(promoted) > 0 && len(pending) > 0 && &promoted[0] == &pending[0] { + t.Error("PromotedKinds and PendingPromotedKinds must not share backing array") + } +} + func TestNewResourceCache_Callbacks(t *testing.T) { // Pre-create a pod so the informer fires an add event pod := &corev1.Pod{ diff --git a/pkg/k8score/types.go b/pkg/k8score/types.go index 154c91787..853ef8f30 100644 --- a/pkg/k8score/types.go +++ b/pkg/k8score/types.go @@ -141,12 +141,10 @@ type CacheConfig struct { // In connector mode this is true. SuppressInitialAdds bool - // SyncTimeout is the maximum time to wait for critical informers to sync - // before proceeding with partial data. Unsynced critical informers are - // promoted to deferred and continue syncing in the background. - // Zero means wait indefinitely (original behavior). 
Prefer PatienceWindow - // + MinimalSet for application-driven first-paint, this remains for - // backwards compat (skyhook-connector still uses it). + // SyncTimeout is a hard cap on the critical-informer wait. When it + // fires, unsynced critical informers are promoted to deferred and the + // cache returns. Used when PatienceWindow is zero. Zero means wait + // indefinitely. SyncTimeout time.Duration // PatienceWindow is the soft deadline after which the cache returns as @@ -169,10 +167,11 @@ type CacheConfig struct { MinimalSet map[string]bool // SyncProgress is invoked roughly every second during the critical - // sync phase with the current progress. It is also invoked once when - // first paint becomes ready (allCritical=true OR minimalReady=true - // after the patience window elapsed). Application code uses this to - // surface a "loading X of Y resource types" indicator. + // sync phase, and once more when first paint is ready. `synced` and + // `total` count critical informers; `minimalReady` is true when the + // minimal set is satisfied AND the patience window has elapsed (i.e. + // the cache is about to return on the partial-paint path). + // Application code uses this to drive a "loading X of Y" indicator. SyncProgress func(synced, total int, minimalReady bool) // DeferredSyncTimeout caps how long we wait for deferred informers to diff --git a/web/src/api/client.ts b/web/src/api/client.ts index ebff45692..31c6afd1a 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -272,7 +272,7 @@ export interface DashboardResponse { audit: DashboardAudit | null nodeVersionSkew: { versions: Record; minVersion: string; maxVersion: string } | null deferredLoading?: boolean // True while deferred informers (secrets, events, etc.) are still syncing - partialData?: string[] // Resource kinds still loading after first paint (slow-cluster fallback) + partialData?: string[] // Critical kinds promoted at first paint that haven't yet finished syncing (live-filtered) accessRestricted?: boolean // True when user has no namespace access (RBAC) } From f1d53e6d5bb126327a5a735c3fb311882cbabb3b Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 3 May 2026 01:55:01 +0300 Subject: [PATCH 3/4] review followups: 5min sync backstop + map clone + dedup helper + drain test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three review items from the diagnostics PR (#603) and self-review of #544: - SyncTimeout now applies on the patience+minimal-set path too, as a hard backstop. Without it, a permanently-stuck minimal-set informer (e.g. severely throttled API server, the #598 scenario) would trap the caller on the connecting screen forever. New default in Radar is 5 min — generous enough that honest-slow clusters complete naturally, short enough that a broken cluster doesn't make Radar feel hung. When the backstop fires the existing timedOut path handles promotion, so the user falls back to today's experience (zeros + "Still loading" hint) instead of an indefinite wait. - Clone CacheConfig.MinimalSet defensively, matching the pattern for ResourceTypes / DeferredTypes. No behavior change for current callers (Radar passes a top-level var) but consistent. - New test TestNewResourceCache_MinimalSet_BackstopFires covers the worst case: a kind in the minimal set never syncs, SyncTimeout fires, cache returns with that kind promoted. 
- New test TestPendingPromotedKinds_Drains exercises the live-filtering claim — flips a failing reactor to success and verifies the pending list shrinks. - DiagnosticsOverlay: extract getPendingInformers() helper. The Set+ filter+sort logic was duplicated verbatim between PendingInformers (UI) and formatForGitHub (markdown export); future contract changes could silently desync the two. --- internal/k8s/cache.go | 10 ++ pkg/k8score/cache.go | 15 +-- pkg/k8score/cache_test.go | 116 +++++++++++++++++++ pkg/k8score/types.go | 5 +- web/src/components/ui/DiagnosticsOverlay.tsx | 24 ++-- 5 files changed, 149 insertions(+), 21 deletions(-) diff --git a/internal/k8s/cache.go b/internal/k8s/cache.go index 0cc7aa22a..e345f643f 100644 --- a/internal/k8s/cache.go +++ b/internal/k8s/cache.go @@ -86,6 +86,15 @@ var minimalFirstPaintSet = map[string]bool{ // gate and render with whatever is ready then. const firstPaintPatience = 8 * time.Second +// firstPaintBackstop is the hard upper bound on the critical-sync wait. +// If the minimal set still hasn't synced after this long, give up and +// render with whatever's available — the user gets the same partial-data +// experience they'd see today (zeros + "Still loading: …" hint) instead +// of being trapped on the connecting screen indefinitely. Picked to be +// much longer than a healthy cluster's sync time but short enough that +// a permanently-throttled API server doesn't make Radar feel broken. +const firstPaintBackstop = 5 * time.Minute + // ResourceChange is a type alias for the canonical definition in pkg/k8score. type ResourceChange = k8score.ResourceChange @@ -162,6 +171,7 @@ func InitResourceCache(ctx context.Context) error { TimingLogger: logTiming, PatienceWindow: firstPaintPatience, MinimalSet: minimalFirstPaintSet, + SyncTimeout: firstPaintBackstop, SyncProgress: emitSyncProgress, DeferredSyncTimeout: 3 * time.Minute, diff --git a/pkg/k8score/cache.go b/pkg/k8score/cache.go index 2753641f9..60692ea69 100644 --- a/pkg/k8score/cache.go +++ b/pkg/k8score/cache.go @@ -117,6 +117,7 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { // Clone caller-owned maps to prevent mutation after construction. cfg.ResourceTypes = maps.Clone(cfg.ResourceTypes) cfg.DeferredTypes = maps.Clone(cfg.DeferredTypes) + cfg.MinimalSet = maps.Clone(cfg.MinimalSet) stopCh := make(chan struct{}) changes := make(chan ResourceChange, channelSize) @@ -278,14 +279,14 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { // Phase 1: Wait for critical informers. // - // Two modes: + // Two modes (combinable): // - Patience+MinimalSet: wait for ALL critical until the patience // window elapses. Once elapsed, return as soon as the minimal set - // is ready; promote the rest of critical to deferred. Keep waiting - // indefinitely until MinimalSet is satisfied — there is no hard - // timeout, the loading UI handles the wait. - // - SyncTimeout: wait for ALL critical, hard-cap at the timeout, - // promote everything still pending. + // is ready; promote the rest of critical to deferred. + // - SyncTimeout: hard upper bound. When it fires, promote everything + // still pending and return. Acts as a backstop on the minimal-set + // path so a permanently-stuck informer doesn't trap the caller + // forever (caller still gets to render with whatever synced). 
useMinimalSet := cfg.PatienceWindow > 0 && len(cfg.MinimalSet) > 0 timedOut := false patienceElapsed := false @@ -337,7 +338,7 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { } var deadlineCh <-chan time.Time - if !useMinimalSet && cfg.SyncTimeout > 0 { + if cfg.SyncTimeout > 0 { deadline := time.NewTimer(cfg.SyncTimeout) defer deadline.Stop() deadlineCh = deadline.C diff --git a/pkg/k8score/cache_test.go b/pkg/k8score/cache_test.go index 75bfaa322..edb4b2e3f 100644 --- a/pkg/k8score/cache_test.go +++ b/pkg/k8score/cache_test.go @@ -3,6 +3,7 @@ package k8score import ( "fmt" "sync" + "sync/atomic" "testing" "time" @@ -439,6 +440,121 @@ func TestPendingPromotedKinds(t *testing.T) { } } +// TestPendingPromotedKinds_Drains verifies the live-filtering claim: as +// a promoted informer eventually catches up and reports HasSynced=true, +// it leaves PendingPromotedKinds. PromotedKinds (the snapshot) does not +// change. Without this, a UI banner would list kinds forever even after +// they finished loading. +func TestPendingPromotedKinds_Drains(t *testing.T) { + client := fake.NewSimpleClientset() + + // Toggle: fail Ingress LIST until we flip the flag, then succeed. + // The reflector retries on backoff so HasSynced flips when LIST + // stops failing. + var failIngress atomic.Bool + failIngress.Store(true) + client.PrependReactor("list", "ingresses", func(action k8stesting.Action) (bool, runtime.Object, error) { + if failIngress.Load() { + return true, nil, fmt.Errorf("simulated transient failure") + } + return false, nil, nil // pass through to default tracker + }) + + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + Services: true, + Ingresses: true, + }, + PatienceWindow: 200 * time.Millisecond, + MinimalSet: map[string]bool{ + Pods: true, + Services: true, + }, + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + // Sanity: at construction, Ingress is pending. + if got := rc.PendingPromotedKinds(); len(got) != 1 || got[0] != "Ingress" { + t.Fatalf("PendingPromotedKinds at start: expected [Ingress], got %v", got) + } + + // Flip the reactor to succeed, then poll for the live view to drain. + failIngress.Store(false) + + deadline := time.Now().Add(15 * time.Second) + for time.Now().Before(deadline) { + if len(rc.PendingPromotedKinds()) == 0 { + break + } + time.Sleep(50 * time.Millisecond) + } + if got := rc.PendingPromotedKinds(); len(got) != 0 { + t.Errorf("PendingPromotedKinds did not drain after Ingress LIST began succeeding; still pending: %v", got) + } + + // PromotedKinds is the snapshot — must NOT shrink. + if got := rc.PromotedKinds(); len(got) != 1 || got[0] != "Ingress" { + t.Errorf("PromotedKinds (snapshot) should not shrink; expected [Ingress], got %v", got) + } +} + +// TestNewResourceCache_MinimalSet_BackstopFires covers the worst case on +// the patience+minimal-set path: a kind that's IN the minimal set never +// syncs. Without a backstop the cache would block in NewResourceCache +// forever, trapping the caller on a connecting screen. SyncTimeout is +// the safety net — it must promote everything still pending (including +// minimal-set members) and let the cache return. +func TestNewResourceCache_MinimalSet_BackstopFires(t *testing.T) { + client := fake.NewSimpleClientset() + // Pods is in MinimalSet — make its LIST fail forever. 
+ client.PrependReactor("list", "pods", func(action k8stesting.Action) (bool, runtime.Object, error) { + return true, nil, fmt.Errorf("simulated permanently-stuck API") + }) + + start := time.Now() + rc, err := NewResourceCache(CacheConfig{ + Client: client, + ResourceTypes: map[string]bool{ + Pods: true, + Services: true, + }, + PatienceWindow: 100 * time.Millisecond, + MinimalSet: map[string]bool{ + Pods: true, + Services: true, + }, + SyncTimeout: 500 * time.Millisecond, + }) + if err != nil { + t.Fatalf("NewResourceCache failed: %v", err) + } + defer rc.Stop() + + elapsed := time.Since(start) + if elapsed < 500*time.Millisecond { + t.Errorf("expected return after at least SyncTimeout (500ms), got %v", elapsed) + } + if elapsed > 3*time.Second { + t.Errorf("returned far later than SyncTimeout — backstop didn't fire? elapsed=%v", elapsed) + } + + // Pods stuck → must be promoted; Services synced → must not be promoted. + promoted := rc.PromotedKinds() + if len(promoted) != 1 || promoted[0] != "Pod" { + t.Errorf("expected only Pod to be promoted by backstop, got %v", promoted) + } + + // Lister still available (it's just empty). + if rc.Pods() == nil { + t.Error("expected Pods() lister to be available even after backstop promotion") + } +} + func TestNewResourceCache_Callbacks(t *testing.T) { // Pre-create a pod so the informer fires an add event pod := &corev1.Pod{ diff --git a/pkg/k8score/types.go b/pkg/k8score/types.go index 853ef8f30..4f37ecdc0 100644 --- a/pkg/k8score/types.go +++ b/pkg/k8score/types.go @@ -143,8 +143,9 @@ type CacheConfig struct { // SyncTimeout is a hard cap on the critical-informer wait. When it // fires, unsynced critical informers are promoted to deferred and the - // cache returns. Used when PatienceWindow is zero. Zero means wait - // indefinitely. + // cache returns. Always applies, including when PatienceWindow is set + // — in that case it acts as a backstop so a permanently-stuck informer + // doesn't trap the caller forever. Zero means wait indefinitely. SyncTimeout time.Duration // PatienceWindow is the soft deadline after which the cache returns as diff --git a/web/src/components/ui/DiagnosticsOverlay.tsx b/web/src/components/ui/DiagnosticsOverlay.tsx index 088059751..7f7bf49fb 100644 --- a/web/src/components/ui/DiagnosticsOverlay.tsx +++ b/web/src/components/ui/DiagnosticsOverlay.tsx @@ -358,13 +358,8 @@ function InformersSection({ data }: { data: DiagnosticsSnapshot }) { } function PendingInformers({ sync }: { sync: DiagCacheSyncStatus }) { - const pendingNames = new Set([ - ...(sync.pendingCritical ?? []), - ...(sync.pendingDeferred ?? []), - ]) - const pending = sync.informers.filter((i) => pendingNames.has(i.kind)) + const pending = getPendingInformers(sync) if (pending.length === 0) return null - pending.sort((a, b) => Number(a.deferred) - Number(b.deferred) || a.kind.localeCompare(b.kind)) return (
Pending Informers ({pending.length}) @@ -380,6 +375,16 @@ function PendingInformers({ sync }: { sync: DiagCacheSyncStatus }) { ) } +function getPendingInformers(sync: DiagCacheSyncStatus): DiagInformerSyncStatus[] { + const pendingNames = new Set([ + ...(sync.pendingCritical ?? []), + ...(sync.pendingDeferred ?? []), + ]) + return sync.informers + .filter((i) => pendingNames.has(i.kind)) + .sort((a, b) => Number(a.deferred) - Number(b.deferred) || a.kind.localeCompare(b.kind)) +} + function formatSyncPhase(phase: DiagSyncPhase): string { switch (phase) { case 'not_started': return 'not started' @@ -592,13 +597,8 @@ function formatForGitHub(data: DiagnosticsSnapshot, includeRawJson = true): stri if (sync.promotedKinds && sync.promotedKinds.length > 0) { lines.push(`- **Promoted to Deferred:** ${sync.promotedKinds.join(', ')}`) } - const pendingNames = new Set([ - ...(sync.pendingCritical ?? []), - ...(sync.pendingDeferred ?? []), - ]) - const pending = sync.informers.filter((i) => pendingNames.has(i.kind)) + const pending = getPendingInformers(sync) if (pending.length > 0) { - pending.sort((a, b) => Number(a.deferred) - Number(b.deferred) || a.kind.localeCompare(b.kind)) const parts = pending.map((i) => `${i.kind}(${i.deferred ? 'deferred' : 'critical'},${i.items.toLocaleString()} items)`) lines.push(`- **Pending:** ${parts.join(', ')}`) } From f31b5413d7c6eee9e7faaf6972cb8486fdeafd86 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 3 May 2026 02:18:07 +0300 Subject: [PATCH 4/4] fix(cache): final SyncProgress emit uses HasSynced, not lagging Synced flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Phase 1 loop exits on direct HasSynced() calls but the final SyncProgress emit was counting from informerStatuses[].Synced — which is set asynchronously by per-informer tracking goroutines polling at 10ms. On the all-synced happy path there's a sub-100ms window where the loop has exited but the trackers haven't caught up, so synced < total here even though everything is actually synced. Combined with hardcoded minimalReady=true, Radar's connecting screen would briefly show 'Loading cluster data… 9 of 10 ready (showing partial)' before transitioning to connected, instead of 'Finalizing…'. Read informerStatuses[].Deferred under lock (post-promotion classification) but use e.synced() (HasSynced) for the actual sync check — same source the loop used, so the count is consistent with the exit condition. --- pkg/k8score/cache.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pkg/k8score/cache.go b/pkg/k8score/cache.go index 60692ea69..dae4f2e1b 100644 --- a/pkg/k8score/cache.go +++ b/pkg/k8score/cache.go @@ -482,14 +482,20 @@ func NewResourceCache(cfg CacheConfig) (*ResourceCache, error) { } if cfg.SyncProgress != nil { + // Count via e.synced() (same source as the Phase 1 loop) rather than + // informerStatuses[].Synced, which is set asynchronously by per-informer + // tracking goroutines and can lag by ~10ms after HasSynced() flips. On + // the all-synced happy path the lag would otherwise produce synced