Skip to content

Commit 195d5e3

Browse files
fix: replace os.Exit in goroutine with context cancellation for graceful shutdown
The goroutine in main.go that waits for webhook readiness and sets up controllers was calling os.Exit(1) on failure, which kills the process immediately without releasing the leader election lease. Other replicas cannot acquire leadership until the lease TTL expires (default 15s). Additionally, WaitReady() was an infinite loop with no context or timeout, risking a zombie leader state if the webhook health check never passes. This commit: - Wraps the signal handler context with context.WithCancel - Replaces os.Exit(1) in the goroutine with cancel() + return, triggering the manager's graceful shutdown path - Updates WaitReady to accept a context.Context so it can exit when the context is cancelled Fixes #2401 Signed-off-by: abhaygoudannavar <abhaysgoudnvr@gmail.com>
1 parent 749e8f2 commit 195d5e3

File tree

4 files changed

+114
-6
lines changed

4 files changed

+114
-6
lines changed

main.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package main
1818

1919
import (
20+
"context"
2021
"flag"
2122
"net/http"
2223
_ "net/http/pprof"
@@ -164,7 +165,9 @@ func main() {
164165
})
165166
}
166167

167-
ctx := ctrl.SetupSignalHandler()
168+
signalCtx := ctrl.SetupSignalHandler()
169+
ctx, cancel := context.WithCancel(signalCtx)
170+
defer cancel()
168171
cfg := ctrl.GetConfigOrDie()
169172
setRestConfig(cfg)
170173
cfg.UserAgent = "kruise-manager"
@@ -256,15 +259,17 @@ func main() {
256259

257260
go func() {
258261
setupLog.Info("wait webhook ready")
259-
if err = webhook.WaitReady(); err != nil {
262+
if err = webhook.WaitReady(ctx); err != nil {
260263
setupLog.Error(err, "unable to wait webhook ready")
261-
os.Exit(1)
264+
cancel()
265+
return
262266
}
263267

264268
setupLog.Info("setup controllers")
265269
if err = controller.SetupWithManager(mgr); err != nil {
266270
setupLog.Error(err, "unable to setup controllers")
267-
os.Exit(1)
271+
cancel()
272+
return
268273
}
269274
}()
270275

pkg/util/inplaceupdate/inplace_update_test.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -648,3 +648,65 @@ func TestRefresh(t *testing.T) {
648648
}
649649
}
650650
}
651+
652+
func TestUpdateNextBatchReadsFromClone(t *testing.T) {
653+
aHourAgo := metav1.NewTime(time.Unix(time.Now().Add(-time.Hour).Unix(), 0))
654+
655+
oldClock := Clock
656+
Clock = testingclock.NewFakeClock(aHourAgo.Time)
657+
defer func() {
658+
Clock = oldClock
659+
}()
660+
661+
// storedPod is the pod in the fake client (what GetPod returns as "clone").
662+
// It does NOT have the InPlaceUpdateState annotation.
663+
storedPod := &v1.Pod{
664+
ObjectMeta: metav1.ObjectMeta{
665+
Name: "pod-stale-test",
666+
Namespace: "default", // Fixed: Added namespace
667+
Labels: map[string]string{
668+
apps.StatefulSetRevisionLabel: "new-revision",
669+
},
670+
},
671+
Spec: v1.PodSpec{
672+
Containers: []v1.Container{{Name: "c1", Image: "c1-img2"}, {Name: "c2", Image: "c2-img1"}},
673+
ReadinessGates: []v1.PodReadinessGate{{ConditionType: appspub.InPlaceUpdateReady}},
674+
},
675+
Status: v1.PodStatus{
676+
ContainerStatuses: []v1.ContainerStatus{
677+
{Name: "c1", ImageID: "c1-img2-ID"},
678+
{Name: "c2", ImageID: "c2-img1-ID"},
679+
},
680+
},
681+
}
682+
683+
// stalePod is what gets passed to Refresh — it's a stale snapshot that still
684+
// has the InPlaceUpdateState annotation with NextContainerImages.
685+
stalePod := storedPod.DeepCopy()
686+
stalePod.Annotations = map[string]string{
687+
appspub.InPlaceUpdateStateKey: util.DumpJSON(appspub.InPlaceUpdateState{
688+
Revision: "new-revision",
689+
UpdateTimestamp: aHourAgo,
690+
NextContainerImages: map[string]string{"c2": "c2-img2"},
691+
}),
692+
}
693+
694+
cli := fake.NewClientBuilder().WithObjects(storedPod).Build()
695+
ctrl := New(cli, revisionadapter.NewDefaultImpl())
696+
697+
res := ctrl.Refresh(stalePod, nil)
698+
if res.RefreshErr != nil {
699+
t.Fatalf("Refresh should not fail, got: %v", res.RefreshErr)
700+
}
701+
702+
// Verify the stored pod was NOT modified — updateNextBatch should have
703+
// returned early because the clone has no InPlaceUpdateState annotation.
704+
got := &v1.Pod{}
705+
// Fixed: Include Namespace in types.NamespacedName
706+
if err := cli.Get(context.TODO(), types.NamespacedName{Namespace: storedPod.Namespace, Name: storedPod.Name}, got); err != nil {
707+
t.Fatalf("failed to get pod: %v", err)
708+
}
709+
if got.Spec.Containers[1].Image != "c2-img1" {
710+
t.Fatalf("expected c2 image to remain c2-img1 (unchanged), got %s", got.Spec.Containers[1].Image)
711+
}
712+
}

pkg/webhook/server.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,10 +133,16 @@ func Checker(req *http.Request) error {
133133
return health.Checker(req)
134134
}
135135

136-
func WaitReady() error {
136+
func WaitReady(ctx context.Context) error {
137137
startTS := time.Now()
138138
var err error
139139
for {
140+
select {
141+
case <-ctx.Done():
142+
return fmt.Errorf("context canceled while waiting for webhook ready: %w", ctx.Err())
143+
default:
144+
}
145+
140146
duration := time.Since(startTS)
141147
if err = Checker(nil); err == nil {
142148
return nil
@@ -147,5 +153,4 @@ func WaitReady() error {
147153
}
148154
time.Sleep(time.Second * 2)
149155
}
150-
151156
}

pkg/webhook/server_test.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
Copyright 2024 The Kruise Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package webhook
18+
19+
import (
20+
"context"
21+
"strings"
22+
"testing"
23+
)
24+
25+
func TestWaitReadyCancel(t *testing.T) {
26+
ctx, cancel := context.WithCancel(context.Background())
27+
cancel() // instantly cancel the context
28+
29+
err := WaitReady(ctx)
30+
if err == nil {
31+
t.Fatalf("expected error, got nil")
32+
}
33+
if !strings.Contains(err.Error(), "context canceled while waiting for webhook ready") {
34+
t.Fatalf("unexpected error message: %v", err)
35+
}
36+
}

0 commit comments

Comments
 (0)