Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
if err != nil || level == unknownUpgradeLevel {
return nil, fmt.Errorf("failed to determine upgrade level: %w", err)
}
junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel)...)
junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel, w.adminRESTConfig)...)
} else {
junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,14 @@ func getControlPlaneTopology(clientConfig *rest.Config) (configv1.TopologyMode,
return *topo, nil
}

// isTNFJobClusterOperatorReason reports whether reason has the shape
// tnf-<workflow>_JobRunning, i.e. it starts with "tnf-" and ends with
// "_JobRunning".
//
// These are the ClusterOperator condition Reason values emitted while two-node
// fencing (TNF) batch Jobs run in openshift-etcd: the cluster-etcd-operator maps
// active Job state into etcd's ClusterOperator. Some Jobs carry a per-job hash
// suffix inside the workflow part, e.g. tnf-auth-job-master-0-64736551_JobRunning,
// which is why only the two ends of the string are checked.
func isTNFJobClusterOperatorReason(reason string) bool {
	const (
		tnfJobPrefix     = "tnf-"
		jobRunningSuffix = "_JobRunning"
	)

	// Anything that does not start with the TNF job marker cannot match.
	if !strings.HasPrefix(reason, tnfJobPrefix) {
		return false
	}
	return strings.HasSuffix(reason, jobRunningSuffix)
}

// isInUpgradeWindow determines if the given eventInterval falls within an upgrade window.
// UpgradeStart and UpgradeRollback events start upgrade windows and can end an already started upgrade window.
// UpgradeComplete and UpgradeFailed events end upgrade windows; if there was not an already started upgrade window,
Expand Down Expand Up @@ -290,6 +298,11 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
strings.Contains(condition.Message, `Waiting for Deployment`) {
return "csi snapshot controller is allowed to have Available=False due to CSI webhook test on two node"
}
case "etcd":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
isTNFJobClusterOperatorReason(condition.Reason) {
return "clusteroperator/etcd may report Available=False while a TNF batch Job is running on dual-replica topology (CEO JobRunning condition reasons)"
}
}
}

Expand Down Expand Up @@ -366,13 +379,28 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
return "https://issues.redhat.com/browse/OCPBUGS-62517"
}
case "openshift-apiserver":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
(condition.Reason == "APIServerDeployment_NoDeployment" ||
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse {
if isTwoNode && condition.Reason == "APIServices_PreconditionNotReady" {
return "openshift-apiserver may briefly report Available=False with APIServices_PreconditionNotReady during dual-replica upgrade or fencing when aggregated API preconditions lag behind member recovery"
}
if condition.Reason == "APIServerDeployment_NoDeployment" ||
condition.Reason == "APIServerDeployment_NoPod" ||
condition.Reason == "APIServerDeployment_PreconditionNotFulfilled" ||
condition.Reason == "APIServerDeployment_UnavailablePod" ||
condition.Reason == "APIServices_Error") {
return "https://issues.redhat.com/browse/OCPBUGS-23746"
condition.Reason == "APIServices_Error" {
return "https://issues.redhat.com/browse/OCPBUGS-23746"
}
}
case "openshift-samples":
if isTwoNode {
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
condition.Reason == "SampleUpsertsPending" {
return "openshift-samples may report Available=False with SampleUpsertsPending when sample CR writes hit transient apiserver errors during DualReplica disruptive upgrades"
}
if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue &&
condition.Reason == "APIServerServiceUnavailableError" {
return "openshift-samples may report Degraded with APIServerServiceUnavailableError when the API server is briefly unavailable during DualReplica upgrades"
}
}
case "operator-lifecycle-manager-packageserver":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && condition.Reason == "ClusterServiceVersionNotSucceeded" {
Expand Down Expand Up @@ -600,11 +628,21 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
return ret
}

func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool) []*junitapi.JUnitTestCase {
func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
var ret []*junitapi.JUnitTestCase
upgradeWindows := getUpgradeWindows(events)
multiUpgrades := platformidentification.UpgradeNumberDuringCollection(events, time.Time{}, time.Time{}) > 1

isTwoNode := false
if clientConfig != nil {
topology, err := getControlPlaneTopology(clientConfig)
if err != nil {
logrus.Warnf("Error checking for ControlPlaneTopology configuration for MCO co-progressing monitor (unable to apply two-node TNF exceptions): %v", err)
} else {
isTwoNode = topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode
}
}

var machineConfigProgressingStart time.Time
var eventsInUpgradeWindows monitorapi.Intervals

Expand Down Expand Up @@ -711,6 +749,22 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,

except = func(co string, reason string) string {
switch co {
case "authentication":
if isTwoNode && reason == "APIServerDeployment_NewGeneration" {
return "authentication operator may roll oauth-apiserver (APIServerDeployment_NewGeneration) during DualReplica upgrades while machine-config is progressing"
}
case "etcd":
if isTwoNode {
if reason == "NodeInstaller" {
return "clusteroperator/etcd may report Progressing=True while etcd static pods roll to a new revision (NodeInstaller) during DualReplica upgrades while machine-config is progressing"
}
if reason == "EtcdMembers_MembersNotStarted" {
return "clusteroperator/etcd may report Progressing=True while an etcd member is still joining (EtcdMembers_MembersNotStarted) during DualReplica fencing or replacement"
}
if isTNFJobClusterOperatorReason(reason) {
return "clusteroperator/etcd may report Progressing=True while a TNF batch Job is running during DualReplica topology upgrades (CEO JobRunning condition reasons)"
}
}
case "console":
if reason == "SyncLoopRefresh_InProgress" {
return "https://issues.redhat.com/browse/OCPBUGS-64688"
Expand Down Expand Up @@ -751,6 +805,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
if strings.HasSuffix(reason, "ControllerManager_Deploying") {
return "https://issues.redhat.com/browse/OCPBUGS-62635"
}
case "openshift-apiserver":
if isTwoNode && reason == "OperatorConfig_NewGeneration" {
return "openshift-apiserver operator may reconcile openshiftapiserveroperatorconfigs (OperatorConfig_NewGeneration) during DualReplica upgrades while machine-config is progressing"
}
case "operator-lifecycle-manager-packageserver":
if reason == "" {
return "https://issues.redhat.com/browse/OCPBUGS-63672"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -370,3 +370,26 @@ func Test_patchUpgradeWithConfigClient(t *testing.T) {
})
}
}

func TestIsTNFJobClusterOperatorReason(t *testing.T) {
tests := []struct {
reason string
want bool
}{
{"tnf-setup-job_JobRunning", true},
{"tnf-fencing-job_JobRunning", true},
{"tnf-auth-job-master-0-64736551_JobRunning", true},
{"tnf-update-setup-job-master-1-abc12345_JobRunning", true},
{"tnf-after-setup-job-master-0-deadbeef_JobRunning", true},
{"EtcdMembersProgressing", false},
{"NodeInstaller_InstallerPodRunning", false},
{"tnf-setup-job_JobComplete", false},
{"setup-job_JobRunning", false},
{"", false},
}
for _, tt := range tests {
t.Run(tt.reason, func(t *testing.T) {
assert.Equal(t, tt.want, isTNFJobClusterOperatorReason(tt.reason))
})
}
}