diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go index 5e0fad828ca0..63d17ff98c0a 100644 --- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go +++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go @@ -96,7 +96,7 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C if err != nil || level == unknownUpgradeLevel { return nil, fmt.Errorf("failed to determine upgrade level: %w", err) } - junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel)...) + junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel, w.adminRESTConfig)...) } else { junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...) } diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go index 54a5bb2ae696..928f02f9bed8 100644 --- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go +++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go @@ -162,6 +162,14 @@ func getControlPlaneTopology(clientConfig *rest.Config) (configv1.TopologyMode, return *topo, nil } +// isTNFJobClusterOperatorReason matches ClusterOperator condition Reason values emitted while +// two-node fencing (TNF) batch Jobs run in openshift-etcd. The cluster-etcd-operator maps +// active Job state into etcd's ClusterOperator with reasons shaped like +// tnf-<job-name>_JobRunning (including a per-job hash suffix on some Jobs, e.g. tnf-auth-job-master-0-64736551_JobRunning). 
+func isTNFJobClusterOperatorReason(reason string) bool { + return strings.HasPrefix(reason, "tnf-") && strings.HasSuffix(reason, "_JobRunning") +} + // isInUpgradeWindow determines if the given eventInterval falls within an upgrade window. // UpgradeStart and UpgradeRollback events start upgrade windows and can end and already started upgrade window. // UpgradeComplete and UpgradeFailed events end upgrade windows; if there was not an already started upgrade window, @@ -290,6 +298,11 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf strings.Contains(condition.Message, `Waiting for Deployment`) { return "csi snapshot controller is allowed to have Available=False due to CSI webhook test on two node" } + case "etcd": + if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && + isTNFJobClusterOperatorReason(condition.Reason) { + return "clusteroperator/etcd may report Available=False while a TNF batch Job is running on dual-replica topology (CEO JobRunning condition reasons)" + } } } @@ -366,13 +379,28 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf return "https://issues.redhat.com/browse/OCPBUGS-62517" } case "openshift-apiserver": - if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && - (condition.Reason == "APIServerDeployment_NoDeployment" || + if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse { + if isTwoNode && condition.Reason == "APIServices_PreconditionNotReady" { + return "openshift-apiserver may briefly report Available=False with APIServices_PreconditionNotReady during dual-replica upgrade or fencing when aggregated API preconditions lag behind member recovery" + } + if condition.Reason == "APIServerDeployment_NoDeployment" || condition.Reason == "APIServerDeployment_NoPod" || condition.Reason == "APIServerDeployment_PreconditionNotFulfilled" || 
condition.Reason == "APIServerDeployment_UnavailablePod" || - condition.Reason == "APIServices_Error") { - return "https://issues.redhat.com/browse/OCPBUGS-23746" + condition.Reason == "APIServices_Error" { + return "https://issues.redhat.com/browse/OCPBUGS-23746" + } + } + case "openshift-samples": + if isTwoNode { + if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && + condition.Reason == "SampleUpsertsPending" { + return "openshift-samples may report Available=False with SampleUpsertsPending when sample CR writes hit transient apiserver errors during DualReplica disruptive upgrades" + } + if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue && + condition.Reason == "APIServerServiceUnavailableError" { + return "openshift-samples may report Degraded with APIServerServiceUnavailableError when the API server is briefly unavailable during DualReplica upgrades" + } } case "operator-lifecycle-manager-packageserver": if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && condition.Reason == "ClusterServiceVersionNotSucceeded" { @@ -600,11 +628,21 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes [] return ret } -func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool) []*junitapi.JUnitTestCase { +func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool, clientConfig *rest.Config) []*junitapi.JUnitTestCase { var ret []*junitapi.JUnitTestCase upgradeWindows := getUpgradeWindows(events) multiUpgrades := platformidentification.UpgradeNumberDuringCollection(events, time.Time{}, time.Time{}) > 1 + isTwoNode := false + if clientConfig != nil { + topology, err := getControlPlaneTopology(clientConfig) + if err != nil { + logrus.Warnf("Error checking for ControlPlaneTopology configuration for MCO co-progressing monitor 
(unable to apply two-node TNF exceptions): %v", err) + } else { + isTwoNode = topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode + } + } + var machineConfigProgressingStart time.Time var eventsInUpgradeWindows monitorapi.Intervals @@ -711,6 +749,22 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, except = func(co string, reason string) string { switch co { + case "authentication": + if isTwoNode && reason == "APIServerDeployment_NewGeneration" { + return "authentication operator may roll oauth-apiserver (APIServerDeployment_NewGeneration) during DualReplica upgrades while machine-config is progressing" + } + case "etcd": + if isTwoNode { + if reason == "NodeInstaller" { + return "clusteroperator/etcd may report Progressing=True while etcd static pods roll to a new revision (NodeInstaller) during DualReplica upgrades while machine-config is progressing" + } + if reason == "EtcdMembers_MembersNotStarted" { + return "clusteroperator/etcd may report Progressing=True while an etcd member is still joining (EtcdMembers_MembersNotStarted) during DualReplica fencing or replacement" + } + if isTNFJobClusterOperatorReason(reason) { + return "clusteroperator/etcd may report Progressing=True while a TNF batch Job is running during DualReplica topology upgrades (CEO JobRunning condition reasons)" + } + } case "console": if reason == "SyncLoopRefresh_InProgress" { return "https://issues.redhat.com/browse/OCPBUGS-64688" @@ -751,6 +805,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, if strings.HasSuffix(reason, "ControllerManager_Deploying") { return "https://issues.redhat.com/browse/OCPBUGS-62635" } + case "openshift-apiserver": + if isTwoNode && reason == "OperatorConfig_NewGeneration" { + return "openshift-apiserver operator may reconcile openshiftapiserveroperatorconfigs (OperatorConfig_NewGeneration) during DualReplica upgrades while machine-config is 
progressing" + } case "operator-lifecycle-manager-packageserver": if reason == "" { return "https://issues.redhat.com/browse/OCPBUGS-63672" diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators_test.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators_test.go index 017b8e966d6c..c6185d4a1751 100644 --- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators_test.go +++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators_test.go @@ -370,3 +370,26 @@ func Test_patchUpgradeWithConfigClient(t *testing.T) { }) } } + +func TestIsTNFJobClusterOperatorReason(t *testing.T) { + tests := []struct { + reason string + want bool + }{ + {"tnf-setup-job_JobRunning", true}, + {"tnf-fencing-job_JobRunning", true}, + {"tnf-auth-job-master-0-64736551_JobRunning", true}, + {"tnf-update-setup-job-master-1-abc12345_JobRunning", true}, + {"tnf-after-setup-job-master-0-deadbeef_JobRunning", true}, + {"EtcdMembersProgressing", false}, + {"NodeInstaller_InstallerPodRunning", false}, + {"tnf-setup-job_JobComplete", false}, + {"setup-job_JobRunning", false}, + {"", false}, + } + for _, tt := range tests { + t.Run(tt.reason, func(t *testing.T) { + assert.Equal(t, tt.want, isTNFJobClusterOperatorReason(tt.reason)) + }) + } +}