From af356f9e1ac7bb645c8239512b8782bb91d3e9b0 Mon Sep 17 00:00:00 2001 From: "Shiva Krishna, Merla" Date: Wed, 10 Jun 2026 08:30:10 -0700 Subject: [PATCH] Remove redundant component label added for backward compatibility with upgrade tests * As part of GitHub issue #1079 we introduced a redundant component label in both the kubelet plugin and the CD controller. This was mainly for compatibility with upgrade tests (checkpoint validation). * However, this caused duplicate keys to be rendered and parser validation failures with GitOps tools like ArgoCD/FluxCD. * Now that we have a stable release in upstream registries, we can safely remove the label that uses the previous name. Signed-off-by: Shiva Krishna, Merla --- .../helm/dra-driver-nvidia-gpu/templates/controller.yaml | 1 - .../dra-driver-nvidia-gpu/templates/kubeletplugin.yaml | 1 - tests/bats/Makefile | 4 ++-- tests/bats/helpers.sh | 6 ++---- tests/bats/test_gpu_updowngrade.bats | 8 ++++---- 5 files changed, 8 insertions(+), 12 deletions(-) diff --git a/deployments/helm/dra-driver-nvidia-gpu/templates/controller.yaml b/deployments/helm/dra-driver-nvidia-gpu/templates/controller.yaml index c90abad21..2c70880e2 100644 --- a/deployments/helm/dra-driver-nvidia-gpu/templates/controller.yaml +++ b/deployments/helm/dra-driver-nvidia-gpu/templates/controller.yaml @@ -35,7 +35,6 @@ spec: labels: {{- include "dra-driver-nvidia-gpu.templateLabels" . | nindent 8 }} {{- include "dra-driver-nvidia-gpu.selectorLabels" (dict "context" .
"componentName" "controller") | nindent 8 }} - nvidia-dra-driver-gpu-component: controller spec: {{- if .Values.controller.priorityClassName }} priorityClassName: {{ .Values.controller.priorityClassName }} diff --git a/deployments/helm/dra-driver-nvidia-gpu/templates/kubeletplugin.yaml b/deployments/helm/dra-driver-nvidia-gpu/templates/kubeletplugin.yaml index 4ab86bcd7..ec66931cf 100644 --- a/deployments/helm/dra-driver-nvidia-gpu/templates/kubeletplugin.yaml +++ b/deployments/helm/dra-driver-nvidia-gpu/templates/kubeletplugin.yaml @@ -38,7 +38,6 @@ spec: labels: {{- include "dra-driver-nvidia-gpu.templateLabels" . | nindent 8 }} {{- include "dra-driver-nvidia-gpu.selectorLabels" (dict "context" . "componentName" "kubelet-plugin") | nindent 8 }} - nvidia-dra-driver-gpu-component: kubelet-plugin spec: {{- if .Values.kubeletPlugin.priorityClassName }} priorityClassName: {{ .Values.kubeletPlugin.priorityClassName }} diff --git a/tests/bats/Makefile b/tests/bats/Makefile index bf56bd9c8..f097e32dd 100644 --- a/tests/bats/Makefile +++ b/tests/bats/Makefile @@ -20,8 +20,8 @@ TEST_CHART_REPO ?= "oci://gcr.io/k8s-staging-nvidia/charts/dra-driver-nvidia-gpu TEST_CHART_VERSION ?= "$(VERSION_STAGING_CHART)" # The baseline Helm chart to test upgrades from and downgrades to. -TEST_CHART_LASTSTABLE_REPO ?= "oci://ghcr.io/nvidia/k8s-dra-driver-gpu" -TEST_CHART_LASTSTABLE_VERSION ?= "25.12.0-0882da87-chart" +TEST_CHART_LASTSTABLE_REPO ?= "oci://registry.k8s.io/dra-driver-nvidia/charts/dra-driver-nvidia-gpu" +TEST_CHART_LASTSTABLE_VERSION ?= "0.4.0" # If not "false": the to-be-tested Helm chart is installed from the local # filesystem (from `deployments/helm/dra-driver-nvidia-gpu`). 
Make sure diff --git a/tests/bats/helpers.sh b/tests/bats/helpers.sh index 71b829b23..5bf943a23 100644 --- a/tests/bats/helpers.sh +++ b/tests/bats/helpers.sh @@ -85,10 +85,8 @@ iupgrade_wait() { # not natively supported by `kubectl wait`, hence this must be something of # the shape # `kubectl get pods ... | xargs -I{} kubectl wait --for=condition=Ready pod/{} ` - # TODO: change `nvidia-dra-driver-gpu-component` when last stable supports both, the - # new and the old label key sleep 1 - kubectl wait --for=condition=READY pods -A -l nvidia-dra-driver-gpu-component=kubelet-plugin --timeout=15s + kubectl wait --for=condition=READY pods -A -l dra-driver-nvidia-gpu-component=kubelet-plugin --timeout=15s # Again, log current state. kubectl get pods -n dra-driver-nvidia-gpu @@ -96,7 +94,7 @@ iupgrade_wait() { # That one should be obvious now, but make that guarantee explicit for # consuming tests. Skip when compute domains are disabled (no controller deployment). if [ "${DISABLE_COMPUTE_DOMAINS:-}" != "true" ]; then - kubectl wait --for=condition=READY pods -A -l nvidia-dra-driver-gpu-component=controller --timeout=10s + kubectl wait --for=condition=READY pods -A -l dra-driver-nvidia-gpu-component=controller --timeout=10s fi # maybe: check version on labels (to confirm that we set labels correctly) log "iupgrade_wait: done" diff --git a/tests/bats/test_gpu_updowngrade.bats b/tests/bats/test_gpu_updowngrade.bats index 74722d4ce..c6e6bd6bc 100644 --- a/tests/bats/test_gpu_updowngrade.bats +++ b/tests/bats/test_gpu_updowngrade.bats @@ -54,7 +54,7 @@ bats::on_failure() { _node=$(kubectl get pod "${_podname}" -o jsonpath='{.spec.nodeName}') log "workload runs on node: ${_node}" _kpod=$(kubectl get pods -n dra-driver-nvidia-gpu \ - -l nvidia-dra-driver-gpu-component=kubelet-plugin \ + -l dra-driver-nvidia-gpu-component=kubelet-plugin \ --field-selector spec.nodeName="${_node}" \ -o jsonpath='{.items[0].metadata.name}') log "kubelet-plugin pod on that node: ${_kpod}" @@ -119,7 
+119,7 @@ bats::on_failure() { _node=$(kubectl get pod "${_podname}" -o jsonpath='{.spec.nodeName}') log "workload runs on node: ${_node}" _kpod=$(kubectl get pods -n dra-driver-nvidia-gpu \ - -l nvidia-dra-driver-gpu-component=kubelet-plugin \ + -l dra-driver-nvidia-gpu-component=kubelet-plugin \ --field-selector spec.nodeName="${_node}" \ -o jsonpath='{.items[0].metadata.name}') log "kubelet-plugin pod on that node: ${_kpod}" @@ -172,7 +172,7 @@ bats::on_failure() { # Stage 2: pick any kubelet plugin pod. local _kpod _node _kpod=$(kubectl get pods -n dra-driver-nvidia-gpu \ - -l nvidia-dra-driver-gpu-component=kubelet-plugin \ + -l dra-driver-nvidia-gpu-component=kubelet-plugin \ -o jsonpath='{.items[0].metadata.name}') _node=$(kubectl get pod -n dra-driver-nvidia-gpu "${_kpod}" -o jsonpath='{.spec.nodeName}') log "targeting plugin pod ${_kpod} on node ${_node}" @@ -200,7 +200,7 @@ bats::on_failure() { local _newkpod="" _deadline=$((SECONDS + 60)) while true; do _newkpod=$(kubectl get pods -n dra-driver-nvidia-gpu \ - -l nvidia-dra-driver-gpu-component=kubelet-plugin \ + -l dra-driver-nvidia-gpu-component=kubelet-plugin \ --field-selector spec.nodeName="${_node}" \ -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || true if [ -n "${_newkpod}" ] && [ "${_newkpod}" != "${_kpod}" ]; then