From 9734b95f5cebaaf3e968cdb3acc266badc5fbe2c Mon Sep 17 00:00:00 2001 From: Swati Gupta Date: Tue, 9 Jun 2026 22:39:12 +0000 Subject: [PATCH] Stop injecting implicit TimeSlicing and reset on unprepare Signed-off-by: Swati Gupta mig cleanup Signed-off-by: Swati Gupta make applyconfig checksum compatible Signed-off-by: Swati Gupta --- api/nvidia.com/resource/v1beta1/gpuconfig.go | 20 ++------------------ api/nvidia.com/resource/v1beta1/migconfig.go | 17 ++--------------- cmd/gpu-kubelet-plugin/device_state.go | 20 +++++++++++++++----- 3 files changed, 19 insertions(+), 38 deletions(-) diff --git a/api/nvidia.com/resource/v1beta1/gpuconfig.go b/api/nvidia.com/resource/v1beta1/gpuconfig.go index 0e6c45342..4b39011cb 100644 --- a/api/nvidia.com/resource/v1beta1/gpuconfig.go +++ b/api/nvidia.com/resource/v1beta1/gpuconfig.go @@ -33,34 +33,18 @@ type GpuConfig struct { // DefaultGpuConfig provides the default GPU configuration. func DefaultGpuConfig() *GpuConfig { - config := &GpuConfig{ + return &GpuConfig{ TypeMeta: metav1.TypeMeta{ APIVersion: GroupName + "/" + Version, Kind: GpuConfigKind, }, } - - if featuregates.Enabled(featuregates.TimeSlicingSettings) { - config.Sharing = &GpuSharing{ - Strategy: TimeSlicingStrategy, - TimeSlicingConfig: &TimeSlicingConfig{ - Interval: ptr.To(DefaultTimeSlice), - }, - } - } - - return config } // Normalize updates a GpuConfig config with implied default values based on other settings. 
func (c *GpuConfig) Normalize() error { if c.Sharing == nil { - if !featuregates.Enabled(featuregates.TimeSlicingSettings) { - return nil - } - c.Sharing = &GpuSharing{ - Strategy: TimeSlicingStrategy, - } + return nil } if featuregates.Enabled(featuregates.TimeSlicingSettings) { diff --git a/api/nvidia.com/resource/v1beta1/migconfig.go b/api/nvidia.com/resource/v1beta1/migconfig.go index ab9e9d41e..00b7d5807 100644 --- a/api/nvidia.com/resource/v1beta1/migconfig.go +++ b/api/nvidia.com/resource/v1beta1/migconfig.go @@ -32,31 +32,18 @@ type MigDeviceConfig struct { // DefaultMigDeviceConfig provides the default Mig Device configuration. func DefaultMigDeviceConfig() *MigDeviceConfig { - config := &MigDeviceConfig{ + return &MigDeviceConfig{ TypeMeta: metav1.TypeMeta{ APIVersion: GroupName + "/" + Version, Kind: MigDeviceConfigKind, }, } - - if featuregates.Enabled(featuregates.TimeSlicingSettings) { - config.Sharing = &MigDeviceSharing{ - Strategy: TimeSlicingStrategy, - } - } - - return config } // Normalize updates a MigDeviceConfig config with implied default values based on other settings. 
func (c *MigDeviceConfig) Normalize() error { if c.Sharing == nil { - if !featuregates.Enabled(featuregates.TimeSlicingSettings) { - return nil - } - c.Sharing = &MigDeviceSharing{ - Strategy: TimeSlicingStrategy, - } + return nil } if featuregates.Enabled(featuregates.MPSSupport) { diff --git a/cmd/gpu-kubelet-plugin/device_state.go b/cmd/gpu-kubelet-plugin/device_state.go index 469069104..b439516f0 100644 --- a/cmd/gpu-kubelet-plugin/device_state.go +++ b/cmd/gpu-kubelet-plugin/device_state.go @@ -30,6 +30,7 @@ import ( "sync" "time" + "github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/pmezard/go-difflib/difflib" resourceapi "k8s.io/api/resource/v1" "k8s.io/apimachinery/pkg/runtime" @@ -37,6 +38,7 @@ import ( "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" cperrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" + "k8s.io/utils/ptr" cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" "github.com/sirupsen/logrus" @@ -57,6 +59,7 @@ type DeviceConfigState struct { MpsControlDaemonID string `json:"mpsControlDaemonID"` Config configapi.Interface `json:"-"` // don't serialize this. containerEdits *cdiapi.ContainerEdits + TimeSliceApplied *bool `json:"timeSliceApplied,omitempty"` } type DeviceState struct { @@ -940,12 +943,18 @@ func (s *DeviceState) unprepareDevices(ctx context.Context, claimUID string, dev return fmt.Errorf("error stopping MPS control daemon: %w", err) } } - - // Go back to default time-slicing for all full GPUs. - if featuregates.Enabled(featuregates.TimeSlicingSettings) { - tsc := configapi.DefaultGpuConfig().Sharing.TimeSlicingConfig + // Reset when time-slicing was applied at prepare (true), or when the + // checkpoint predates timeSliceApplied (nil — legacy implicit time-slicing). 
+ if featuregates.Enabled(featuregates.TimeSlicingSettings) && + ptr.Deref(group.ConfigState.TimeSliceApplied, true) { + defaultInterval := configapi.DefaultTimeSlice + tsc := &configapi.TimeSlicingConfig{Interval: &defaultInterval} if err := s.tsManager.SetTimeSlice(group.Devices.GpuUUIDs(), tsc); err != nil { - return fmt.Errorf("error setting timeslice for devices: %w", err) + if err == nvml.ERROR_NOT_SUPPORTED { + klog.Warningf("Unprepare: skip resetting time-slice policy for devices: %v", err) + } else { + return fmt.Errorf("error setting timeslice for devices: %w", err) + } } } @@ -1062,6 +1071,7 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S if err != nil { return nil, fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err) } + configState.TimeSliceApplied = ptr.To(true) } }