Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 2 additions & 18 deletions api/nvidia.com/resource/v1beta1/gpuconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,34 +33,18 @@ type GpuConfig struct {

// DefaultGpuConfig provides the default GPU configuration.
func DefaultGpuConfig() *GpuConfig {
config := &GpuConfig{
return &GpuConfig{
TypeMeta: metav1.TypeMeta{
APIVersion: GroupName + "/" + Version,
Kind: GpuConfigKind,
},
}

if featuregates.Enabled(featuregates.TimeSlicingSettings) {
config.Sharing = &GpuSharing{
Strategy: TimeSlicingStrategy,
TimeSlicingConfig: &TimeSlicingConfig{
Interval: ptr.To(DefaultTimeSlice),
},
}
}

return config
}
Comment on lines 34 to 42

// Normalize fills in the implied default values of a GpuConfig based on its other settings.
func (c *GpuConfig) Normalize() error {
if c.Sharing == nil {
if !featuregates.Enabled(featuregates.TimeSlicingSettings) {
return nil
}
c.Sharing = &GpuSharing{
Strategy: TimeSlicingStrategy,
}
return nil
}

if featuregates.Enabled(featuregates.TimeSlicingSettings) {
Expand Down
17 changes: 2 additions & 15 deletions api/nvidia.com/resource/v1beta1/migconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,31 +32,18 @@ type MigDeviceConfig struct {

// DefaultMigDeviceConfig provides the default MIG device configuration.
func DefaultMigDeviceConfig() *MigDeviceConfig {
config := &MigDeviceConfig{
return &MigDeviceConfig{
TypeMeta: metav1.TypeMeta{
APIVersion: GroupName + "/" + Version,
Kind: MigDeviceConfigKind,
},
}

if featuregates.Enabled(featuregates.TimeSlicingSettings) {
config.Sharing = &MigDeviceSharing{
Strategy: TimeSlicingStrategy,
}
}

return config
}

// Normalize fills in the implied default values of a MigDeviceConfig based on its other settings.
func (c *MigDeviceConfig) Normalize() error {
if c.Sharing == nil {
if !featuregates.Enabled(featuregates.TimeSlicingSettings) {
return nil
}
c.Sharing = &MigDeviceSharing{
Strategy: TimeSlicingStrategy,
}
return nil
}

if featuregates.Enabled(featuregates.MPSSupport) {
Expand Down
20 changes: 15 additions & 5 deletions cmd/gpu-kubelet-plugin/device_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,15 @@ import (
"sync"
"time"

"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/pmezard/go-difflib/difflib"
resourceapi "k8s.io/api/resource/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/dynamic-resource-allocation/kubeletplugin"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
cperrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
"k8s.io/utils/ptr"
cdiapi "tags.cncf.io/container-device-interface/pkg/cdi"

"github.com/sirupsen/logrus"
Expand All @@ -57,6 +59,7 @@ type DeviceConfigState struct {
MpsControlDaemonID string `json:"mpsControlDaemonID"`
Config configapi.Interface `json:"-"` // don't serialize this.
containerEdits *cdiapi.ContainerEdits
TimeSliceApplied *bool `json:"timeSliceApplied,omitempty"`
}

type DeviceState struct {
Expand Down Expand Up @@ -940,12 +943,18 @@ func (s *DeviceState) unprepareDevices(ctx context.Context, claimUID string, dev
return fmt.Errorf("error stopping MPS control daemon: %w", err)
}
}

// Go back to default time-slicing for all full GPUs.
if featuregates.Enabled(featuregates.TimeSlicingSettings) {
tsc := configapi.DefaultGpuConfig().Sharing.TimeSlicingConfig
// Reset when time-slicing was applied at prepare (true), or when the
// checkpoint predates timeSliceApplied (nil — legacy implicit time-slicing).
if featuregates.Enabled(featuregates.TimeSlicingSettings) &&
ptr.Deref(group.ConfigState.TimeSliceApplied, true) {
defaultInterval := configapi.DefaultTimeSlice
tsc := &configapi.TimeSlicingConfig{Interval: &defaultInterval}
if err := s.tsManager.SetTimeSlice(group.Devices.GpuUUIDs(), tsc); err != nil {
return fmt.Errorf("error setting timeslice for devices: %w", err)
if err == nvml.ERROR_NOT_SUPPORTED {
klog.Warningf("Unprepare: skip resetting time-slice policy for devices: %v", err)
} else {
return fmt.Errorf("error setting timeslice for devices: %w", err)
}
}
}

Expand Down Expand Up @@ -1062,6 +1071,7 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S
if err != nil {
return nil, fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
}
configState.TimeSliceApplied = ptr.To(true)
}
}

Expand Down