Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions cmd/gpu-kubelet-plugin/deviceinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ import (
"k8s.io/utils/ptr"
)

const compatibilityNumaNodeAttribute resourceapi.QualifiedName = "dra.net/numaNode"

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@enoodle instead of adding attributes with a custom prefix, better to use derived attributes feature that is proposed. Also, if we decide to publish an attribute for numaNode, it will be a list attribute to support modern coherent memory systems.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

currently, dra-net, dra-cpu and dra-sriov-vf all seem to use same prefix to advertise this attribute. This could be an intermittent solution until the derived attributes and list-attributes features are GA to allow use cases where topology alignment using numaNode make sense. @varunrsekar WDYT?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’m not a fan of the short-term solution. GPU DRA driver does not own the “dra.net/numaNode” attribute. If anything, we should wait for this KEP kubernetes/enhancements#6073 to be merged and use the standardized attribute under the resource.kubernetes.io domain.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i see that the standard attribute is now considered for 1.37. Thought there was a push back on this in favor of derived attributes feature which would have been only alpha in 1.37. @enoodle if kubernetes/enhancements#6073 is approved, then we can immediately use the same prefix in our driver. We have to propose same with the other drivers too.


// Represents a specific, full, physical GPU device.
type GpuInfo struct {
UUID string `json:"uuid"`
Expand All @@ -44,6 +46,7 @@ type GpuInfo struct {
pciBusID string
pciBusIDAttr *deviceattribute.DeviceAttribute
pcieRootAttr *deviceattribute.DeviceAttribute
numaNode *int
migProfiles []*MigProfileInfo
addressingMode *string

Expand Down Expand Up @@ -184,9 +187,7 @@ func (d *GpuInfo) Attributes() map[resourceapi.QualifiedName]resourceapi.DeviceA
attrs[d.pcieRootAttr.Name] = d.pcieRootAttr.Value
}

if d.pciBusIDAttr != nil {
attrs[d.pciBusIDAttr.Name] = d.pciBusIDAttr.Value
}
addCompatibilityNumaNodeAttribute(attrs, d.numaNode)

if d.addressingMode != nil {
attrs["addressingMode"] = resourceapi.DeviceAttribute{
Expand Down Expand Up @@ -272,5 +273,16 @@ func (d *VfioDeviceInfo) GetDevice() resourceapi.Device {
device.Attributes[d.pcieRootAttr.Name] = d.pcieRootAttr.Value
}

addCompatibilityNumaNodeAttribute(device.Attributes, &d.numaNode)

return device
}

func addCompatibilityNumaNodeAttribute(attrs map[resourceapi.QualifiedName]resourceapi.DeviceAttribute, numaNode *int) {
if numaNode == nil || *numaNode < 0 {
return
}
attrs[compatibilityNumaNodeAttribute] = resourceapi.DeviceAttribute{
IntValue: ptr.To(int64(*numaNode)),
}
}
75 changes: 75 additions & 0 deletions cmd/gpu-kubelet-plugin/deviceinfo_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
Copyright The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
"testing"

resourceapi "k8s.io/api/resource/v1"
"k8s.io/utils/ptr"

"github.com/stretchr/testify/require"
)

func newTestGpuInfo(numaNode *int) *GpuInfo {
return &GpuInfo{
UUID: "GPU-test",
minor: 0,
productName: "NVIDIA Test GPU",
brand: "NVIDIA",
architecture: "Test",
cudaComputeCapability: "9.0",
driverVersion: "580.0.0",
cudaDriverVersion: "13.0",
numaNode: numaNode,
}
}

func requireNumaNodeAttribute(t *testing.T, attrs map[resourceapi.QualifiedName]resourceapi.DeviceAttribute, expected int64) {
t.Helper()

attr, ok := attrs[compatibilityNumaNodeAttribute]
require.True(t, ok)
require.NotNil(t, attr.IntValue)
require.Equal(t, expected, *attr.IntValue)
}

func TestGpuInfoAttributesIncludeCompatibilityNumaNode(t *testing.T) {
gpu := newTestGpuInfo(ptr.To(1))

requireNumaNodeAttribute(t, gpu.Attributes(), 1)
}

func TestCommonMigAttributesIncludeCompatibilityNumaNode(t *testing.T) {
parent := newTestGpuInfo(ptr.To(2))

requireNumaNodeAttribute(t, CommonAttributesMig(parent, "1g.10gb"), 2)
}

func TestVfioDeviceIncludesCompatibilityNumaNode(t *testing.T) {
vfio := &VfioDeviceInfo{
UUID: "vfio-test",
deviceID: "0x1234",
vendorID: "0x10de",
index: 0,
productName: "NVIDIA Test GPU",
numaNode: 3,
addressableMemoryBytes: 1024,
}

requireNumaNodeAttribute(t, vfio.GetDevice().Attributes, 3)
}
2 changes: 2 additions & 0 deletions cmd/gpu-kubelet-plugin/mig.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ func CommonAttributesMig(parent *GpuInfo, profileName string) map[resourceapi.Qu
attrs[parent.pcieRootAttr.Name] = parent.pcieRootAttr.Value
}

addCompatibilityNumaNodeAttribute(attrs, parent.numaNode)

return attrs
}

Expand Down
33 changes: 33 additions & 0 deletions cmd/gpu-kubelet-plugin/nvlib.go
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,11 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error)
return nil, fmt.Errorf("error getting PCI bus ID for device %d: %w", index, err)
}

numaNode, err := l.discoverNumaNode(pciBusID, device.GetNumaNodeId)
if err != nil {
return nil, fmt.Errorf("error getting NUMA node ID for device %d: %w", index, err)
}

// Get the memory-addressing mode supported by the device.
// On coherent-memory systems, the possible modes are:
// - HMM (Hardware Memory Management)
Expand Down Expand Up @@ -611,13 +616,41 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error)
pciBusID: pciBusID,
pciBusIDAttr: pciBusIDAttr,
pcieRootAttr: pcieRootAttr,
numaNode: numaNode,
migProfiles: migProfiles,
addressingMode: addressingMode,
}

return gpuInfo, nil
}

func (l deviceLib) discoverNumaNode(pciBusID string, getNVMLNumaNode func() (int, nvml.Return)) (*int, error) {
if node, ret := getNVMLNumaNode(); ret == nvml.SUCCESS {
if node < 0 {
return nil, nil
}
return &node, nil
} else if ret != nvml.ERROR_NOT_SUPPORTED && ret != nvml.ERROR_FUNCTION_NOT_FOUND && ret != nvml.ERROR_NOT_FOUND {
return nil, fmt.Errorf("nvmlDeviceGetNumaNodeId returned %v", ret)
} else {
klog.V(4).Infof("NVML NUMA node ID unavailable for PCI bus ID %s, falling back to PCI sysfs: %v", pciBusID, ret)
}

if l.nvpci == nil {
return nil, nil
}

gpu, err := l.nvpci.GetGPUByPciBusID(pciBusID)
if err != nil {
klog.Warningf("error getting PCI device for PCI bus ID %s, continuing without NUMA node attribute: %v", pciBusID, err)
return nil, nil
}
if gpu == nil || gpu.NumaNode < 0 {
return nil, nil
}
return &gpu.NumaNode, nil
}

func (l deviceLib) enumerateGpuVfioDevices(perGPUAllocatable *PerGPUAllocatableDevices) error {
// Discover PCI devices.
gpuPciDevices, err := l.nvpci.GetGPUs()
Expand Down
72 changes: 72 additions & 0 deletions cmd/gpu-kubelet-plugin/nvlib_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
Copyright The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
"testing"

"github.com/NVIDIA/go-nvlib/pkg/nvpci"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/stretchr/testify/require"
)

func TestDiscoverNumaNodeUsesNVMLWhenAvailable(t *testing.T) {
lib := deviceLib{}

numaNode, err := lib.discoverNumaNode("0000:9f:00.0", func() (int, nvml.Return) {
return 1, nvml.SUCCESS
})

require.NoError(t, err)
require.NotNil(t, numaNode)
require.Equal(t, 1, *numaNode)
}

func TestDiscoverNumaNodeFallsBackToPCIWhenNVMLUnsupported(t *testing.T) {
pci := &nvpci.InterfaceMock{
GetGPUByPciBusIDFunc: func(pciBusID string) (*nvpci.NvidiaPCIDevice, error) {
require.Equal(t, "0000:9f:00.0", pciBusID)
return &nvpci.NvidiaPCIDevice{NumaNode: 2}, nil
},
}
lib := deviceLib{nvpci: pci}

numaNode, err := lib.discoverNumaNode("0000:9f:00.0", func() (int, nvml.Return) {
return 0, nvml.ERROR_NOT_SUPPORTED
})

require.NoError(t, err)
require.NotNil(t, numaNode)
require.Equal(t, 2, *numaNode)
require.Len(t, pci.GetGPUByPciBusIDCalls(), 1)
}

func TestDiscoverNumaNodeOmitsUnknownPCINumaNode(t *testing.T) {
pci := &nvpci.InterfaceMock{
GetGPUByPciBusIDFunc: func(string) (*nvpci.NvidiaPCIDevice, error) {
return &nvpci.NvidiaPCIDevice{NumaNode: -1}, nil
},
}
lib := deviceLib{nvpci: pci}

numaNode, err := lib.discoverNumaNode("0000:9f:00.0", func() (int, nvml.Return) {
return 0, nvml.ERROR_NOT_SUPPORTED
})

require.NoError(t, err)
require.Nil(t, numaNode)
}