Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/gpu-kubelet-plugin/deviceinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ type VfioDeviceInfo struct {
iommuGroup int
iommuFDEnabled bool
addressableMemoryBytes uint64
preConfigureDriver string
}

// CanonicalName returns the name used for device announcement (in ResourceSlice
Expand Down
10 changes: 10 additions & 0 deletions cmd/gpu-kubelet-plugin/nvlib.go
Original file line number Diff line number Diff line change
Expand Up @@ -621,6 +621,16 @@ func (l deviceLib) enumerateGpuVfioDevices(perGPUAllocatable *PerGPUAllocatableD
for idx, pci := range gpuPciDevices {
klog.Infof("Adding VFIO device for discovered GPU PCI device: %s", pci.Address)

driver, err := getDriver(pciDevicesPath, pci.Address)
if err != nil {
klog.Warningf("Skipping VFIO device %s: unable to read driver: %v", pci.Address, err)
continue
}
if driver != vfioPciDriver {
klog.Infof("Skipping VFIO device %s: bound to %q, not %q", pci.Address, driver, vfioPciDriver)
continue
}

parent := perGPUAllocatable.GetGPUDeviceByPCIBusID(pci.Address)
if parent != nil && !parent.Gpu.vfioEnabled {
klog.Infof("Skipping VFIO device for discovered GPU PCI device: %s, vfio is not enabled", pci.Address)
Expand Down
44 changes: 29 additions & 15 deletions cmd/gpu-kubelet-plugin/vfio-cdi.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package main

import (
"fmt"
"os"
"path/filepath"
"strings"

Expand Down Expand Up @@ -59,7 +60,13 @@ func (h *vfioCDIHandler) GetCommonEdits(enableAPIDevice bool, preferIommuFD bool
},
}

// IOMMU API device is not requested. Exit early.
// Always include /dev/vfio/vfio for legacy VFIO — it's required by
// libvirt to detect VFIO support. enableAPIDevice controls whether the
// preferred IOMMU backend device is also added.
edits.DeviceNodes = append(edits.DeviceNodes, &cdispec.DeviceNode{
Path: filepath.Join(vfioDevicesRoot, "vfio"),
})

if !enableAPIDevice {
return edits, nil
}
Expand All @@ -69,10 +76,6 @@ func (h *vfioCDIHandler) GetCommonEdits(enableAPIDevice bool, preferIommuFD bool
edits.DeviceNodes = append(edits.DeviceNodes, &cdispec.DeviceNode{
Path: iommuDevicePath,
})
} else {
edits.DeviceNodes = append(edits.DeviceNodes, &cdispec.DeviceNode{
Path: filepath.Join(vfioDevicesRoot, "vfio"),
})
}

return edits, nil
Expand All @@ -85,27 +88,29 @@ func (h *vfioCDIHandler) GetCommonEdits(enableAPIDevice bool, preferIommuFD bool
// We automatically assume we want the legacy device if PreferIommuFD policy is not selected.
// If more policies are added in the future, the handler needs to be enhanced to support them.
func (h *vfioCDIHandler) GetDeviceSpecsByPCIBusID(pciBusID string, preferIommuFD bool) ([]cdispec.Device, error) {
nvpci := nvpci.New()
pciDeviceInfo, err := nvpci.GetGPUByPciBusID(pciBusID)
if err != nil {
return nil, fmt.Errorf("error getting PCI device info for GPU %q: %w", pciBusID, err)
}

devNodes := make([]*cdispec.DeviceNode, 0)

if preferIommuFD && h.iommuFDEnabled {
// The IOMMUFD cdev is located at /dev/vfio/devices/<vfioX> and is
// expected to be available if IOMMUFD is enabled on the node and the GPU is
// bound to the vfio driver.
nvpci := nvpci.New()
pciDeviceInfo, err := nvpci.GetGPUByPciBusID(pciBusID)
if err != nil {
return nil, fmt.Errorf("error getting PCI device info for GPU %q: %w", pciBusID, err)
}
if !strings.HasPrefix(pciDeviceInfo.IommuFD, "vfio") {
return nil, fmt.Errorf("missing iommufd cdev for GPU %q", pciDeviceInfo.Address)
}
devNodes = append(devNodes, &cdispec.DeviceNode{
Path: filepath.Join(vfioDevicesPath, pciDeviceInfo.IommuFD),
})
} else {
// Read IOMMU group directly from sysfs — works for GPUs on any
// driver including vfio-pci (nvpci may not find vfio-bound GPUs).
iommuGroup, err := getIommuGroupFromSysfs(pciBusID)
if err != nil {
return nil, fmt.Errorf("error getting IOMMU group for GPU %q: %w", pciBusID, err)
}
devNodes = append(devNodes, &cdispec.DeviceNode{
Path: filepath.Join(vfioDevicesRoot, fmt.Sprintf("%d", pciDeviceInfo.IommuGroup)),
Path: filepath.Join(vfioDevicesRoot, iommuGroup),
})
}
devSpecs := []cdispec.Device{
Expand All @@ -117,3 +122,12 @@ func (h *vfioCDIHandler) GetDeviceSpecsByPCIBusID(pciBusID string, preferIommuFD
}
return devSpecs, nil
}

// getIommuGroupFromSysfs resolves the "iommu_group" symlink found under
// pciDevicesPath/<pciBusID> and returns the final path element of the link
// target (the IOMMU group identifier) as a string.
//
// An error is returned when the symlink cannot be read, for example because
// the device directory or the link itself does not exist.
func getIommuGroupFromSysfs(pciBusID string) (string, error) {
	groupPath, err := os.Readlink(filepath.Join(pciDevicesPath, pciBusID, "iommu_group"))
	if err == nil {
		// Only the last component of the link target is of interest.
		return filepath.Base(groupPath), nil
	}
	return "", fmt.Errorf("failed to read IOMMU group symlink for %s: %w", pciBusID, err)
}
78 changes: 63 additions & 15 deletions cmd/gpu-kubelet-plugin/vfio-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"

Expand Down Expand Up @@ -147,6 +148,7 @@ func (vm *VfioPciManager) Configure(ctx context.Context, info *VfioDeviceInfo) e
if err != nil {
return fmt.Errorf("error getting driver details for GPU %q: %w", info.PciBusID, err)
}
info.preConfigureDriver = driver

// Skip if the GPU is already bound to the vfio-pci driver.
if driver == vm.driver {
Expand Down Expand Up @@ -192,6 +194,14 @@ func (vm *VfioPciManager) Unconfigure(ctx context.Context, info *VfioDeviceInfo)
return nil
}

// If the GPU was pre-bound to vfio-pci (either tracked from Configure
// or detected via kernel cmdline vfio-pci.ids), leave it on vfio-pci.
// Rebinding to nvidia on NVLink systems hangs indefinitely.
if info.preConfigureDriver == vfioPciDriver || isVfioPciPrebound(info.deviceID) {
klog.Infof("GPU %s was pre-bound to vfio-pci, leaving on vfio-pci", info.PciBusID)
return nil
}

// Change the GPU driver to nvidia.
err := vm.changeDriver(info.PciBusID, nvidiaDriver)
if err != nil {
Expand Down Expand Up @@ -295,19 +305,19 @@ func (vm *VfioPciManager) disableGPUPersistenceMode(pciAddress string) error {

// Check if the vfio_pci module is loaded.
func checkVfioPCIModuleLoaded() (bool, error) {
f, err := os.Stat(filepath.Join(hostRoot, sysModulePath, vfioPciModule))
if err != nil {
if os.IsNotExist(err) {
return false, nil
for _, root := range []string{hostRoot, ""} {
f, err := os.Stat(filepath.Join(root, sysModulePath, vfioPciModule))
if err != nil {
if os.IsNotExist(err) {
continue
}
return false, fmt.Errorf("failed to check if vfio_pci module is loaded: %w", err)
}
if f.IsDir() {
return true, nil
}
return false, fmt.Errorf("failed to check if vfio_pci module is loaded: %w", err)
}

if !f.IsDir() {
return false, nil
}

return true, nil
return false, nil
}

// Load the vfio_pci module.
Expand All @@ -322,10 +332,20 @@ func loadVfioPciModule() error {

// Check if IOMMU is enabled.
func checkIommuEnabled() (bool, error) {
f, err := os.Open(filepath.Join(hostRoot, kernelIommuGroupPath))
if os.IsNotExist(err) {
return false, nil
for _, root := range []string{hostRoot, ""} {
enabled, err := checkIommuEnabledAt(filepath.Join(root, kernelIommuGroupPath))
if err != nil {
continue
}
if enabled {
return true, nil
}
}
return false, nil
}

func checkIommuEnabledAt(path string) (bool, error) {
f, err := os.Open(path)
if err != nil {
return false, err
}
Expand All @@ -337,10 +357,38 @@ func checkIommuEnabled() (bool, error) {
if err != nil {
return false, err
}

return true, nil
}

// isVfioPciPrebound checks if a device ID is in the kernel cmdline
// vfio-pci.ids parameter, indicating the GPU was pre-bound to vfio-pci
// at boot and should not be rebound to nvidia during Unconfigure.
//
// deviceID is the PCI device ID in hex, with or without a leading "0x".
// The command line is read from <hostRoot>/proc/cmdline. The check is best
// effort: an empty deviceID or an unreadable cmdline yields false.
//
// Only the device field of each entry is compared; the vendor and any
// trailing fields are ignored.
func isVfioPciPrebound(deviceID string) bool {
	if deviceID == "" {
		return false
	}
	cmdline, err := os.ReadFile(filepath.Join(hostRoot, "/proc/cmdline"))
	if err != nil {
		return false
	}
	// Strip 0x prefix for comparison (vfio-pci.ids uses bare hex).
	id := strings.TrimPrefix(deviceID, "0x")
	for _, param := range strings.Fields(string(cmdline)) {
		// The kernel treats '-' and '_' as equivalent in parameter names,
		// so accept both spellings of the module name.
		var ids string
		switch {
		case strings.HasPrefix(param, "vfio-pci.ids="):
			ids = strings.TrimPrefix(param, "vfio-pci.ids=")
		case strings.HasPrefix(param, "vfio_pci.ids="):
			ids = strings.TrimPrefix(param, "vfio_pci.ids=")
		default:
			continue
		}
		for _, entry := range strings.Split(ids, ",") {
			// Entries are vendor:device, e.g. 10de:2330, optionally followed
			// by :subvendor:subdevice:class:class_mask. The device ID is
			// always the second field.
			parts := strings.Split(entry, ":")
			if len(parts) >= 2 && strings.EqualFold(parts[1], id) {
				return true
			}
		}
	}
	return false
}

// Check if IOMMUFD is enabled.
// We correlate the IOMMUFD support with the presence of the /dev/iommu API device.
func checkIommuFDEnabled() (bool, error) {
Expand Down