From 60a48d0b951d9712bda6a165b038ae7c315d7825 Mon Sep 17 00:00:00 2001 From: John Hull Date: Wed, 29 Apr 2026 21:55:13 -0500 Subject: [PATCH 1/5] Always include /dev/vfio/vfio in CDI spec, use sysfs for IOMMU group lookup --- cmd/gpu-kubelet-plugin/vfio-cdi.go | 40 ++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/cmd/gpu-kubelet-plugin/vfio-cdi.go b/cmd/gpu-kubelet-plugin/vfio-cdi.go index 8e5e4cdf9..08173c8b9 100644 --- a/cmd/gpu-kubelet-plugin/vfio-cdi.go +++ b/cmd/gpu-kubelet-plugin/vfio-cdi.go @@ -18,6 +18,7 @@ package main import ( "fmt" + "os" "path/filepath" "strings" @@ -59,7 +60,13 @@ func (h *vfioCDIHandler) GetCommonEdits(enableAPIDevice bool, preferIommuFD bool }, } - // IOMMU API device is not requested. Exit early. + // Always include /dev/vfio/vfio for legacy VFIO — it's required by + // libvirt to detect VFIO support. enableAPIDevice controls whether the + // preferred IOMMU backend device is also added. + edits.DeviceNodes = append(edits.DeviceNodes, &cdispec.DeviceNode{ + Path: filepath.Join(vfioDevicesRoot, "vfio"), + }) + if !enableAPIDevice { return edits, nil } @@ -85,18 +92,14 @@ func (h *vfioCDIHandler) GetCommonEdits(enableAPIDevice bool, preferIommuFD bool // We automatically assume we want the legacy device if PreferIommuFD policy is not selected. // If more policies are added in the future, the handler needs to be enhanced to support them. func (h *vfioCDIHandler) GetDeviceSpecsByPCIBusID(pciBusID string, preferIommuFD bool) ([]cdispec.Device, error) { - nvpci := nvpci.New() - pciDeviceInfo, err := nvpci.GetGPUByPciBusID(pciBusID) - if err != nil { - return nil, fmt.Errorf("error getting PCI device info for GPU %q: %w", pciBusID, err) - } - devNodes := make([]*cdispec.DeviceNode, 0) if preferIommuFD && h.iommuFDEnabled { - // The IOMMUFD cdev is located at /dev/vfio/devices/ and is - // expected to be available if IOMMUFD is enabled on the node and the GPU is - // bound to the vfio driver. + nvpci := nvpci.New() + pciDeviceInfo, err := nvpci.GetGPUByPciBusID(pciBusID) + if err != nil { + return nil, fmt.Errorf("error getting PCI device info for GPU %q: %w", pciBusID, err) + } if !strings.HasPrefix(pciDeviceInfo.IommuFD, "vfio") { return nil, fmt.Errorf("missing iommufd cdev for GPU %q", pciDeviceInfo.Address) } @@ -104,8 +107,14 @@ func (h *vfioCDIHandler) GetDeviceSpecsByPCIBusID(pciBusID string, preferIommuFD Path: filepath.Join(vfioDevicesPath, pciDeviceInfo.IommuFD), }) } else { + // Read IOMMU group directly from sysfs — works for GPUs on any + // driver including vfio-pci (nvpci may not find vfio-bound GPUs). + iommuGroup, err := getIommuGroupFromSysfs(pciBusID) + if err != nil { + return nil, fmt.Errorf("error getting IOMMU group for GPU %q: %w", pciBusID, err) + } devNodes = append(devNodes, &cdispec.DeviceNode{ - Path: filepath.Join(vfioDevicesRoot, fmt.Sprintf("%d", pciDeviceInfo.IommuGroup)), + Path: filepath.Join(vfioDevicesRoot, iommuGroup), }) } devSpecs := []cdispec.Device{ @@ -117,3 +126,12 @@ func (h *vfioCDIHandler) GetDeviceSpecsByPCIBusID(pciBusID string, preferIommuFD } return devSpecs, nil } + +func getIommuGroupFromSysfs(pciBusID string) (string, error) { + iommuLink := filepath.Join(pciDevicesPath, pciBusID, "iommu_group") + target, err := os.Readlink(iommuLink) + if err != nil { + return "", fmt.Errorf("failed to read IOMMU group symlink for %s: %w", pciBusID, err) + } + return filepath.Base(target), nil +} From 9ac21464f07a8f3d38c2716405bfdb75978a5ada Mon Sep 17 00:00:00 2001 From: John Hull Date: Thu, 30 Apr 2026 10:07:00 -0500 Subject: [PATCH 2/5] Filter VFIO device discovery to only include GPUs bound to vfio-pci Previously, enumerateGpuVfioDevices treated any NVIDIA GPU not on the nvidia driver as a VFIO candidate. This caused driverless GPUs (stuck after a failed unbind) and nvidia-bound GPUs to be advertised in the ResourceSlice as allocatable VFIO devices. When the scheduler picked one, the prepare would fail or hang trying to unbind from nvidia. Check the actual kernel driver binding via sysfs before adding a GPU to the VFIO device list. Only GPUs currently bound to vfio-pci are advertised. --- cmd/gpu-kubelet-plugin/nvlib.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cmd/gpu-kubelet-plugin/nvlib.go b/cmd/gpu-kubelet-plugin/nvlib.go index 8e596f350..cc77ef726 100644 --- a/cmd/gpu-kubelet-plugin/nvlib.go +++ b/cmd/gpu-kubelet-plugin/nvlib.go @@ -621,6 +621,16 @@ func (l deviceLib) enumerateGpuVfioDevices(perGPUAllocatable *PerGPUAllocatableD for idx, pci := range gpuPciDevices { klog.Infof("Adding VFIO device for discovered GPU PCI device: %s", pci.Address) + driver, err := getDriver(pciDevicesPath, pci.Address) + if err != nil { + klog.Warningf("Skipping VFIO device %s: unable to read driver: %v", pci.Address, err) + continue + } + if driver != vfioPciDriver { + klog.Infof("Skipping VFIO device %s: bound to %q, not %q", pci.Address, driver, vfioPciDriver) + continue + } + parent := perGPUAllocatable.GetGPUDeviceByPCIBusID(pci.Address) if parent != nil && !parent.Gpu.vfioEnabled { klog.Infof("Skipping VFIO device for discovered GPU PCI device: %s, vfio is not enabled", pci.Address) From 847beede4a0766b5c69b36301db314068163843c Mon Sep 17 00:00:00 2001 From: John Hull Date: Thu, 30 Apr 2026 12:46:02 -0500 Subject: [PATCH 3/5] Skip Unconfigure rebind for GPUs pre-bound to vfio-pci Track the driver binding before Configure and check it in Unconfigure. If the GPU was already on vfio-pci (pre-bound at boot via kernel cmdline), leave it on vfio-pci instead of rebinding to nvidia. On H100 SXM5 systems with NVLink, rebinding to nvidia hangs indefinitely during fabric reconfiguration. --- cmd/gpu-kubelet-plugin/deviceinfo.go | 1 + cmd/gpu-kubelet-plugin/vfio-device.go | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/cmd/gpu-kubelet-plugin/deviceinfo.go b/cmd/gpu-kubelet-plugin/deviceinfo.go index 01a13e917..1bef5643e 100644 --- a/cmd/gpu-kubelet-plugin/deviceinfo.go +++ b/cmd/gpu-kubelet-plugin/deviceinfo.go @@ -95,6 +95,7 @@ type VfioDeviceInfo struct { iommuGroup int iommuFDEnabled bool addressableMemoryBytes uint64 + preConfigureDriver string } // CanonicalName returns the nameused for device announcement (in ResourceSlice diff --git a/cmd/gpu-kubelet-plugin/vfio-device.go b/cmd/gpu-kubelet-plugin/vfio-device.go index 54c935daa..1ce223262 100644 --- a/cmd/gpu-kubelet-plugin/vfio-device.go +++ b/cmd/gpu-kubelet-plugin/vfio-device.go @@ -147,6 +147,7 @@ func (vm *VfioPciManager) Configure(ctx context.Context, info *VfioDeviceInfo) e if err != nil { return fmt.Errorf("error getting driver details for GPU %q: %w", info.PciBusID, err) } + info.preConfigureDriver = driver // Skip if the GPU is already bound to the vfio-pci driver. if driver == vm.driver { @@ -192,6 +193,14 @@ func (vm *VfioPciManager) Unconfigure(ctx context.Context, info *VfioDeviceInfo) return nil } + // If the GPU was already on vfio-pci before Configure (pre-bound at + // boot), leave it on vfio-pci. Rebinding to nvidia on NVLink systems + // hangs indefinitely during fabric reconfiguration. + if info.preConfigureDriver == vfioPciDriver { + klog.Infof("GPU %s was pre-bound to vfio-pci, leaving on vfio-pci", info.PciBusID) + return nil + } + // Change the GPU driver to nvidia. err := vm.changeDriver(info.PciBusID, nvidiaDriver) if err != nil { From 1efeef54e3d566dda3defea15e430f41fdf6600d Mon Sep 17 00:00:00 2001 From: John Hull Date: Thu, 30 Apr 2026 14:21:22 -0500 Subject: [PATCH 4/5] Fix vfio_pci/IOMMU sysfs checks in containers The VfioPciManager checks /sys/module/vfio_pci and /sys/kernel/ iommu_groups to verify module loading and IOMMU support. In containers where /host-root is bind-mounted from host /, the container's own /sys mount doesn't expose host sysfs at /host-root/sys. Fall back to checking the unprefixed sysfs path when the host-root prefixed path doesn't exist. --- cmd/gpu-kubelet-plugin/vfio-device.go | 88 ++++++++++++++++++--------- 1 file changed, 60 insertions(+), 28 deletions(-) diff --git a/cmd/gpu-kubelet-plugin/vfio-device.go b/cmd/gpu-kubelet-plugin/vfio-device.go index 1ce223262..0d349716e 100644 --- a/cmd/gpu-kubelet-plugin/vfio-device.go +++ b/cmd/gpu-kubelet-plugin/vfio-device.go @@ -23,6 +23,7 @@ import ( "os" "os/exec" "path/filepath" + "strings" "sync" "time" @@ -193,10 +194,10 @@ func (vm *VfioPciManager) Unconfigure(ctx context.Context, info *VfioDeviceInfo) return nil } - // If the GPU was already on vfio-pci before Configure (pre-bound at - // boot), leave it on vfio-pci. Rebinding to nvidia on NVLink systems - // hangs indefinitely during fabric reconfiguration. - if info.preConfigureDriver == vfioPciDriver { + // If the GPU was pre-bound to vfio-pci (either tracked from Configure + // or detected via kernel cmdline vfio-pci.ids), leave it on vfio-pci. + // Rebinding to nvidia on NVLink systems hangs indefinitely. + if info.preConfigureDriver == vfioPciDriver || isVfioPciPrebound(info.deviceID) { klog.Infof("GPU %s was pre-bound to vfio-pci, leaving on vfio-pci", info.PciBusID) return nil } @@ -304,19 +305,19 @@ func (vm *VfioPciManager) disableGPUPersistenceMode(pciAddress string) error { // Check if the vfio_pci module is loaded. func checkVfioPCIModuleLoaded() (bool, error) { - f, err := os.Stat(filepath.Join(hostRoot, sysModulePath, vfioPciModule)) - if err != nil { - if os.IsNotExist(err) { - return false, nil + for _, root := range []string{hostRoot, ""} { + f, err := os.Stat(filepath.Join(root, sysModulePath, vfioPciModule)) + if err != nil { + if os.IsNotExist(err) { + continue + } + return false, fmt.Errorf("failed to check if vfio_pci module is loaded: %w", err) + } + if f.IsDir() { + return true, nil } - return false, fmt.Errorf("failed to check if vfio_pci module is loaded: %w", err) - } - - if !f.IsDir() { - return false, nil } - - return true, nil + return false, nil } // Load the vfio_pci module. @@ -331,23 +332,54 @@ func loadVfioPciModule() error { // Check if IOMMU is enabled. func checkIommuEnabled() (bool, error) { - f, err := os.Open(filepath.Join(hostRoot, kernelIommuGroupPath)) - if os.IsNotExist(err) { - return false, nil - } - if err != nil { - return false, err + for _, root := range []string{hostRoot, ""} { + f, err := os.Open(filepath.Join(root, kernelIommuGroupPath)) + if os.IsNotExist(err) { + continue + } + if err != nil { + continue + } + defer f.Close() + _, err = f.Readdirnames(1) + if err == io.EOF { + continue + } + if err != nil { + return false, err + } + return true, nil } - defer f.Close() - _, err = f.Readdirnames(1) - if err == io.EOF { - return false, nil + return false, nil +} + +// isVfioPciPrebound checks if a device ID is in the kernel cmdline +// vfio-pci.ids parameter, indicating the GPU was pre-bound to vfio-pci +// at boot and should not be rebound to nvidia during Unconfigure. +func isVfioPciPrebound(deviceID string) bool { + if deviceID == "" { + return false } + cmdline, err := os.ReadFile(filepath.Join(hostRoot, "/proc/cmdline")) if err != nil { - return false, err + return false } - - return true, nil + // Strip 0x prefix for comparison (vfio-pci.ids uses bare hex). + id := strings.TrimPrefix(deviceID, "0x") + for _, param := range strings.Fields(string(cmdline)) { + if !strings.HasPrefix(param, "vfio-pci.ids=") { + continue + } + ids := strings.TrimPrefix(param, "vfio-pci.ids=") + for _, entry := range strings.Split(ids, ",") { + // Entries are vendor:device, e.g. 10de:2330 + parts := strings.Split(entry, ":") + if len(parts) == 2 && strings.EqualFold(parts[1], id) { + return true + } + } + } + return false } // Check if IOMMUFD is enabled. From 71e8efdce35b19fbeec68769ecf6c58db6f9b9d9 Mon Sep 17 00:00:00 2001 From: John Hull Date: Fri, 1 May 2026 14:56:43 -0500 Subject: [PATCH 5/5] Fix duplicate /dev/vfio/vfio in CDI spec and defer in loop - Remove duplicate /dev/vfio/vfio append in GetCommonEdits else branch (already added unconditionally above) - Extract checkIommuEnabledAt to avoid defer f.Close() inside loop --- cmd/gpu-kubelet-plugin/vfio-cdi.go | 4 ---- cmd/gpu-kubelet-plugin/vfio-device.go | 31 ++++++++++++++++----------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/cmd/gpu-kubelet-plugin/vfio-cdi.go b/cmd/gpu-kubelet-plugin/vfio-cdi.go index 08173c8b9..fbde23b62 100644 --- a/cmd/gpu-kubelet-plugin/vfio-cdi.go +++ b/cmd/gpu-kubelet-plugin/vfio-cdi.go @@ -76,10 +76,6 @@ func (h *vfioCDIHandler) GetCommonEdits(enableAPIDevice bool, preferIommuFD bool edits.DeviceNodes = append(edits.DeviceNodes, &cdispec.DeviceNode{ Path: iommuDevicePath, }) - } else { - edits.DeviceNodes = append(edits.DeviceNodes, &cdispec.DeviceNode{ - Path: filepath.Join(vfioDevicesRoot, "vfio"), - }) } return edits, nil diff --git a/cmd/gpu-kubelet-plugin/vfio-device.go b/cmd/gpu-kubelet-plugin/vfio-device.go index 0d349716e..c21192ef9 100644 --- a/cmd/gpu-kubelet-plugin/vfio-device.go +++ b/cmd/gpu-kubelet-plugin/vfio-device.go @@ -333,26 +333,33 @@ func loadVfioPciModule() error { // Check if IOMMU is enabled. func checkIommuEnabled() (bool, error) { for _, root := range []string{hostRoot, ""} { - f, err := os.Open(filepath.Join(root, kernelIommuGroupPath)) - if os.IsNotExist(err) { - continue - } + enabled, err := checkIommuEnabledAt(filepath.Join(root, kernelIommuGroupPath)) if err != nil { continue } - defer f.Close() - _, err = f.Readdirnames(1) - if err == io.EOF { - continue - } - if err != nil { - return false, err + if enabled { + return true, nil } - return true, nil } return false, nil } +func checkIommuEnabledAt(path string) (bool, error) { + f, err := os.Open(path) + if err != nil { + return false, err + } + defer f.Close() + _, err = f.Readdirnames(1) + if err == io.EOF { + return false, nil + } + if err != nil { + return false, err + } + return true, nil +} + // isVfioPciPrebound checks if a device ID is in the kernel cmdline // vfio-pci.ids parameter, indicating the GPU was pre-bound to vfio-pci // at boot and should not be rebound to nvidia during Unconfigure.