diff --git a/cmd/gpu-kubelet-plugin/deviceinfo.go b/cmd/gpu-kubelet-plugin/deviceinfo.go index 01a13e917..1bef5643e 100644 --- a/cmd/gpu-kubelet-plugin/deviceinfo.go +++ b/cmd/gpu-kubelet-plugin/deviceinfo.go @@ -95,6 +95,7 @@ type VfioDeviceInfo struct { iommuGroup int iommuFDEnabled bool addressableMemoryBytes uint64 + preConfigureDriver string } // CanonicalName returns the nameused for device announcement (in ResourceSlice diff --git a/cmd/gpu-kubelet-plugin/nvlib.go b/cmd/gpu-kubelet-plugin/nvlib.go index 8e596f350..cc77ef726 100644 --- a/cmd/gpu-kubelet-plugin/nvlib.go +++ b/cmd/gpu-kubelet-plugin/nvlib.go @@ -621,6 +621,16 @@ func (l deviceLib) enumerateGpuVfioDevices(perGPUAllocatable *PerGPUAllocatableD for idx, pci := range gpuPciDevices { klog.Infof("Adding VFIO device for discovered GPU PCI device: %s", pci.Address) + driver, err := getDriver(pciDevicesPath, pci.Address) + if err != nil { + klog.Warningf("Skipping VFIO device %s: unable to read driver: %v", pci.Address, err) + continue + } + if driver != vfioPciDriver { + klog.Infof("Skipping VFIO device %s: bound to %q, not %q", pci.Address, driver, vfioPciDriver) + continue + } + parent := perGPUAllocatable.GetGPUDeviceByPCIBusID(pci.Address) if parent != nil && !parent.Gpu.vfioEnabled { klog.Infof("Skipping VFIO device for discovered GPU PCI device: %s, vfio is not enabled", pci.Address) diff --git a/cmd/gpu-kubelet-plugin/vfio-cdi.go b/cmd/gpu-kubelet-plugin/vfio-cdi.go index 8e5e4cdf9..fbde23b62 100644 --- a/cmd/gpu-kubelet-plugin/vfio-cdi.go +++ b/cmd/gpu-kubelet-plugin/vfio-cdi.go @@ -18,6 +18,7 @@ package main import ( "fmt" + "os" "path/filepath" "strings" @@ -59,7 +60,13 @@ func (h *vfioCDIHandler) GetCommonEdits(enableAPIDevice bool, preferIommuFD bool }, } - // IOMMU API device is not requested. Exit early. + // Always include /dev/vfio/vfio for legacy VFIO — it's required by + // libvirt to detect VFIO support. enableAPIDevice controls whether the + // preferred IOMMU backend device is also added. + edits.DeviceNodes = append(edits.DeviceNodes, &cdispec.DeviceNode{ + Path: filepath.Join(vfioDevicesRoot, "vfio"), + }) + if !enableAPIDevice { return edits, nil } @@ -69,10 +76,6 @@ func (h *vfioCDIHandler) GetCommonEdits(enableAPIDevice bool, preferIommuFD bool edits.DeviceNodes = append(edits.DeviceNodes, &cdispec.DeviceNode{ Path: iommuDevicePath, }) - } else { - edits.DeviceNodes = append(edits.DeviceNodes, &cdispec.DeviceNode{ - Path: filepath.Join(vfioDevicesRoot, "vfio"), - }) } return edits, nil @@ -85,18 +88,14 @@ func (h *vfioCDIHandler) GetCommonEdits(enableAPIDevice bool, preferIommuFD bool // We automatically assume we want the legacy device if PreferIommuFD policy is not selected. // If more policies are added in the future, the handler needs to be enhanced to support them. func (h *vfioCDIHandler) GetDeviceSpecsByPCIBusID(pciBusID string, preferIommuFD bool) ([]cdispec.Device, error) { - nvpci := nvpci.New() - pciDeviceInfo, err := nvpci.GetGPUByPciBusID(pciBusID) - if err != nil { - return nil, fmt.Errorf("error getting PCI device info for GPU %q: %w", pciBusID, err) - } - devNodes := make([]*cdispec.DeviceNode, 0) if preferIommuFD && h.iommuFDEnabled { - // The IOMMUFD cdev is located at /dev/vfio/devices/ and is - // expected to be available if IOMMUFD is enabled on the node and the GPU is - // bound to the vfio driver. + nvpci := nvpci.New() + pciDeviceInfo, err := nvpci.GetGPUByPciBusID(pciBusID) + if err != nil { + return nil, fmt.Errorf("error getting PCI device info for GPU %q: %w", pciBusID, err) + } if !strings.HasPrefix(pciDeviceInfo.IommuFD, "vfio") { return nil, fmt.Errorf("missing iommufd cdev for GPU %q", pciDeviceInfo.Address) } @@ -104,8 +103,14 @@ func (h *vfioCDIHandler) GetDeviceSpecsByPCIBusID(pciBusID string, preferIommuFD Path: filepath.Join(vfioDevicesPath, pciDeviceInfo.IommuFD), }) } else { + // Read IOMMU group directly from sysfs — works for GPUs on any + // driver including vfio-pci (nvpci may not find vfio-bound GPUs). + iommuGroup, err := getIommuGroupFromSysfs(pciBusID) + if err != nil { + return nil, fmt.Errorf("error getting IOMMU group for GPU %q: %w", pciBusID, err) + } devNodes = append(devNodes, &cdispec.DeviceNode{ - Path: filepath.Join(vfioDevicesRoot, fmt.Sprintf("%d", pciDeviceInfo.IommuGroup)), + Path: filepath.Join(vfioDevicesRoot, iommuGroup), }) } devSpecs := []cdispec.Device{ @@ -117,3 +122,12 @@ func (h *vfioCDIHandler) GetDeviceSpecsByPCIBusID(pciBusID string, preferIommuFD } return devSpecs, nil } + +func getIommuGroupFromSysfs(pciBusID string) (string, error) { + iommuLink := filepath.Join(pciDevicesPath, pciBusID, "iommu_group") + target, err := os.Readlink(iommuLink) + if err != nil { + return "", fmt.Errorf("failed to read IOMMU group symlink for %s: %w", pciBusID, err) + } + return filepath.Base(target), nil +} diff --git a/cmd/gpu-kubelet-plugin/vfio-device.go b/cmd/gpu-kubelet-plugin/vfio-device.go index 54c935daa..c21192ef9 100644 --- a/cmd/gpu-kubelet-plugin/vfio-device.go +++ b/cmd/gpu-kubelet-plugin/vfio-device.go @@ -23,6 +23,7 @@ import ( "os" "os/exec" "path/filepath" + "strings" "sync" "time" @@ -147,6 +148,7 @@ func (vm *VfioPciManager) Configure(ctx context.Context, info *VfioDeviceInfo) e if err != nil { return fmt.Errorf("error getting driver details for GPU %q: %w", info.PciBusID, err) } + info.preConfigureDriver = driver // Skip if the GPU is already bound to the vfio-pci driver. if driver == vm.driver { @@ -192,6 +194,14 @@ func (vm *VfioPciManager) Unconfigure(ctx context.Context, info *VfioDeviceInfo) return nil } + // If the GPU was pre-bound to vfio-pci (either tracked from Configure + // or detected via kernel cmdline vfio-pci.ids), leave it on vfio-pci. + // Rebinding to nvidia on NVLink systems hangs indefinitely. + if info.preConfigureDriver == vfioPciDriver || isVfioPciPrebound(info.deviceID) { + klog.Infof("GPU %s was pre-bound to vfio-pci, leaving on vfio-pci", info.PciBusID) + return nil + } + // Change the GPU driver to nvidia. err := vm.changeDriver(info.PciBusID, nvidiaDriver) if err != nil { @@ -295,19 +305,19 @@ func (vm *VfioPciManager) disableGPUPersistenceMode(pciAddress string) error { // Check if the vfio_pci module is loaded. func checkVfioPCIModuleLoaded() (bool, error) { - f, err := os.Stat(filepath.Join(hostRoot, sysModulePath, vfioPciModule)) - if err != nil { - if os.IsNotExist(err) { - return false, nil + for _, root := range []string{hostRoot, ""} { + f, err := os.Stat(filepath.Join(root, sysModulePath, vfioPciModule)) + if err != nil { + if os.IsNotExist(err) { + continue + } + return false, fmt.Errorf("failed to check if vfio_pci module is loaded: %w", err) + } + if f.IsDir() { + return true, nil } - return false, fmt.Errorf("failed to check if vfio_pci module is loaded: %w", err) - } - - if !f.IsDir() { - return false, nil } - - return true, nil + return false, nil } // Load the vfio_pci module. @@ -322,10 +332,20 @@ func loadVfioPciModule() error { // Check if IOMMU is enabled. func checkIommuEnabled() (bool, error) { - f, err := os.Open(filepath.Join(hostRoot, kernelIommuGroupPath)) - if os.IsNotExist(err) { - return false, nil + for _, root := range []string{hostRoot, ""} { + enabled, err := checkIommuEnabledAt(filepath.Join(root, kernelIommuGroupPath)) + if err != nil { + continue + } + if enabled { + return true, nil + } } + return false, nil +} + +func checkIommuEnabledAt(path string) (bool, error) { + f, err := os.Open(path) if err != nil { return false, err } @@ -337,10 +357,38 @@ func checkIommuEnabled() (bool, error) { if err != nil { return false, err } - return true, nil } +// isVfioPciPrebound checks if a device ID is in the kernel cmdline +// vfio-pci.ids parameter, indicating the GPU was pre-bound to vfio-pci +// at boot and should not be rebound to nvidia during Unconfigure. +func isVfioPciPrebound(deviceID string) bool { + if deviceID == "" { + return false + } + cmdline, err := os.ReadFile(filepath.Join(hostRoot, "/proc/cmdline")) + if err != nil { + return false + } + // Strip 0x prefix for comparison (vfio-pci.ids uses bare hex). + id := strings.TrimPrefix(deviceID, "0x") + for _, param := range strings.Fields(string(cmdline)) { + if !strings.HasPrefix(param, "vfio-pci.ids=") { + continue + } + ids := strings.TrimPrefix(param, "vfio-pci.ids=") + for _, entry := range strings.Split(ids, ",") { + // Entries are vendor:device, e.g. 10de:2330 + parts := strings.Split(entry, ":") + if len(parts) == 2 && strings.EqualFold(parts[1], id) { + return true + } + } + } + return false +} + // Check if IOMMUFD is enabled. // We correlate the IOMMUFD support with the presence of the /dev/iommu API device. func checkIommuFDEnabled() (bool, error) {