Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions cmd/gpu-kubelet-plugin/device_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -963,6 +963,18 @@ func (s *DeviceState) unprepareVfioDevices(ctx context.Context, devices Prepared
return fmt.Errorf("error unconfiguring vfio device %q: %w", device.Vfio.Device.DeviceName, err)
}
}

// Release the NVSwitch fabric partition before rebinding the GPUs to the
// nvidia driver
pciBusIDs := []string{}
for _, device := range devices {
if device.Vfio.Info != nil && device.Vfio.Info.PciBusID != "" {
pciBusIDs = append(pciBusIDs, device.Vfio.Info.PciBusID)
}
}
if err := s.deactivateFabricPartition(pciBusIDs); err != nil {
return err
}
return nil
}

Expand Down Expand Up @@ -997,6 +1009,16 @@ func (s *DeviceState) discoverSiblingAllocatables(device *AllocatableDevice) err
return fmt.Errorf("error adding allocatable device: %w", err)
}
device.Vfio.parent = gpu.Gpu

// The GPU has just switched from the vfio-pci driver back to the nvidia
// driver and is visible to NVML again. If its Fabric Manager attributes
// could not be discovered at startup (because it was already bound to
// vfio-pci then), refresh the FM module mapping from NVML and re-attach
// the gpuModuleId/partitionN attributes to the in-memory VFIO device so
// the republished ResourceSlice carries them.
if err := s.nvdevlib.refreshFabricManagerInfo(device.Vfio); err != nil {
return fmt.Errorf("error refreshing fabric manager info for vfio device %q: %w", device.Vfio.CanonicalName(), err)
}
case MigStaticDeviceType:
// TODO: Implement once partitionable device is supported with PassthroughSupport feature gate.
return nil
Expand Down Expand Up @@ -1110,6 +1132,7 @@ func (s *DeviceState) applyVfioDeviceConfig(ctx context.Context, config *configa

configState.containerEdits = commonEdits
// Configure the vfio-pci devices.
pciBusIDs := make([]string, 0, len(results))
for _, r := range results {
device := s.perGPUAllocatable.GetAllocatableDevice(r.Device)
if device == nil {
Expand All @@ -1119,11 +1142,83 @@ func (s *DeviceState) applyVfioDeviceConfig(ctx context.Context, config *configa
if err != nil {
return nil, fmt.Errorf("error configuring vfio device %q: %w", r.Device, err)
}
pciBusIDs = append(pciBusIDs, device.Vfio.PciBusID)
}

// Program the NVSwitch fabric for this set of passthrough GPUs via Fabric
// Manager. No-op on non-HGX nodes / when FM is not wired up.
if err := s.activateFabricPartition(pciBusIDs); err != nil {
return nil, err
}

return &configState, nil
}

// fabricPartitionForPCIBusIDs resolves the FM partition formed by the given set
// of VFIO GPU PCI bus IDs.
//
// It returns ok=false (and never an error) when:
//   - Fabric Manager is not wired up (fmManager is nil),
//   - pciBusIDs is empty,
//   - a GPU in the set has no gpuModuleId known to FM, or
//   - the resulting module set does not map to a single FM partition.
//
// In all of those cases the callers skip partition (de)activation.
func (s *DeviceState) fabricPartitionForPCIBusIDs(pciBusIDs []string) (int, bool) {
	fm := s.nvdevlib.fmManager
	if fm == nil {
		return 0, false
	}

	// An empty set cannot describe a partition. Bail out before querying FM so
	// that callers which filtered every device away do not trigger a
	// misleading "module set [] does not match any FM partition" warning.
	if len(pciBusIDs) == 0 {
		return 0, false
	}

	// Translate every PCI bus ID into its FM module ID; a single unknown GPU
	// makes the whole set unresolvable.
	moduleIDs := make([]int, 0, len(pciBusIDs))
	for _, pci := range pciBusIDs {
		moduleID, ok := fm.GetModuleIDByPCI(pci)
		if !ok {
			klog.Warningf("Fabric Manager: no gpuModuleId for VFIO GPU at PCI %s; skipping partition (de)activation", pci)
			return 0, false
		}
		moduleIDs = append(moduleIDs, moduleID)
	}

	partitionID, ok := fm.FindPartitionByModuleIDs(moduleIDs)
	if !ok {
		klog.Warningf("Fabric Manager: GPU module set %v does not match any FM partition; skipping partition (de)activation", moduleIDs)
		return 0, false
	}
	return partitionID, true
}

// activateFabricPartition asks Fabric Manager to activate the partition formed
// by the given VFIO GPUs.
//
// It returns nil without contacting FM when no partition can be resolved for
// the set, or when the partition is already listed among the partitions this
// Manager reports as activated. Any error from the activation call is wrapped
// and returned.
func (s *DeviceState) activateFabricPartition(pciBusIDs []string) error {
	id, resolved := s.fabricPartitionForPCIBusIDs(pciBusIDs)
	if !resolved {
		return nil
	}

	fm := s.nvdevlib.fmManager

	// Idempotency: a retried Prepare (e.g. after a later step failed) must not
	// re-activate a partition this Manager already activated, which FM would
	// reject as in-use.
	alreadyActive := slices.Contains(fm.ActivatedPartitions(), id)
	if alreadyActive {
		klog.V(4).Infof("Fabric Manager: partition %d already active; skipping activation", id)
		return nil
	}

	klog.V(2).Infof("Fabric Manager: activating partition %d for %d-GPU passthrough claim", id, len(pciBusIDs))
	err := fm.ActivatePartition(id)
	if err == nil {
		return nil
	}
	return fmt.Errorf("activating fabric partition %d: %w", id, err)
}

// deactivateFabricPartition asks Fabric Manager to release the partition
// formed by the given VFIO GPUs.
//
// It returns nil without contacting FM when no partition can be resolved for
// the set, or when the partition is not among the partitions this Manager
// reports as activated. Any error from the deactivation call is wrapped and
// returned.
func (s *DeviceState) deactivateFabricPartition(pciBusIDs []string) error {
	id, resolved := s.fabricPartitionForPCIBusIDs(pciBusIDs)
	if !resolved {
		return nil
	}

	fm := s.nvdevlib.fmManager

	// Nothing to release unless the Manager lists this partition as activated.
	stillActive := slices.Contains(fm.ActivatedPartitions(), id)
	if !stillActive {
		klog.V(4).Infof("Fabric Manager: partition %d already inactive; skipping deactivation", id)
		return nil
	}

	klog.V(2).Infof("Fabric Manager: deactivating partition %d for %d-GPU passthrough claim", id, len(pciBusIDs))
	err := fm.DeactivatePartition(id)
	if err == nil {
		return nil
	}
	return fmt.Errorf("deactivating fabric partition %d: %w", id, err)
}

// GetOpaqueDeviceConfigs returns an ordered list of the configs contained in possibleConfigs for this driver.
//
// Configs can either come from the resource claim itself or from the device
Expand Down
40 changes: 40 additions & 0 deletions cmd/gpu-kubelet-plugin/deviceinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
resourceapi "k8s.io/api/resource/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/dynamic-resource-allocation/deviceattribute"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
)

Expand Down Expand Up @@ -97,6 +98,20 @@ type VfioDeviceInfo struct {
iommuGroup int
iommuFDEnabled bool
addressableMemoryBytes uint64

// Fabric Manager attributes (HGX systems with NVSwitch). Populated only
// when an FM Manager is available at discovery time
//
// gpuModuleID is the per-board physical ID returned by
// nvmlDeviceGetModuleId. It corresponds to the FM partition member
// physicalId
gpuModuleID int

// partitionsBySize maps an FM partition size (number of GPUs in the
// partition) to the partitionId of the partition of that size that
// includes this GPU. Used to publish the `partition1`/`partition2`/
// `partition4`/`partition8` device attributes.
partitionsBySize map[int]int
}

// CanonicalName returns the name used for device announcement (in ResourceSlice
Expand Down Expand Up @@ -272,5 +287,30 @@ func (d *VfioDeviceInfo) GetDevice() resourceapi.Device {
device.Attributes[d.pcieRootAttr.Name] = d.pcieRootAttr.Value
}

d.addFabricManagerAttributes(device.Attributes)

return device
}

// addFabricManagerAttributes writes this device's Fabric Manager-derived
// values into attrs:
//   - "gpuModuleId", when gpuModuleID is non-zero;
//   - one "partition<size>" entry per element of partitionsBySize, whose value
//     is the corresponding partition ID.
//
// attrs is left untouched when gpuModuleID is zero and partitionsBySize is
// empty. attrs must be non-nil, since entries are assigned into it.
func (d *VfioDeviceInfo) addFabricManagerAttributes(attrs map[resourceapi.QualifiedName]resourceapi.DeviceAttribute) {
	hasModuleID := d.gpuModuleID != 0
	if !hasModuleID && len(d.partitionsBySize) == 0 {
		klog.V(4).Infof("No Fabric Manager attributes for %s", d.CanonicalName())
		return
	}

	klog.V(4).Infof("Adding Fabric Manager attributes for %s: gpuModuleId=%d partitionsBySize=%v",
		d.CanonicalName(), d.gpuModuleID, d.partitionsBySize)

	if hasModuleID {
		moduleID := int64(d.gpuModuleID)
		attrs["gpuModuleId"] = resourceapi.DeviceAttribute{IntValue: &moduleID}
	}

	for size, partitionID := range d.partitionsBySize {
		// A fresh variable per iteration so each attribute points at its own value.
		value := int64(partitionID)
		name := resourceapi.QualifiedName(fmt.Sprintf("partition%d", size))
		attrs[name] = resourceapi.DeviceAttribute{IntValue: &value}
	}
}
7 changes: 7 additions & 0 deletions cmd/gpu-kubelet-plugin/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,13 @@ func (d *driver) Shutdown() error {
d.state.nvdevlib.alwaysShutdown()
}

// Tear down the long-lived Fabric Manager connection, if one was opened.
if d.state.nvdevlib.fmManager != nil {
if err := d.state.nvdevlib.fmManager.Close(); err != nil {
klog.Warningf("error closing Fabric Manager connection: %v", err)
}
}

if d.deviceHealthMonitor != nil {
d.deviceHealthMonitor.Stop()
}
Expand Down
131 changes: 131 additions & 0 deletions cmd/gpu-kubelet-plugin/nvlib.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/NVIDIA/go-nvml/pkg/nvml"
"k8s.io/dynamic-resource-allocation/deviceattribute"

"sigs.k8s.io/dra-driver-nvidia-gpu/pkg/fabricmanager"
"sigs.k8s.io/dra-driver-nvidia-gpu/pkg/featuregates"
)

Expand All @@ -49,6 +50,7 @@ type deviceLib struct {
gpuInfosByUUID map[string]*GpuInfo
gpuUUIDbyPCIBusID map[PCIBusID]string
devhandleByUUID map[string]nvml.Device
fmManager *fabricmanager.Manager
}

func newDeviceLib(driverRoot root) (*deviceLib, error) {
Expand Down Expand Up @@ -90,9 +92,77 @@ func newDeviceLib(driverRoot root) (*deviceLib, error) {
}
}

// Fabric Manager partitioning is only relevant for GPU passthrough on
// NVSwitch-based HGX nodes. When PassthroughSupport is enabled, try to
// open a long-lived connection to nv-fabricmanager so VFIO devices can be
// published with their gpuModuleId / partition attributes. Failure is
// non-fatal: on non-HGX nodes (or when FM is simply not running) we log
// and leave d.fmManager nil, and all FM-derived attributes are omitted.
if featuregates.Enabled(featuregates.PassthroughSupport) {
d.fmManager = d.tryOpenFabricManager()
}

return &d, nil
}

// Fabric Manager connection environment variables and their defaults. They are
// read by tryOpenFabricManager when it builds the Fabric Manager client and
// its connection parameters.
const (
// fmAddressEnvvar selects the FM TCP transport and sets the host to
// connect to. When this variable is set (even to ""), TCP is used and an
// empty value falls back to defaultFMAddress.
fmAddressEnvvar = "NVIDIA_FABRICMANAGER_ADDRESS"
// fmUnixSocketEnvvar overrides the unix socket path used when TCP is not
// selected (i.e. fmAddressEnvvar is unset). An unset or empty value falls
// back to defaultFMUnixSocket.
fmUnixSocketEnvvar = "NVIDIA_FABRICMANAGER_UNIX_SOCKET"
// fmLibraryPathEnvvar overrides the libnvfm.so path. An unset or empty
// value falls back to defaultFMLibraryPath.
fmLibraryPathEnvvar = "NVIDIA_FABRICMANAGER_LIBRARY_PATH"

// defaultFMAddress is the TCP host used when fmAddressEnvvar is set but
// empty.
defaultFMAddress = "127.0.0.1"
// defaultFMUnixSocket is the unix socket path used when TCP is not selected
// and fmUnixSocketEnvvar is unset or empty.
defaultFMUnixSocket = "/run/nvidia-fabricmanager/socket"
// defaultFMLibraryPath is the library path handed to the FM client when
// fmLibraryPathEnvvar is unset or empty.
defaultFMLibraryPath = "/usr/lib/libnvfm.so"
)

// tryOpenFabricManager makes a best-effort attempt to open a Fabric Manager
// connection and returns the resulting Manager. It returns nil, after logging
// a warning, when NVML cannot be initialized or when opening the connection
// fails.
//
// Connection settings are taken from the environment:
//   - fmLibraryPathEnvvar: library path for the FM client (default
//     defaultFMLibraryPath when unset or empty);
//   - fmAddressEnvvar: when set at all, TCP is used with this host (default
//     defaultFMAddress when empty);
//   - fmUnixSocketEnvvar: otherwise, the unix socket path (default
//     defaultFMUnixSocket when unset or empty).
func (l deviceLib) tryOpenFabricManager() *fabricmanager.Manager {
	nvmlShutdown, ret := l.ensureNVML()
	if ret != nvml.SUCCESS {
		klog.Warningf("Fabric Manager: NVML unavailable, skipping FM discovery: %s", ret)
		return nil
	}
	defer nvmlShutdown()

	// os.Getenv yields "" for both "unset" and "set to empty", which are
	// treated the same way for the library path.
	libraryPath := os.Getenv(fmLibraryPathEnvvar)
	if libraryPath == "" {
		libraryPath = defaultFMLibraryPath
	}
	client := fabricmanager.NewClient(libraryPath)

	var params fabricmanager.ConnectParams
	if tcpHost, useTCP := os.LookupEnv(fmAddressEnvvar); useTCP {
		// The mere presence of the address variable selects TCP.
		if tcpHost == "" {
			tcpHost = defaultFMAddress
		}
		params.AddressInfo = tcpHost
	} else {
		socketPath := os.Getenv(fmUnixSocketEnvvar)
		if socketPath == "" {
			socketPath = defaultFMUnixSocket
		}
		params.AddressInfo = socketPath
		params.AddressIsUnixSocket = true
	}

	manager, err := fabricmanager.Open(l.nvmllib, client, params)
	if err != nil {
		klog.Warningf("Fabric Manager not available, FM attributes will be omitted: %v", err)
		return nil
	}

	klog.Infof("Fabric Manager connection established; FM partition attributes enabled")
	return manager
}

// prependPathListEnvvar prepends a specified list of strings to a specified envvar and returns its value.
func prependPathListEnvvar(envvar string, prepend ...string) string {
if len(prepend) == 0 {
Expand Down Expand Up @@ -700,9 +770,70 @@ func (l deviceLib) getVfioDeviceInfo(idx int, device *nvpci.NvidiaPCIDevice) (*V
addressableMemoryBytes: memoryBytes,
}

if err := l.attachFabricManagerInfo(vfioDeviceInfo); err != nil {
return nil, fmt.Errorf("error attaching fabric manager info for %s: %w", device.Address, err)
}

return vfioDeviceInfo, nil
}

// attachFabricManagerInfo populates the gpuModuleID and partitionsBySize
// fields of the given VFIO device from the FM Manager, if one is wired up. It
// is a no-op when fmManager is nil.
//
// A PCI bus ID known to the host but unknown to FM is treated as non-fatal: a
// warning is logged, d is left unchanged and nil is returned.
//
// An error is returned only when the partition-by-size lookup for the resolved
// module ID fails. In that case d is left unchanged as well: both fields are
// committed together after every lookup has succeeded, so a failed refresh
// never leaves the device with a module ID but no (or stale) partition data.
func (l deviceLib) attachFabricManagerInfo(d *VfioDeviceInfo) error {
	if l.fmManager == nil {
		return nil
	}

	moduleID, ok := l.fmManager.GetModuleIDByPCI(d.PciBusID)
	if !ok {
		klog.Warningf("Fabric Manager has no gpuModuleId for GPU PCI bus ID %s; publishing %s without FM attributes. "+
			"This happens when FM could not supply a PCI address for this GPU (e.g. it was already bound to vfio-pci before nv-fabricmanager started).",
			d.PciBusID, d.CanonicalName())
		return nil
	}

	bySize, err := l.fmManager.GetPartitionsBySizeByModuleID(moduleID)
	if err != nil {
		return fmt.Errorf("getting partition-by-size mapping for moduleId %d: %w", moduleID, err)
	}

	// Commit both attributes only once all lookups have succeeded.
	d.gpuModuleID = moduleID
	d.partitionsBySize = bySize
	return nil
}

// refreshFabricManagerInfo asks the FM Manager to refresh its module mapping
// from NVML and then re-attaches the FM attributes (gpuModuleId / partitionN)
// to the given VFIO device via attachFabricManagerInfo.
//
// It is meant for the moment a GPU has moved from the vfio-pci driver back to
// the nvidia driver (during unprepare). A GPU that was already bound to
// vfio-pci at plugin startup is invisible to NVML, so its gpuModuleId could not
// be discovered then and its VFIO device was published without FM attributes.
// Once NVML can see the GPU again, refreshing the mapping records the
// (PCI, gpuModuleId) pair so the attributes can be filled in on the in-memory
// device and republished to the ResourceSlice.
//
// No-op when fmManager is nil (non-HGX nodes / FM not wired up). Returns an
// error when NVML cannot be initialized, when the mapping refresh fails, or
// when re-attaching the attributes fails.
func (l deviceLib) refreshFabricManagerInfo(d *VfioDeviceInfo) error {
	fm := l.fmManager
	if fm == nil {
		return nil
	}

	// NVML must stay initialized for the duration of the mapping refresh.
	release, ret := l.ensureNVML()
	if ret != nvml.SUCCESS {
		return fmt.Errorf("ensureNVML failed: %w", ret)
	}
	defer release()

	err := fm.RefreshModuleMapping(l.nvmllib)
	if err != nil {
		return fmt.Errorf("refreshing fabric manager module mapping: %w", err)
	}
	return l.attachFabricManagerInfo(d)
}

func (l deviceLib) getMigDevices(gpuInfo *GpuInfo) (map[string]*MigDeviceInfo, error) {
if !gpuInfo.migEnabled {
return nil, nil
Expand Down
Loading