Skip to content
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions cmd/gpu-kubelet-plugin/device_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,18 @@ func (s *DeviceState) unprepareDevices(ctx context.Context, claimUID string, dev
}

func (s *DeviceState) unprepareVfioDevices(ctx context.Context, devices PreparedDeviceList) error {
// Release the NVSwitch fabric partition before rebinding the GPUs to the
// nvidia driver (reverse order of prepare).
pciBusIDs := []string{}
for _, device := range devices {
if device.Vfio.Info != nil && device.Vfio.Info.PciBusID != "" {
pciBusIDs = append(pciBusIDs, device.Vfio.Info.PciBusID)
}
}
if err := s.deactivateFabricPartition(pciBusIDs); err != nil {
return err
}

for _, device := range devices {
vfioAllocatable := s.perGPUAllocatable.GetAllocatableDevice(device.Vfio.Device.DeviceName)
if vfioAllocatable == nil {
Expand Down Expand Up @@ -1110,6 +1122,7 @@ func (s *DeviceState) applyVfioDeviceConfig(ctx context.Context, config *configa

configState.containerEdits = commonEdits
// Configure the vfio-pci devices.
pciBusIDs := make([]string, 0, len(results))
for _, r := range results {
device := s.perGPUAllocatable.GetAllocatableDevice(r.Device)
if device == nil {
Expand All @@ -1119,11 +1132,83 @@ func (s *DeviceState) applyVfioDeviceConfig(ctx context.Context, config *configa
if err != nil {
return nil, fmt.Errorf("error configuring vfio device %q: %w", r.Device, err)
}
pciBusIDs = append(pciBusIDs, device.Vfio.PciBusID)
}

// Program the NVSwitch fabric for this set of passthrough GPUs via Fabric
// Manager. No-op on non-HGX nodes / when FM is not wired up.
if err := s.activateFabricPartition(pciBusIDs); err != nil {
return nil, err
}

return &configState, nil
}

// fabricPartitionForPCIBusIDs resolves the FM partition formed by the given set
// of VFIO GPU PCI bus IDs.
//
// It returns ok=false (and never an error) whenever partition (de)activation
// should be skipped:
//   - Fabric Manager is not wired up (fmManager is nil),
//   - the given set of PCI bus IDs is empty,
//   - a GPU in the set is unknown to FM, or
//   - the set does not map to a single FM partition.
//
// When ok is true, the returned int is the FM partition ID.
func (s *DeviceState) fabricPartitionForPCIBusIDs(pciBusIDs []string) (int, bool) {
	fm := s.nvdevlib.fmManager
	if fm == nil {
		return 0, false
	}
	// An empty GPU set cannot form a partition. Bail out before querying FM so
	// that callers with nothing to (de)activate do not trigger a misleading
	// "does not match any FM partition" warning.
	if len(pciBusIDs) == 0 {
		return 0, false
	}

	moduleIDs := make([]int, 0, len(pciBusIDs))
	for _, pci := range pciBusIDs {
		moduleID, ok := fm.GetModuleIDByPCI(pci)
		if !ok {
			klog.Warningf("Fabric Manager: no gpuModuleId for VFIO GPU at PCI %s; skipping partition (de)activation", pci)
			return 0, false
		}
		moduleIDs = append(moduleIDs, moduleID)
	}

	partitionID, ok := fm.FindPartitionByModuleIDs(moduleIDs)
	if !ok {
		klog.Warningf("Fabric Manager: GPU module set %v does not match any FM partition; skipping partition (de)activation", moduleIDs)
		return 0, false
	}
	return partitionID, true
}

// activateFabricPartition asks Fabric Manager to activate the partition made up
// of the given VFIO GPUs. It returns nil without doing anything when no
// partition can be resolved for the set, or when the partition is already in
// the manager's list of activated partitions.
func (s *DeviceState) activateFabricPartition(pciBusIDs []string) error {
	id, found := s.fabricPartitionForPCIBusIDs(pciBusIDs)
	if !found {
		return nil
	}

	fm := s.nvdevlib.fmManager

	// Keep Prepare retries idempotent: if an earlier attempt already activated
	// this partition (e.g. a later step failed afterwards), activating it again
	// would be rejected by FM as in-use.
	alreadyActive := slices.Contains(fm.ActivatedPartitions(), id)
	if alreadyActive {
		klog.V(4).Infof("Fabric Manager: partition %d already active; skipping activation", id)
		return nil
	}

	klog.V(2).Infof("Fabric Manager: activating partition %d for %d-GPU passthrough claim", id, len(pciBusIDs))
	err := fm.ActivatePartition(id)
	if err == nil {
		return nil
	}
	return fmt.Errorf("activating fabric partition %d: %w", id, err)
}

// deactivateFabricPartition asks Fabric Manager to release the partition made
// up of the given VFIO GPUs. It returns nil without doing anything when no
// partition can be resolved for the set, or when the partition is not in the
// manager's list of activated partitions.
func (s *DeviceState) deactivateFabricPartition(pciBusIDs []string) error {
	id, found := s.fabricPartitionForPCIBusIDs(pciBusIDs)
	if !found {
		return nil
	}

	fm := s.nvdevlib.fmManager

	stillActive := slices.Contains(fm.ActivatedPartitions(), id)
	if !stillActive {
		klog.V(4).Infof("Fabric Manager: partition %d already inactive; skipping deactivation", id)
		return nil
	}

	klog.V(2).Infof("Fabric Manager: deactivating partition %d for %d-GPU passthrough claim", id, len(pciBusIDs))
	err := fm.DeactivatePartition(id)
	if err == nil {
		return nil
	}
	return fmt.Errorf("deactivating fabric partition %d: %w", id, err)
}

// GetOpaqueDeviceConfigs returns an ordered list of the configs contained in possibleConfigs for this driver.
//
// Configs can either come from the resource claim itself or from the device
Expand Down
41 changes: 41 additions & 0 deletions cmd/gpu-kubelet-plugin/deviceinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
resourceapi "k8s.io/api/resource/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/dynamic-resource-allocation/deviceattribute"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
)

Expand Down Expand Up @@ -97,6 +98,20 @@
iommuGroup int
iommuFDEnabled bool
addressableMemoryBytes uint64

// Fabric Manager attributes (HGX systems with NVSwitch). Populated only
// when an FM Manager is available at discovery time.
//
// gpuModuleID is the per-board physical ID returned by
// nvmlDeviceGetModuleId. It corresponds to the FM partition member
// physicalId.
gpuModuleID int

// partitionsBySize maps an FM partition size (number of GPUs in the
// partition) to the partitionId of the partition of that size that
// includes this GPU. Used to publish the `partition1`/`partition2`/
// `partition4`/`partition8` device attributes.
partitionsBySize map[int]int
}

// CanonicalName returns the name used for device announcement (in ResourceSlice
Expand Down Expand Up @@ -272,5 +287,31 @@
device.Attributes[d.pcieRootAttr.Name] = d.pcieRootAttr.Value
}

d.addFabricManagerAttributes(device.Attributes)

return device
}

// addFabricManagerAttributes publishes the Fabric Manager-derived attributes.

Check failure on line 295 in cmd/gpu-kubelet-plugin/deviceinfo.go

View workflow job for this annotation

GitHub Actions / basic / golang / check

Comment should end in a period (godot)
func (d *VfioDeviceInfo) addFabricManagerAttributes(attrs map[resourceapi.QualifiedName]resourceapi.DeviceAttribute) {
	// Nothing to publish when no Fabric Manager data was attached to this
	// device. A zero gpuModuleID is treated as "not populated".
	if d.gpuModuleID == 0 && len(d.partitionsBySize) == 0 {
		klog.V(6).Infof("No Fabric Manager attributes for %s", d.CanonicalName())
		return
	}

	klog.V(6).Infof("Adding Fabric Manager attributes for %s: gpuModuleID=%d partitionsBySize=%v",
		d.CanonicalName(), d.gpuModuleID, d.partitionsBySize)

	if d.gpuModuleID != 0 {
		attrs["gpuModuleId"] = resourceapi.DeviceAttribute{
			IntValue: ptr.To(int64(d.gpuModuleID)),
		}
	}

	// One `partition<size>` attribute per partition size this GPU belongs to;
	// the attribute value is the FM partition ID of that size.
	for size, partitionID := range d.partitionsBySize {
		key := resourceapi.QualifiedName(fmt.Sprintf("partition%d", size))
		attrs[key] = resourceapi.DeviceAttribute{
			IntValue: ptr.To(int64(partitionID)),
		}
	}
}
7 changes: 7 additions & 0 deletions cmd/gpu-kubelet-plugin/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,13 @@ func (d *driver) Shutdown() error {
d.state.nvdevlib.alwaysShutdown()
}

// Tear down the long-lived Fabric Manager connection, if one was opened.
if d.state.nvdevlib.fmManager != nil {
if err := d.state.nvdevlib.fmManager.Close(); err != nil {
klog.Warningf("error closing Fabric Manager connection: %v", err)
}
}

if d.deviceHealthMonitor != nil {
d.deviceHealthMonitor.Stop()
}
Expand Down
101 changes: 101 additions & 0 deletions cmd/gpu-kubelet-plugin/nvlib.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/NVIDIA/go-nvml/pkg/nvml"
"k8s.io/dynamic-resource-allocation/deviceattribute"

"sigs.k8s.io/dra-driver-nvidia-gpu/pkg/fabricmanager"
"sigs.k8s.io/dra-driver-nvidia-gpu/pkg/featuregates"
)

Expand All @@ -49,6 +50,7 @@ type deviceLib struct {
gpuInfosByUUID map[string]*GpuInfo
gpuUUIDbyPCIBusID map[PCIBusID]string
devhandleByUUID map[string]nvml.Device
fmManager *fabricmanager.Manager
}

func newDeviceLib(driverRoot root) (*deviceLib, error) {
Expand Down Expand Up @@ -90,9 +92,79 @@ func newDeviceLib(driverRoot root) (*deviceLib, error) {
}
}

// Fabric Manager partitioning is only relevant for GPU passthrough on
// NVSwitch-based HGX nodes. When PassthroughSupport is enabled, try to
// open a long-lived connection to nv-fabricmanager so VFIO devices can be
// published with their gpuModuleId / partition attributes. Failure is
// non-fatal: on non-HGX nodes (or when FM is simply not running) we log
// and leave d.fmManager nil, and all FM-derived attributes are omitted.
if featuregates.Enabled(featuregates.PassthroughSupport) {
d.fmManager = d.tryOpenFabricManager()
}

return &d, nil
}

// Fabric Manager connection environment variables and their defaults.
//
// Transport selection: if fmAddressEnvvar is present in the environment the
// TCP transport is used; otherwise the unix socket transport is used.
const (
	// fmAddressEnvvar selects the FM TCP transport and sets the host to
	// connect to. When this variable is set (even to ""), TCP is used and an
	// empty value falls back to defaultFMAddress.
	fmAddressEnvvar = "NVIDIA_FABRICMANAGER_ADDRESS"
	// fmUnixSocketEnvvar overrides the unix socket path used when TCP is not
	// selected. An empty value falls back to defaultFMUnixSocket.
	fmUnixSocketEnvvar = "NVIDIA_FABRICMANAGER_UNIX_SOCKET"
	// fmLibraryPathEnvvar overrides the libnvfm.so path. An empty value falls
	// back to defaultFMLibraryPath.
	fmLibraryPathEnvvar = "NVIDIA_FABRICMANAGER_LIBRARY_PATH"

	// defaultFMAddress is the TCP host used when fmAddressEnvvar is set but
	// empty.
	defaultFMAddress = "127.0.0.1"
	// defaultFMUnixSocket is the socket path used when fmAddressEnvvar is not
	// set and fmUnixSocketEnvvar is unset or empty.
	defaultFMUnixSocket = "/run/nvidia-fabricmanager/socket"
	// defaultFMLibraryPath is the libnvfm.so path used when
	// fmLibraryPathEnvvar is unset or empty.
	defaultFMLibraryPath = "/usr/lib/libnvfm.so"
)

// tryOpenFabricManager attempts to build an FM Manager backed by go-nvfm.
func (l deviceLib) tryOpenFabricManager() *fabricmanager.Manager {
klog.Infof("!!!!!!!!!!!tryOpenFabricManager")
shutdown, ret := l.ensureNVML()
if ret != nvml.SUCCESS {
klog.Warningf("Fabric Manager: NVML unavailable, skipping FM discovery: %s", ret)
return nil
}
defer shutdown()
klog.Infof("!!!!!!!!!!!ensureNVML done")

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ignore these debugging logs for now


libPath := defaultFMLibraryPath
if v, ok := os.LookupEnv(fmLibraryPathEnvvar); ok && v != "" {
libPath = v
}
client := fabricmanager.NewClient(libPath)

params := fabricmanager.ConnectParams{}
if addr, ok := os.LookupEnv(fmAddressEnvvar); ok {
if addr == "" {
addr = defaultFMAddress
}
params.AddressInfo = addr
} else {
socket := defaultFMUnixSocket
if v, ok := os.LookupEnv(fmUnixSocketEnvvar); ok && v != "" {
socket = v
}
params.AddressInfo = socket
params.AddressIsUnixSocket = true
}

fmMgr, err := fabricmanager.Open(l.nvmllib, client, params)
if err != nil {
klog.Warningf("Fabric Manager not available, FM attributes will be omitted: %v", err)
return nil
}

klog.Infof("!!!!!!!!!!!Fabric Manager connection established; FM partition attributes enabled")
return fmMgr
}

// prependPathListEnvvar prepends a specified list of strings to a specified envvar and returns its value.
func prependPathListEnvvar(envvar string, prepend ...string) string {
if len(prepend) == 0 {
Expand Down Expand Up @@ -700,9 +772,38 @@ func (l deviceLib) getVfioDeviceInfo(idx int, device *nvpci.NvidiaPCIDevice) (*V
addressableMemoryBytes: memoryBytes,
}

if err := l.attachFabricManagerInfo(vfioDeviceInfo); err != nil {
return nil, fmt.Errorf("error attaching fabric manager info for %s: %w", device.Address, err)
}

return vfioDeviceInfo, nil
}

// attachFabricManagerInfo fills in the gpuModuleID and partitionsBySize fields
// of the given VFIO device using the FM Manager. When no FM Manager is wired
// up (fmManager is nil) it does nothing.
//
// A GPU whose PCI bus ID is present on the host but not known to FM is not an
// error: the condition is logged and the FM fields are left untouched. A
// failure to fetch the partition-by-size mapping for a known GPU is returned
// to the caller.
func (l deviceLib) attachFabricManagerInfo(d *VfioDeviceInfo) error {
	fm := l.fmManager
	if fm == nil {
		return nil
	}

	id, known := fm.GetModuleIDByPCI(d.PciBusID)
	if !known {
		klog.V(2).Infof("Fabric Manager has no record of GPU PCI bus ID %s; skipping FM attributes", d.PciBusID)
		return nil
	}
	d.gpuModuleID = id

	sizes, err := fm.GetPartitionsBySizeByModuleID(id)
	if err == nil {
		d.partitionsBySize = sizes
		return nil
	}
	return fmt.Errorf("getting partition-by-size mapping for moduleId %d: %w", id, err)
}

func (l deviceLib) getMigDevices(gpuInfo *GpuInfo) (map[string]*MigDeviceInfo, error) {
if !gpuInfo.migEnabled {
return nil, nil
Expand Down
20 changes: 20 additions & 0 deletions deployments/container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,21 @@ RUN apt-get update \
&& rm -rf /var/lib/apt/lists/* \
&& /bin/bash-static --version

# Fabric Manager shared library (libnvfm.so), required at runtime by the
# gpu-kubelet-plugin to talk to nv-fabricmanager on NVSwitch-based HGX nodes.
# The distroless runtime image has no package manager, so install the FM SDK
# package here and copy the resulting library into the final image below.
FROM nvcr.io/nvidia/base/ubuntu:jammy-20260217 AS fabricmanager
# Install the FM SDK and stage libnvfm.so (+ versioned soname) into an
# arch-neutral directory. The package installs into the target arch's multiarch
# dir (e.g. x86_64-linux-gnu or aarch64-linux-gnu), so we normalize the location
# here and copy with -a to preserve the symlink chain.
RUN apt-get update \
&& apt-get install -y --no-install-recommends nvidia-fabricmanager-dev-580 \

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How can we ensure that versioning of libnvfm matches with driver installed by GPU operator?

&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives \
&& mkdir -p /fm-lib \
&& cp -a /usr/lib/*-linux-gnu/libnvfm.so* /fm-lib/

# Pull the nvidia-cdi-hook binary out of the relevant toolkit container
# (arch: TARGETPLATFORM, set via --platform).
FROM ${TOOLKIT_CONTAINER_IMAGE} AS toolkit
Expand Down Expand Up @@ -131,6 +146,11 @@ COPY LICENSE /

COPY --from=bash /bin/bash-static /bin/bash

# Fabric Manager SDK shared library (and its versioned soname) so the
# gpu-kubelet-plugin can dlopen libnvfm.so at runtime. Placed in /usr/lib so the
# path is arch-neutral and resolvable via an absolute dlopen.
COPY --from=fabricmanager /fm-lib/ /usr/lib/

COPY --from=toolkit /artifacts/rpm/usr/bin/nvidia-cdi-hook /usr/bin/nvidia-cdi-hook
COPY --from=build /artifacts/compute-domain-controller /usr/bin/compute-domain-controller
COPY --from=build /artifacts/compute-domain-kubelet-plugin /usr/bin/compute-domain-kubelet-plugin
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,8 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
volumeMounts:
- name: run-nvidia-fabricmanager
mountPath: /run/nvidia-fabricmanager
- name: plugins-registry
mountPath: {{ .Values.kubeletPlugin.kubeletRegistrarDirectoryPath | quote }}
- name: plugins
Expand All @@ -319,6 +321,10 @@ spec:
{{- end }}
{{- end }}
volumes:
- name: run-nvidia-fabricmanager
hostPath:
path: /run/nvidia-fabricmanager
type: DirectoryOrCreate
- name: plugins-registry
hostPath:
path: {{ .Values.kubeletPlugin.kubeletRegistrarDirectoryPath | quote }}
Expand Down
5 changes: 4 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ require (
)

require (
github.com/NVIDIA/go-nvfm v0.0.0-20260528194329-0a8cb60d7cb1 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
Expand Down Expand Up @@ -81,7 +82,7 @@ require (
golang.org/x/net v0.53.0 // indirect
golang.org/x/oauth2 v0.34.0 // indirect
golang.org/x/sync v0.20.0 // indirect
golang.org/x/sys v0.43.0 // indirect
golang.org/x/sys v0.45.0 // indirect
golang.org/x/term v0.42.0 // indirect
golang.org/x/text v0.36.0 // indirect
golang.org/x/tools v0.44.0 // indirect
Expand All @@ -96,3 +97,5 @@ require (
sigs.k8s.io/structured-merge-diff/v6 v6.3.2 // indirect
sigs.k8s.io/yaml v1.6.0 // indirect
)

replace github.com/NVIDIA/go-nvfm => github.com/varunrsekar/go-nvfm v0.0.0-20260528194329-0a8cb60d7cb1
Loading
Loading