diff --git a/Makefile b/Makefile index 7191fba3f..3198b332e 100644 --- a/Makefile +++ b/Makefile @@ -83,7 +83,8 @@ PLUGINS ?= \ nri-memory-policy \ nri-memory-qos \ nri-memtierd \ - nri-sgx-epc + nri-sgx-epc \ + nri-resctrl-mon BINARIES ?= \ config-manager \ diff --git a/cmd/plugins/resctrl-mon/Dockerfile b/cmd/plugins/resctrl-mon/Dockerfile new file mode 100644 index 000000000..c833ca203 --- /dev/null +++ b/cmd/plugins/resctrl-mon/Dockerfile @@ -0,0 +1,40 @@ +ARG GO_VERSION=1.26 + +FROM golang:${GO_VERSION}-bookworm AS builder + +ARG IMAGE_VERSION +ARG BUILD_VERSION +ARG BUILD_BUILDID +ARG DEBUG=0 +ARG NORACE=0 +ARG SKIP_LICENSES=0 + +WORKDIR /go/builder + +# Fetch go dependencies in a separate layer for caching +COPY go.mod go.sum . +COPY pkg/topology/ pkg/topology/ +RUN --mount=type=cache,target=/go/pkg/mod/ go mod download + +# Build nri-resctrl-mon +COPY . . + +RUN --mount=type=cache,target=/go/pkg/mod/ \ + --mount=type=cache,target="/root/.cache/go-build" \ + make IMAGE_VERSION=${IMAGE_VERSION} \ + BUILD_VERSION=${BUILD_VERSION} \ + BUILD_BUILDID=${BUILD_BUILDID} \ + DEBUG=$DEBUG \ + NORACE=$NORACE \ + OTHER_IMAGE_TARGETS="" \ + BINARIES="" \ + PLUGINS=nri-resctrl-mon \ + clean install-go-licenses build-plugins-static licenses + +FROM gcr.io/distroless/static + +COPY --from=builder /go/builder/build/bin/nri-resctrl-mon /bin/nri-resctrl-mon +COPY --from=builder /go/builder/build/licenses/nri-resctrl-mon/ /licenses/nri-resctrl-mon/ +COPY --from=builder /go/builder/sample-configs/nri-resctrl-mon.yaml /etc/nri/resctrl-mon/config.yaml + +ENTRYPOINT ["/bin/nri-resctrl-mon", "-idx", "90", "-config", "/etc/nri/resctrl-mon/config.yaml"] diff --git a/cmd/plugins/resctrl-mon/main.go b/cmd/plugins/resctrl-mon/main.go new file mode 100644 index 000000000..eedf56599 --- /dev/null +++ b/cmd/plugins/resctrl-mon/main.go @@ -0,0 +1,90 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "flag" + "os" + + "github.com/containerd/nri/pkg/stub" + "github.com/sirupsen/logrus" +) + +var ( + log *logrus.Logger +) + +func main() { + var ( + pluginName string + pluginIdx string + configFile string + verbose bool + veryVerbose bool + err error + ) + + log = logrus.StandardLogger() + log.SetFormatter(&logrus.TextFormatter{ + PadLevelText: true, + }) + + flag.StringVar(&pluginName, "name", "", "plugin name to register to NRI") + flag.StringVar(&pluginIdx, "idx", "", "plugin index to register to NRI") + flag.StringVar(&configFile, "config", "", "configuration file name") + flag.BoolVar(&verbose, "v", false, "verbose output") + flag.BoolVar(&veryVerbose, "vv", false, "very verbose output") + flag.Parse() + + if verbose { + log.SetLevel(logrus.DebugLevel) + } + if veryVerbose { + log.SetLevel(logrus.TraceLevel) + } + + p := newPlugin() + + if configFile != "" { + log.Debugf("reading configuration from %q", configFile) + data, err := os.ReadFile(configFile) + if err != nil { + log.Fatalf("error reading configuration file %q: %s", configFile, err) + } + if err = p.setConfig(data); err != nil { + log.Fatalf("error applying configuration from file %q: %s", configFile, err) + } + } + + opts := []stub.Option{ + stub.WithOnClose(p.onClose), + } + if pluginName != "" { + opts = append(opts, stub.WithPluginName(pluginName)) + } + if pluginIdx != "" { + opts = append(opts, stub.WithPluginIdx(pluginIdx)) + } + + if p.stub, err = stub.New(p, opts...); err != nil { + log.Fatalf("failed to create plugin stub: %v", err) + } + + if err = p.stub.Run(context.Background()); err != nil { + log.Errorf("plugin exited (%v)", err) + os.Exit(1) + } +} diff --git a/cmd/plugins/resctrl-mon/plugin.go b/cmd/plugins/resctrl-mon/plugin.go new file mode 100644 index 000000000..3d171c1bc --- /dev/null +++ b/cmd/plugins/resctrl-mon/plugin.go @@ -0,0 +1,435 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "sigs.k8s.io/yaml" + + "github.com/containerd/nri/pkg/api" + "github.com/containerd/nri/pkg/stub" +) + +// plugin implements the NRI plugin interface for resctrl monitoring groups. +type plugin struct { + stub stub.Stub + config *pluginConfig + state *podState + rdt *resctrlOps + snapshots *snapshotStore +} + +const ( + defaultSnapshotDir = "/run/nri-resctrl-mon" + defaultSnapshotTTL = 5 * time.Minute +) + +// pluginConfig holds the runtime configuration for the plugin. +type pluginConfig struct { + // ResctrlPath is the mount point of the resctrl filesystem. + ResctrlPath string `json:"resctrlPath"` + + // Namespaces filters mon_group creation to pods in these namespaces. + // Empty list means all namespaces. + Namespaces []string `json:"namespaces"` + + // LabelSelector filters mon_group creation to pods matching these labels. + // Empty map means all pods. + LabelSelector map[string]string `json:"labelSelector"` + + // SnapshotDir is the directory for counter snapshot files. + // Default: /run/nri-resctrl-mon + SnapshotDir string `json:"snapshotDir"` + + // SnapshotTTL is how long completed snapshots are kept before pruning. + // Parsed as a Go duration string (e.g., "5m", "1h"). + // Default: 5m + SnapshotTTL string `json:"snapshotTTL"` +} + +func newPlugin() *plugin { + cfg := &pluginConfig{ + ResctrlPath: defaultResctrlPath, + SnapshotDir: defaultSnapshotDir, + SnapshotTTL: defaultSnapshotTTL.String(), + } + snap, err := newSnapshotStore(cfg.SnapshotDir, defaultSnapshotTTL) + if err != nil { + log.Warnf("failed to create snapshot store at %s: %v (snapshots disabled)", cfg.SnapshotDir, err) + } + return &plugin{ + config: cfg, + state: newPodState(), + rdt: newResctrlOps(cfg.ResctrlPath), + snapshots: snap, + } +} + +// Configure handles connecting to container runtime's NRI server. +func (p *plugin) Configure(ctx context.Context, config, runtime, version string) (stub.EventMask, error) { + log.Infof("Connected to %s %s...", runtime, version) + if config != "" { + log.Debugf("loading configuration from NRI server") + if err := p.setConfig([]byte(config)); err != nil { + return 0, err + } + } + return 0, nil +} + +// onClose handles losing connection to container runtime. +func (p *plugin) onClose() { + log.Infof("Connection to the runtime lost, exiting...") + os.Exit(0) +} + +// setConfig applies new plugin configuration. +func (p *plugin) setConfig(data []byte) error { + log.Tracef("setConfig: parsing\n---8<---\n%s\n--->8---", data) + cfg := pluginConfig{ + ResctrlPath: defaultResctrlPath, + SnapshotDir: defaultSnapshotDir, + SnapshotTTL: defaultSnapshotTTL.String(), + } + if err := yaml.Unmarshal(data, &cfg); err != nil { + return fmt.Errorf("setConfig: cannot parse configuration: %w", err) + } + resctrlPath := filepath.Clean(cfg.ResctrlPath) + if resctrlPath == "" || !filepath.IsAbs(resctrlPath) { + return fmt.Errorf("setConfig: resctrlPath must be an absolute path, got %q", cfg.ResctrlPath) + } + cfg.ResctrlPath = resctrlPath + + snapshotDir := filepath.Clean(cfg.SnapshotDir) + if snapshotDir == "" || !filepath.IsAbs(snapshotDir) { + return fmt.Errorf("setConfig: snapshotDir must be an absolute path, got %q", cfg.SnapshotDir) + } + // Prevent writing snapshots into the resctrl filesystem or sensitive system dirs. + for _, forbidden := range []string{resctrlPath, "/sys", "/proc", "/dev", "/etc"} { + if snapshotDir == forbidden || strings.HasPrefix(snapshotDir, forbidden+"/") { + return fmt.Errorf("setConfig: snapshotDir must not be under %s, got %q", forbidden, snapshotDir) + } + } + cfg.SnapshotDir = snapshotDir + + ttl, err := time.ParseDuration(cfg.SnapshotTTL) + if err != nil { + return fmt.Errorf("setConfig: invalid snapshotTTL %q: %w", cfg.SnapshotTTL, err) + } + + p.config = &cfg + p.rdt = newResctrlOps(cfg.ResctrlPath) + + snap, err := newSnapshotStore(cfg.SnapshotDir, ttl) + if err != nil { + log.Warnf("setConfig: failed to create snapshot store at %s: %v (snapshots disabled)", cfg.SnapshotDir, err) + p.snapshots = nil + } else { + p.snapshots = snap + } + + log.Debugf("configuration: resctrlPath=%s namespaces=%v labelSelector=%v snapshotDir=%s snapshotTTL=%s", + cfg.ResctrlPath, cfg.Namespaces, cfg.LabelSelector, cfg.SnapshotDir, cfg.SnapshotTTL) + return nil +} + +// Synchronize is called at plugin startup with the current set of pods and containers. +// It reconciles in-memory state with what exists on the resctrl filesystem. +func (p *plugin) Synchronize(ctx context.Context, pods []*api.PodSandbox, containers []*api.Container) ([]*api.ContainerUpdate, error) { + log.Infof("synchronizing state: %d pods, %d containers", len(pods), len(containers)) + + // Build a lookup from sandbox ID to pod (containers reference + // pods by sandbox ID, not by Kubernetes UID). + podBySandboxID := make(map[string]*api.PodSandbox, len(pods)) + for _, pod := range pods { + podBySandboxID[pod.GetId()] = pod + } + + // Create mon_groups for running containers that don't have one, + // and write their PIDs to ensure monitoring is active after restart. + for _, ctr := range containers { + pod, ok := podBySandboxID[ctr.GetPodSandboxId()] + if !ok { + log.Debugf("Synchronize: container %s has no matching pod, skipping", ctr.GetName()) + continue + } + if !p.shouldMonitorPod(pod) { + continue + } + podUID := pod.GetUid() + rdtClass := getRDTClass(ctr) + if err := p.ensureMonGroup(podUID, ctr.GetId(), rdtClass); err != nil { + log.Warnf("Synchronize: failed to create mon_group for pod %s: %v", podUID, err) + continue + } + pid := int(ctr.GetPid()) + if pid > 0 { + monGroupDir := p.state.getMonGroupDir(podUID) + if err := p.rdt.writeTaskPID(monGroupDir, pid); err != nil { + log.Warnf("Synchronize: failed to write PID %d for pod %s: %v", pid, podUID, err) + } else { + log.Debugf("Synchronize: assigned pid %d for pod %s", pid, podUID) + p.takeInitialSnapshot(podUID, monGroupDir) + } + } + } + + // Remove orphaned mon_groups from a previous plugin instance. + p.rdt.cleanOrphanedMonGroups(p.state) + + log.Infof("synchronization complete: tracking %d pods", p.state.podCount()) + return nil, nil +} + +// PostCreateContainer is called after the container is created but before +// it starts executing. The container PID is NOT yet available (pid=0) because +// the init process has not been started. We create the mon_group here so it +// is ready for PID assignment in StartContainer. +func (p *plugin) PostCreateContainer(ctx context.Context, pod *api.PodSandbox, ctr *api.Container) error { + podUID := pod.GetUid() + ctrName := pprintCtr(pod, ctr) + + log.Debugf("PostCreateContainer %s: pid=%d (expected 0)", ctrName, ctr.GetPid()) + + if !p.shouldMonitorPod(pod) { + log.Debugf("PostCreateContainer %s: pod filtered out, skipping", ctrName) + return nil + } + + rdtClass := getRDTClass(ctr) + if err := p.ensureMonGroup(podUID, ctr.GetId(), rdtClass); err != nil { + log.Warnf("PostCreateContainer %s: failed to create mon_group: %v", ctrName, err) + return nil // non-fatal: don't block container creation + } + + log.Infof("PostCreateContainer %s: mon_group ready, PID will be assigned in StartContainer", ctrName) + return nil +} + +// StartContainer is called just before the container process starts executing. +// At this point the init process has been created (via runc create) and the PID +// is available, but the process is paused and has NOT forked any threads yet. +// This is the ideal moment to write the PID to the resctrl mon_group tasks +// file: the kernel assigns the RMID to this PID, and when the process starts +// and forks threads they all inherit the RMID automatically. +// +// If the PID is not available (should not happen at this stage), we fall back +// to PostStartContainer which will write PIDs after the process starts. +func (p *plugin) StartContainer(ctx context.Context, pod *api.PodSandbox, ctr *api.Container) error { + podUID := pod.GetUid() + ctrName := pprintCtr(pod, ctr) + pid := int(ctr.GetPid()) + + log.Debugf("StartContainer %s: pid=%d", ctrName, pid) + + if !p.shouldMonitorPod(pod) { + return nil + } + + monGroupDir := p.state.getMonGroupDir(podUID) + if monGroupDir == "" { + log.Debugf("StartContainer %s: no mon_group (pod not tracked), skipping", ctrName) + return nil + } + + if pid > 0 { + if err := p.rdt.writeTaskPID(monGroupDir, pid); err != nil { + log.Warnf("StartContainer %s: failed to write PID %d to tasks: %v", ctrName, pid, err) + } else { + log.Infof("StartContainer %s: assigned pid %d to mon_group %s (pre-start, no threads yet)", ctrName, pid, monGroupDir) + p.takeInitialSnapshot(podUID, monGroupDir) + } + } else { + log.Warnf("StartContainer %s: PID not available at pre-start, will retry in PostStartContainer", ctrName) + } + + return nil +} + +// PostStartContainer is called after the container process has been started. +// This is a fallback: if StartContainer did not have the PID (which should +// not happen on containerd ≥ 2.x), we write the init PID here. The init +// PID is sufficient because all child threads inherit the RMID. +func (p *plugin) PostStartContainer(ctx context.Context, pod *api.PodSandbox, ctr *api.Container) error { + podUID := pod.GetUid() + ctrName := pprintCtr(pod, ctr) + pid := int(ctr.GetPid()) + + log.Debugf("PostStartContainer %s: pid=%d", ctrName, pid) + + if !p.shouldMonitorPod(pod) { + return nil + } + + monGroupDir := p.state.getMonGroupDir(podUID) + if monGroupDir == "" { + return nil + } + + // Fallback: write the init PID if StartContainer didn't. + if pid > 0 { + if err := p.rdt.writeTaskPID(monGroupDir, pid); err != nil { + log.Warnf("PostStartContainer %s: failed to write PID %d to tasks: %v", ctrName, pid, err) + } else { + log.Infof("PostStartContainer %s: fallback assigned pid %d to mon_group %s", ctrName, pid, monGroupDir) + p.takeInitialSnapshot(podUID, monGroupDir) + } + } else { + log.Warnf("PostStartContainer %s: PID still 0 after start, unexpected", ctrName) + } + + return nil +} + +// StopContainer is called when a container is being stopped. +func (p *plugin) StopContainer(ctx context.Context, pod *api.PodSandbox, ctr *api.Container) ([]*api.ContainerUpdate, error) { + podUID := pod.GetUid() + ctrName := pprintCtr(pod, ctr) + + log.Debugf("StopContainer %s", ctrName) + + monGroupDir := p.state.getMonGroupDir(podUID) + if monGroupDir == "" { + return nil, nil + } + + p.state.removeContainer(podUID, ctr.GetId()) + + if p.state.podHasNoContainers(podUID) { + log.Infof("StopContainer %s: last container, removing mon_group %s", ctrName, monGroupDir) + p.takeFinalSnapshot(podUID, monGroupDir) + if err := p.rdt.removeMonGroup(monGroupDir); err != nil { + log.Warnf("StopContainer %s: failed to remove mon_group, will retry on next sync: %v", ctrName, err) + return nil, nil + } + p.state.removePod(podUID) + } + + return nil, nil +} + +// ensureMonGroup creates the mon_group directory if it doesn't exist and registers +// the container in the in-memory state. +// +// Limitation: all containers in a pod share a single mon_group under the first +// container's RDT class. If an allocation plugin assigns different classes to +// containers in the same pod, subsequent containers use the first class. +func (p *plugin) ensureMonGroup(podUID, containerID, rdtClass string) error { + if !looksLikePodUID(podUID) { + return fmt.Errorf("invalid pod UID %q", podUID) + } + + if p.state.getMonGroupDir(podUID) != "" { + // Mon_group already exists for this pod. Just add the container. + p.state.addContainer(podUID, containerID) + return nil + } + + monGroupDir, err := p.rdt.createMonGroup(rdtClass, podUID) + if err != nil { + return err + } + + p.state.addPod(podUID, monGroupDir) + p.state.addContainer(podUID, containerID) + log.Infof("created mon_group %s for pod %s", monGroupDir, podUID) + return nil +} + +// shouldMonitorPod checks namespace and label filters. +func (p *plugin) shouldMonitorPod(pod *api.PodSandbox) bool { + if len(p.config.Namespaces) > 0 { + ns := pod.GetNamespace() + found := false + for _, allowed := range p.config.Namespaces { + if ns == allowed { + found = true + break + } + } + if !found { + return false + } + } + if len(p.config.LabelSelector) > 0 { + labels := pod.GetLabels() + for k, v := range p.config.LabelSelector { + if labels[k] != v { + return false + } + } + } + return true +} + +// getRDTClass extracts the RDT class from a container's Linux resources. +func getRDTClass(ctr *api.Container) string { + if linux := ctr.GetLinux(); linux != nil { + if res := linux.GetResources(); res != nil { + if rdt := res.GetRdtClass(); rdt != nil { + return rdt.GetValue() + } + } + } + return "" +} + +// pprintCtr returns a human-readable container identifier. +func pprintCtr(pod *api.PodSandbox, ctr *api.Container) string { + return fmt.Sprintf("%s/%s:%s", pod.GetNamespace(), pod.GetName(), ctr.GetName()) +} + +// takeInitialSnapshot records the initial counter values for a pod's mon_group. +// It is called once per pod, after the first successful PID write. +func (p *plugin) takeInitialSnapshot(podUID, monGroupDir string) { + if p.snapshots == nil || p.state.isInitialSnapshotDone(podUID) { + return + } + counters, err := p.rdt.readMonData(monGroupDir) + if err != nil { + log.Warnf("takeInitialSnapshot: failed to read mon_data for pod %s: %v", podUID, err) + return + } + if err := p.snapshots.writeInitial(podUID, monGroupDir, counters); err != nil { + log.Warnf("takeInitialSnapshot: failed to write snapshot for pod %s: %v", podUID, err) + return + } + p.state.setInitialSnapshotDone(podUID) + log.Infof("takeInitialSnapshot: recorded initial counters for pod %s", podUID) +} + +// takeFinalSnapshot records the final counter values before a mon_group is removed. +func (p *plugin) takeFinalSnapshot(podUID, monGroupDir string) { + if p.snapshots == nil { + return + } + counters, err := p.rdt.readMonData(monGroupDir) + if err != nil { + log.Warnf("takeFinalSnapshot: failed to read mon_data for pod %s: %v", podUID, err) + return + } + if err := p.snapshots.writeFinal(podUID, counters); err != nil { + log.Warnf("takeFinalSnapshot: failed to write snapshot for pod %s: %v", podUID, err) + return + } + log.Infof("takeFinalSnapshot: recorded final counters for pod %s", podUID) + p.snapshots.pruneCompleted() +} diff --git a/cmd/plugins/resctrl-mon/plugin_test.go b/cmd/plugins/resctrl-mon/plugin_test.go new file mode 100644 index 000000000..9863d1ed0 --- /dev/null +++ b/cmd/plugins/resctrl-mon/plugin_test.go @@ -0,0 +1,376 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "os" + "path/filepath" + "strconv" + "strings" + "testing" + "time" + + "github.com/containerd/nri/pkg/api" + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func init() { + log = logrus.StandardLogger() + log.SetLevel(logrus.TraceLevel) +} + +func newTestPlugin(resctrlPath string) *plugin { + cfg := &pluginConfig{ + ResctrlPath: resctrlPath, + SnapshotDir: filepath.Join(resctrlPath, "_snapshots"), + SnapshotTTL: "5m", + } + snap, _ := newSnapshotStore(cfg.SnapshotDir, 5*time.Minute) + return &plugin{ + config: cfg, + state: newPodState(), + rdt: newResctrlOps(resctrlPath), + snapshots: snap, + } +} + +func makePod(uid, namespace, name string) *api.PodSandbox { + return &api.PodSandbox{ + Id: "sandbox-" + uid, // CRI sandbox ID != K8s pod UID + Uid: uid, + Namespace: namespace, + Name: name, + Labels: map[string]string{}, + } +} + +func makeContainer(id, name, podSandboxID string, pid uint32, rdtClass string) *api.Container { + ctr := &api.Container{ + Id: id, + PodSandboxId: podSandboxID, + Name: name, + Pid: pid, + Linux: &api.LinuxContainer{ + Resources: &api.LinuxResources{}, + }, + } + if rdtClass != "" { + ctr.Linux.Resources.RdtClass = &api.OptionalString{Value: rdtClass} + } + return ctr +} + +func TestShouldMonitorPod_NoFilters(t *testing.T) { + p := newTestPlugin("/tmp/resctrl-test") + pod := makePod("uid-1", "default", "test-pod") + assert.True(t, p.shouldMonitorPod(pod)) +} + +func TestShouldMonitorPod_NamespaceFilter(t *testing.T) { + p := newTestPlugin("/tmp/resctrl-test") + p.config.Namespaces = []string{"production", "staging"} + + pod1 := makePod("uid-1", "production", "pod1") + assert.True(t, p.shouldMonitorPod(pod1)) + + pod2 := makePod("uid-2", "kube-system", "pod2") + assert.False(t, p.shouldMonitorPod(pod2)) +} + +func TestShouldMonitorPod_LabelFilter(t *testing.T) { + p := newTestPlugin("/tmp/resctrl-test") + p.config.LabelSelector = map[string]string{"monitor": "true"} + + pod1 := makePod("uid-1", "default", "pod1") + pod1.Labels = map[string]string{"monitor": "true", "app": "web"} + assert.True(t, p.shouldMonitorPod(pod1)) + + pod2 := makePod("uid-2", "default", "pod2") + pod2.Labels = map[string]string{"app": "web"} + assert.False(t, p.shouldMonitorPod(pod2)) +} + +func TestGetRDTClass(t *testing.T) { + ctr1 := makeContainer("c1", "container1", "uid-1", 1234, "BestEffort") + assert.Equal(t, "BestEffort", getRDTClass(ctr1)) + + ctr2 := makeContainer("c2", "container2", "uid-1", 1235, "") + assert.Equal(t, "", getRDTClass(ctr2)) + + ctr3 := &api.Container{ + Id: "c3", + Name: "container3", + } + assert.Equal(t, "", getRDTClass(ctr3)) +} + +func TestPprintCtr(t *testing.T) { + pod := makePod("uid-1", "default", "my-pod") + ctr := makeContainer("c1", "my-container", "uid-1", 1234, "") + assert.Equal(t, "default/my-pod:my-container", pprintCtr(pod, ctr)) +} + +func TestPostCreateContainer_FilteredPod(t *testing.T) { + p := newTestPlugin(t.TempDir()) + p.config.Namespaces = []string{"production"} + + pod := makePod("uid-1", "default", "test-pod") + ctr := makeContainer("c1", "container1", "uid-1", 1234, "") + + err := p.PostCreateContainer(context.Background(), pod, ctr) + require.NoError(t, err) + + // Pod should not be tracked since it's not in the production namespace. + assert.Equal(t, 0, p.state.podCount()) +} + +func TestPostCreateContainer_CreatesMonGroup(t *testing.T) { + tmpDir := t.TempDir() + p := newTestPlugin(tmpDir) + + pod := makePod("a1b2c3d4-e5f6-7890-abcd-ef1234567890", "default", "test-pod") + ctr := makeContainer("c1", "container1", "a1b2c3d4-e5f6-7890-abcd-ef1234567890", 0, "") + + err := p.PostCreateContainer(context.Background(), pod, ctr) + require.NoError(t, err) + + // Pod should be tracked. + assert.Equal(t, 1, p.state.podCount()) + monDir := p.state.getMonGroupDir("a1b2c3d4-e5f6-7890-abcd-ef1234567890") + assert.Contains(t, monDir, "mon_groups/a1b2c3d4-e5f6-7890-abcd-ef1234567890") +} + +func TestPostCreateContainer_WithRDTClass(t *testing.T) { + tmpDir := t.TempDir() + p := newTestPlugin(tmpDir) + require.NoError(t, os.Mkdir(filepath.Join(tmpDir, "BestEffort"), 0755)) + + pod := makePod("a1b2c3d4-e5f6-7890-abcd-ef1234567890", "default", "test-pod") + ctr := makeContainer("c1", "container1", "a1b2c3d4-e5f6-7890-abcd-ef1234567890", 0, "BestEffort") + + err := p.PostCreateContainer(context.Background(), pod, ctr) + require.NoError(t, err) + + monDir := p.state.getMonGroupDir("a1b2c3d4-e5f6-7890-abcd-ef1234567890") + assert.Contains(t, monDir, "BestEffort/mon_groups/a1b2c3d4-e5f6-7890-abcd-ef1234567890") +} + +func TestMultiContainerPod(t *testing.T) { + tmpDir := t.TempDir() + p := newTestPlugin(tmpDir) + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + + pod := makePod(podUID, "default", "multi-pod") + ctr1 := makeContainer("c1", "container1", podUID, 0, "") + ctr2 := makeContainer("c2", "container2", podUID, 0, "") + + // First container creates the mon_group. + err := p.PostCreateContainer(context.Background(), pod, ctr1) + require.NoError(t, err) + assert.Equal(t, 1, p.state.podCount()) + + // Second container reuses the same mon_group. + err = p.PostCreateContainer(context.Background(), pod, ctr2) + require.NoError(t, err) + assert.Equal(t, 1, p.state.podCount()) // still one pod + + // Stopping first container should not remove the mon_group. + _, err = p.StopContainer(context.Background(), pod, ctr1) + require.NoError(t, err) + assert.Equal(t, 1, p.state.podCount()) + assert.False(t, p.state.podHasNoContainers(podUID)) + + // Stopping second container should remove the mon_group. + _, err = p.StopContainer(context.Background(), pod, ctr2) + require.NoError(t, err) + assert.Equal(t, 0, p.state.podCount()) +} + +func TestStopContainer_UnknownPod(t *testing.T) { + p := newTestPlugin(t.TempDir()) + + pod := makePod("unknown-uid", "default", "unknown-pod") + ctr := makeContainer("c1", "container1", "unknown-uid", 1234, "") + + updates, err := p.StopContainer(context.Background(), pod, ctr) + require.NoError(t, err) + assert.Nil(t, updates) +} + +func TestSetConfig(t *testing.T) { + p := newTestPlugin("/tmp/resctrl-test") + + configYAML := []byte(` +resctrlPath: /tmp/test-resctrl +namespaces: + - production + - staging +labelSelector: + monitor: "true" +`) + + err := p.setConfig(configYAML) + require.NoError(t, err) + assert.Equal(t, "/tmp/test-resctrl", p.config.ResctrlPath) + assert.Equal(t, []string{"production", "staging"}, p.config.Namespaces) + assert.Equal(t, map[string]string{"monitor": "true"}, p.config.LabelSelector) +} + +func TestSetConfig_InvalidYAML(t *testing.T) { + p := newTestPlugin("/tmp/resctrl-test") + + err := p.setConfig([]byte(":::invalid yaml")) + assert.Error(t, err) +} + +func TestSetConfig_RelativePath(t *testing.T) { + p := newTestPlugin("/tmp/resctrl-test") + + err := p.setConfig([]byte("resctrlPath: relative/path")) + assert.Error(t, err) + assert.Contains(t, err.Error(), "absolute path") +} + +func TestSynchronize_UsesUIDNotSandboxID(t *testing.T) { + tmpDir := t.TempDir() + p := newTestPlugin(tmpDir) + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + + pod := makePod(podUID, "default", "sync-pod") + // Container references the pod by sandbox ID, not by UID. + ctr := makeContainer("c1", "container1", pod.GetId(), 0, "") + + _, err := p.Synchronize(context.Background(), []*api.PodSandbox{pod}, []*api.Container{ctr}) + require.NoError(t, err) + + // The mon_group should be keyed by the K8s pod UID, not the sandbox ID. + assert.Equal(t, 1, p.state.podCount()) + assert.True(t, p.state.hasPod(podUID)) + assert.False(t, p.state.hasPod(pod.GetId())) + + monDir := p.state.getMonGroupDir(podUID) + assert.Contains(t, monDir, podUID) +} + +func TestEnsureMonGroup_InvalidUID(t *testing.T) { + p := newTestPlugin(t.TempDir()) + + err := p.ensureMonGroup("", "c1", "") + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid pod UID") + + err = p.ensureMonGroup("not-a-uuid", "c1", "") + assert.Error(t, err) + + assert.Equal(t, 0, p.state.podCount()) +} + +func TestStartContainer_CreatesSnapshot(t *testing.T) { + tmpDir := t.TempDir() + p := newTestPlugin(tmpDir) + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + + pod := makePod(podUID, "default", "snap-pod") + ctr := makeContainer("c1", "container1", pod.GetId(), 12345, "") + + // Create the mon_group and fake mon_data so readMonData succeeds. + err := p.ensureMonGroup(podUID, "c1", "") + require.NoError(t, err) + + monDir := p.state.getMonGroupDir(podUID) + monData := filepath.Join(monDir, "mon_data", "mon_PERF_PKG_00") + require.NoError(t, os.MkdirAll(monData, 0755)) + require.NoError(t, os.WriteFile(filepath.Join(monData, "core_energy"), []byte("10.5\n"), 0644)) + // Create a tasks file so writeTaskPID succeeds in the test sandbox. + require.NoError(t, os.WriteFile(filepath.Join(monDir, "tasks"), nil, 0644)) + + // StartContainer triggers takeInitialSnapshot. + err = p.StartContainer(context.Background(), pod, ctr) + require.NoError(t, err) + + // Verify snapshot directory with .begin files and symlink. + snapDir := filepath.Join(p.snapshots.dir, podUID) + beginFile := filepath.Join(snapDir, "mon_data", "mon_PERF_PKG_00", "core_energy.begin") + data, err := os.ReadFile(beginFile) + require.NoError(t, err) + val, err := strconv.ParseFloat(strings.TrimSpace(string(data)), 64) + require.NoError(t, err) + assert.Equal(t, 10.5, val) + + // Verify symlink exists. + linkPath := filepath.Join(snapDir, "mon_data", "mon_PERF_PKG_00", "core_energy") + _, err = os.Readlink(linkPath) + assert.NoError(t, err) + + // Verify created_at exists, completed_at does not. + _, err = os.Stat(filepath.Join(snapDir, "created_at")) + assert.NoError(t, err) + _, err = os.Stat(filepath.Join(snapDir, "completed_at")) + assert.True(t, os.IsNotExist(err)) +} + +func TestStopContainer_WritesFinalSnapshot(t *testing.T) { + tmpDir := t.TempDir() + p := newTestPlugin(tmpDir) + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + + pod := makePod(podUID, "default", "snap-pod") + ctr := makeContainer("c1", "container1", pod.GetId(), 12345, "") + + // Set up mon_group + fake mon_data. + err := p.ensureMonGroup(podUID, "c1", "") + require.NoError(t, err) + + monDir := p.state.getMonGroupDir(podUID) + monData := filepath.Join(monDir, "mon_data", "mon_PERF_PKG_00") + require.NoError(t, os.MkdirAll(monData, 0755)) + require.NoError(t, os.WriteFile(filepath.Join(monData, "core_energy"), []byte("10.5\n"), 0644)) + require.NoError(t, os.WriteFile(filepath.Join(monDir, "tasks"), nil, 0644)) + + // Take initial snapshot. + err = p.StartContainer(context.Background(), pod, ctr) + require.NoError(t, err) + + // Update the counter value for final read. + require.NoError(t, os.WriteFile(filepath.Join(monData, "core_energy"), []byte("99.9\n"), 0644)) + + // StopContainer triggers takeFinalSnapshot. + _, err = p.StopContainer(context.Background(), pod, ctr) + require.NoError(t, err) + + // Verify .begin and .end files. + snapDir := filepath.Join(p.snapshots.dir, podUID) + beginFile := filepath.Join(snapDir, "mon_data", "mon_PERF_PKG_00", "core_energy.begin") + endFile := filepath.Join(snapDir, "mon_data", "mon_PERF_PKG_00", "core_energy.end") + + beginData, err := os.ReadFile(beginFile) + require.NoError(t, err) + beginVal, err := strconv.ParseFloat(strings.TrimSpace(string(beginData)), 64) + require.NoError(t, err) + assert.Equal(t, 10.5, beginVal) + + endData, err := os.ReadFile(endFile) + require.NoError(t, err) + endVal, err := strconv.ParseFloat(strings.TrimSpace(string(endData)), 64) + require.NoError(t, err) + assert.Equal(t, 99.9, endVal) + + // Verify completed_at exists. + _, err = os.Stat(filepath.Join(snapDir, "completed_at")) + assert.NoError(t, err) +} diff --git a/cmd/plugins/resctrl-mon/resctrl.go b/cmd/plugins/resctrl-mon/resctrl.go new file mode 100644 index 000000000..985ff4fa0 --- /dev/null +++ b/cmd/plugins/resctrl-mon/resctrl.go @@ -0,0 +1,268 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "syscall" +) + +const ( + defaultResctrlPath = "/sys/fs/resctrl" + monGroupsDir = "mon_groups" +) + +// resctrlOps handles filesystem operations on the resctrl mount. +type resctrlOps struct { + resctrlPath string +} + +func newResctrlOps(resctrlPath string) *resctrlOps { + return &resctrlOps{ + resctrlPath: resctrlPath, + } +} + +// createMonGroup creates a mon_group directory under the appropriate ctrl_group +// and returns the full path. If rdtClass is empty, the mon_group is created +// under the root resctrl directory. +// +// The kernel assigns an RMID to the new mon_group on mkdir. If no RMIDs are +// available, mkdir returns ENOSPC. +func (r *resctrlOps) createMonGroup(rdtClass, podUID string) (string, error) { + parentDir := r.resctrlPath + if rdtClass != "" { + if !isValidRDTClass(rdtClass) { + return "", fmt.Errorf("invalid RDT class name %q", rdtClass) + } + parentDir = filepath.Join(r.resctrlPath, rdtClass) + } + + // When an RDT class is specified, the ctrl_group must already exist + // (created by an allocation plugin). Do not create it implicitly — + // that would make an unintended ctrl_group in the resctrl filesystem. + if rdtClass != "" { + info, err := os.Stat(parentDir) + if err != nil { + return "", fmt.Errorf("ctrl_group %s does not exist: %w", parentDir, err) + } + if !info.IsDir() { + return "", fmt.Errorf("ctrl_group %s is not a directory", parentDir) + } + } + + monGroupsPath := filepath.Join(parentDir, monGroupsDir) + monGroupDir := filepath.Join(monGroupsPath, podUID) + + // Ensure the mon_groups/ directory exists. On a real resctrl mount + // this is always present. For testing, create it if needed. + if err := os.MkdirAll(monGroupsPath, 0755); err != nil { + return "", fmt.Errorf("mon_groups dir not available at %s: %w", monGroupsPath, err) + } + + // Use Mkdir (not MkdirAll) for the final mon_group directory to + // avoid accidentally creating a ctrl_group if rdtClass is wrong. + if err := os.Mkdir(monGroupDir, 0755); err != nil { + if errors.Is(err, os.ErrExist) { + return monGroupDir, nil + } + if errors.Is(err, syscall.ENOSPC) { + return "", fmt.Errorf("no RMIDs available for pod %s: %w", podUID, err) + } + return "", fmt.Errorf("failed to create mon_group %s: %w", monGroupDir, err) + } + + return monGroupDir, nil +} + +// removeMonGroup removes a mon_group directory. The kernel releases the RMID. +func (r *resctrlOps) removeMonGroup(monGroupDir string) error { + err := os.Remove(monGroupDir) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return fmt.Errorf("failed to remove mon_group %s: %w", monGroupDir, err) + } + return nil +} + +// writeTaskPID writes a PID to the mon_group's tasks file. The kernel assigns +// this PID (and all future child processes) to the mon_group's RMID. +func (r *resctrlOps) writeTaskPID(monGroupDir string, pid int) error { + tasksFile := filepath.Join(monGroupDir, "tasks") + f, err := os.OpenFile(tasksFile, os.O_WRONLY, 0) + if err != nil { + return fmt.Errorf("failed to open %s for pid %d: %w", tasksFile, pid, err) + } + defer f.Close() + data := []byte(strconv.Itoa(pid) + "\n") + if _, err := f.Write(data); err != nil { + return fmt.Errorf("failed to write pid %d to %s: %w", pid, tasksFile, err) + } + return nil +} + +// readMonData reads all monitoring event files from a mon_group's mon_data/ +// directory. It returns a nested map: domain → event → value. +// +// For example, a mon_group with PERF_PKG and L3 monitoring will return: +// +// { +// "mon_PERF_PKG_00": {"activity": 1234, "core_energy": 42.123}, +// "mon_L3_00": {"llc_occupancy": 12345, "mbm_local_bytes": 0, "mbm_total_bytes": 67890}, +// } +func (r *resctrlOps) readMonData(monGroupDir string) (map[string]map[string]float64, error) { + monDataDir := filepath.Join(monGroupDir, "mon_data") + domains, err := os.ReadDir(monDataDir) + if err != nil { + return nil, fmt.Errorf("failed to read mon_data directory %s: %w", monDataDir, err) + } + + result := make(map[string]map[string]float64) + for _, domain := range domains { + if !domain.IsDir() || !strings.HasPrefix(domain.Name(), "mon_") { + continue + } + domainDir := filepath.Join(monDataDir, domain.Name()) + events, err := os.ReadDir(domainDir) + if err != nil { + log.Warnf("readMonData: failed to read domain directory %s: %v", domainDir, err) + continue + } + domainMap := make(map[string]float64) + for _, event := range events { + if event.IsDir() || event.Name() == "tasks" { + continue + } + eventFile := filepath.Join(domainDir, event.Name()) + data, err := os.ReadFile(eventFile) + if err != nil { + log.Warnf("readMonData: failed to read %s: %v", eventFile, err) + continue + } + val, err := strconv.ParseFloat(strings.TrimSpace(string(data)), 64) + if err != nil { + log.Warnf("readMonData: failed to parse %s value %q: %v", eventFile, string(data), err) + continue + } + domainMap[event.Name()] = val + } + if len(domainMap) > 0 { + result[domain.Name()] = domainMap + } + } + return result, nil +} + +// cleanOrphanedMonGroups removes mon_group directories that are not tracked +// in the given state. This handles cleanup after a plugin crash/restart. +func (r *resctrlOps) cleanOrphanedMonGroups(state *podState) { + // Scan root-level mon_groups. + r.cleanOrphanedInDir(filepath.Join(r.resctrlPath, monGroupsDir), state) + + // Scan ctrl_group-level mon_groups. + entries, err := os.ReadDir(r.resctrlPath) + if err != nil { + log.Warnf("cleanOrphanedMonGroups: failed to read %s: %v", r.resctrlPath, err) + return + } + for _, entry := range entries { + if !entry.IsDir() { + continue + } + name := entry.Name() + // Skip non-ctrl_group entries. + if name == monGroupsDir || name == "info" || strings.HasPrefix(name, "mon_") { + continue + } + ctrlGroupMonDir := filepath.Join(r.resctrlPath, name, monGroupsDir) + r.cleanOrphanedInDir(ctrlGroupMonDir, state) + } +} + +// cleanOrphanedInDir removes mon_group directories in a specific mon_groups/ +// directory that look like pod UIDs but are not tracked in state. +func (r *resctrlOps) cleanOrphanedInDir(monGroupsPath string, state *podState) { + entries, err := os.ReadDir(monGroupsPath) + if err != nil { + if !errors.Is(err, os.ErrNotExist) { + log.Warnf("failed to read mon_groups directory %s: %v", monGroupsPath, err) + } + return + } + for _, entry := range entries { + if !entry.IsDir() { + continue + } + name := entry.Name() + // Only clean directories that look like pod UIDs (contain dashes like UUIDs). + if !looksLikePodUID(name) { + continue + } + orphanDir := filepath.Join(monGroupsPath, name) + trackedDir := state.getMonGroupDir(name) + if trackedDir == orphanDir { + // This is the active mon_group for this pod. + continue + } + log.Infof("removing orphaned mon_group %s", orphanDir) + if err := os.Remove(orphanDir); err != nil && !errors.Is(err, os.ErrNotExist) { + log.Warnf("failed to remove orphaned mon_group %s: %v", orphanDir, err) + } + } +} + +// looksLikePodUID returns true if the name looks like a Kubernetes pod UID +// (UUID format with dashes, e.g., a1b2c3d4-e5f6-7890-abcd-ef1234567890). +func looksLikePodUID(name string) bool { + if len(name) != 36 { + return false + } + // Check for UUID-like pattern: 8-4-4-4-12 hex chars. + parts := strings.Split(name, "-") + if len(parts) != 5 { + return false + } + expectedLens := []int{8, 4, 4, 4, 12} + for i, part := range parts { + if len(part) != expectedLens[i] { + return false + } + for _, c := range part { + if !((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { + return false + } + } + } + return true +} + +// isValidRDTClass returns true if the name is a safe resctrl ctrl_group name. +// It rejects path separators, dot-segments, and empty strings to prevent +// path traversal outside the resctrl mount. +func isValidRDTClass(name string) bool { + if name == "" || name == "." || name == ".." { + return false + } + for _, c := range name { + if c == '/' || c == 0 { + return false + } + } + return true +} diff --git a/cmd/plugins/resctrl-mon/resctrl_test.go b/cmd/plugins/resctrl-mon/resctrl_test.go new file mode 100644 index 000000000..760a6f39e --- /dev/null +++ b/cmd/plugins/resctrl-mon/resctrl_test.go @@ -0,0 +1,277 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCreateMonGroup_RootClass(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + dir, err := r.createMonGroup("", "pod-uid-1") + require.NoError(t, err) + assert.Equal(t, filepath.Join(tmpDir, "mon_groups", "pod-uid-1"), dir) + + // Directory should exist. + info, err := os.Stat(dir) + require.NoError(t, err) + assert.True(t, info.IsDir()) +} + +func TestCreateMonGroup_WithRDTClass(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + require.NoError(t, os.Mkdir(filepath.Join(tmpDir, "BestEffort"), 0755)) + + dir, err := r.createMonGroup("BestEffort", "pod-uid-2") + require.NoError(t, err) + assert.Equal(t, filepath.Join(tmpDir, "BestEffort", "mon_groups", "pod-uid-2"), dir) + + info, err := os.Stat(dir) + require.NoError(t, err) + assert.True(t, info.IsDir()) +} + +func TestCreateMonGroup_MissingCtrlGroup(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + // Attempt to create a mon_group under a non-existent ctrl_group. + _, err := r.createMonGroup("NoSuchClass", "pod-uid-3") + assert.Error(t, err) + assert.Contains(t, err.Error(), "ctrl_group") + + // Verify the ctrl_group was NOT created. + _, err = os.Stat(filepath.Join(tmpDir, "NoSuchClass")) + assert.True(t, os.IsNotExist(err)) +} + +func TestCreateMonGroup_Idempotent(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + dir1, err := r.createMonGroup("", "pod-uid-1") + require.NoError(t, err) + + dir2, err := r.createMonGroup("", "pod-uid-1") + require.NoError(t, err) + + assert.Equal(t, dir1, dir2) +} + +func TestRemoveMonGroup(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + dir, err := r.createMonGroup("", "pod-uid-1") + require.NoError(t, err) + + err = r.removeMonGroup(dir) + require.NoError(t, err) + + _, err = os.Stat(dir) + assert.True(t, os.IsNotExist(err)) +} + +func TestRemoveMonGroup_NotExist(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + err := r.removeMonGroup(filepath.Join(tmpDir, "mon_groups", "nonexistent")) + assert.NoError(t, err) +} + +func TestWriteTaskPID(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + dir, err := r.createMonGroup("", "pod-uid-1") + require.NoError(t, err) + + // In real resctrl, the kernel creates the tasks file when the + // mon_group directory is created. Simulate that here. + tasksFile := filepath.Join(dir, "tasks") + require.NoError(t, os.WriteFile(tasksFile, nil, 0644)) + + err = r.writeTaskPID(dir, 12345) + require.NoError(t, err) + + data, err := os.ReadFile(tasksFile) + require.NoError(t, err) + assert.Equal(t, "12345\n", string(data)) +} + +func TestCleanOrphanedMonGroups(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + state := newPodState() + + // Create a mon_group that IS tracked. + trackedUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + dir, err := r.createMonGroup("", trackedUID) + require.NoError(t, err) + state.addPod(trackedUID, dir) + + // Create a mon_group that is NOT tracked (orphan). + orphanUID := "deadbeef-dead-beef-dead-beefdeadbeef" + _, err = r.createMonGroup("", orphanUID) + require.NoError(t, err) + + r.cleanOrphanedMonGroups(state) + + // Tracked should still exist. + _, err = os.Stat(filepath.Join(tmpDir, "mon_groups", trackedUID)) + assert.NoError(t, err) + + // Orphan should be removed. + _, err = os.Stat(filepath.Join(tmpDir, "mon_groups", orphanUID)) + assert.True(t, os.IsNotExist(err)) +} + +func TestCleanOrphanedMonGroups_CtrlGroup(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + state := newPodState() + + // Create orphan under a ctrl_group. + orphanUID := "deadbeef-dead-beef-dead-beefdeadbeef" + require.NoError(t, os.Mkdir(filepath.Join(tmpDir, "BestEffort"), 0755)) + _, err := r.createMonGroup("BestEffort", orphanUID) + require.NoError(t, err) + + r.cleanOrphanedMonGroups(state) + + _, err = os.Stat(filepath.Join(tmpDir, "BestEffort", "mon_groups", orphanUID)) + assert.True(t, os.IsNotExist(err)) +} + +func TestCleanOrphanedMonGroups_StaleLocation(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + state := newPodState() + + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + + // Create a mon_group under BestEffort (simulates previous run). + require.NoError(t, os.Mkdir(filepath.Join(tmpDir, "BestEffort"), 0755)) + _, err := r.createMonGroup("BestEffort", podUID) + require.NoError(t, err) + + // Track the pod at the root class (simulates current run with different RDT class). + rootDir, err := r.createMonGroup("", podUID) + require.NoError(t, err) + state.addPod(podUID, rootDir) + + r.cleanOrphanedMonGroups(state) + + // Root mon_group (tracked) should still exist. + _, err = os.Stat(rootDir) + assert.NoError(t, err) + + // BestEffort mon_group (stale) should be removed. + _, err = os.Stat(filepath.Join(tmpDir, "BestEffort", "mon_groups", podUID)) + assert.True(t, os.IsNotExist(err)) +} + +func TestLooksLikePodUID(t *testing.T) { + assert.True(t, looksLikePodUID("a1b2c3d4-e5f6-7890-abcd-ef1234567890")) + assert.True(t, looksLikePodUID("DEADBEEF-DEAD-BEEF-DEAD-BEEFDEADBEEF")) + assert.True(t, looksLikePodUID("00000000-0000-0000-0000-000000000000")) + + assert.False(t, looksLikePodUID("short")) + assert.False(t, looksLikePodUID("not-a-uuid-at-all-nope-notthisone!")) + assert.False(t, looksLikePodUID("a1b2c3d4-e5f6-7890-abcd-ef123456789")) // too short last segment + assert.False(t, looksLikePodUID("g1b2c3d4-e5f6-7890-abcd-ef1234567890")) // 'g' is not hex + assert.False(t, looksLikePodUID("a1b2c3d4-e5f6-7890-abcd-ef1234567890x")) // too long +} + +func TestIsValidRDTClass(t *testing.T) { + assert.True(t, isValidRDTClass("BestEffort")) + assert.True(t, isValidRDTClass("Guaranteed")) + assert.True(t, isValidRDTClass("COS1")) + assert.True(t, isValidRDTClass("my-class_v2")) + + assert.False(t, isValidRDTClass("")) + assert.False(t, isValidRDTClass(".")) + assert.False(t, isValidRDTClass("..")) + assert.False(t, isValidRDTClass("../../etc")) + assert.False(t, isValidRDTClass("foo/bar")) + assert.False(t, isValidRDTClass("class\x00name")) +} + +func TestCreateMonGroup_PathTraversal(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + _, err := r.createMonGroup("../../etc", "a1b2c3d4-e5f6-7890-abcd-ef1234567890") + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid RDT class") + + _, err = r.createMonGroup("foo/bar", "a1b2c3d4-e5f6-7890-abcd-ef1234567890") + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid RDT class") + + _, err = r.createMonGroup("..", "a1b2c3d4-e5f6-7890-abcd-ef1234567890") + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid RDT class") +} + +func TestReadMonData(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + // Create a fake mon_group with mon_data directories. + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + monGroupDir, err := r.createMonGroup("", podUID) + require.NoError(t, err) + + monData := filepath.Join(monGroupDir, "mon_data") + require.NoError(t, os.MkdirAll(filepath.Join(monData, "mon_PERF_PKG_00"), 0755)) + require.NoError(t, os.MkdirAll(filepath.Join(monData, "mon_PERF_PKG_01"), 0755)) + require.NoError(t, os.MkdirAll(filepath.Join(monData, "mon_L3_00"), 0755)) + + require.NoError(t, os.WriteFile(filepath.Join(monData, "mon_PERF_PKG_00", "core_energy"), []byte("42.123456\n"), 0644)) + require.NoError(t, os.WriteFile(filepath.Join(monData, "mon_PERF_PKG_01", "core_energy"), []byte("0.000000\n"), 0644)) + require.NoError(t, os.WriteFile(filepath.Join(monData, "mon_L3_00", "llc_occupancy"), []byte("12345\n"), 0644)) + require.NoError(t, os.WriteFile(filepath.Join(monData, "mon_L3_00", "mbm_total_bytes"), []byte("67890\n"), 0644)) + + result, err := r.readMonData(monGroupDir) + require.NoError(t, err) + + assert.Equal(t, 42.123456, result["mon_PERF_PKG_00"]["core_energy"]) + assert.Equal(t, 0.0, result["mon_PERF_PKG_01"]["core_energy"]) + assert.Equal(t, 12345.0, result["mon_L3_00"]["llc_occupancy"]) + assert.Equal(t, 67890.0, result["mon_L3_00"]["mbm_total_bytes"]) +} + +func TestReadMonData_NoMonData(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + // mon_group with no mon_data directory. + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + monGroupDir, err := r.createMonGroup("", podUID) + require.NoError(t, err) + + _, err = r.readMonData(monGroupDir) + assert.Error(t, err) +} diff --git a/cmd/plugins/resctrl-mon/snapshot.go b/cmd/plugins/resctrl-mon/snapshot.go new file mode 100644 index 000000000..a6ca93dde --- /dev/null +++ b/cmd/plugins/resctrl-mon/snapshot.go @@ -0,0 +1,201 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "fmt" + "os" + "path/filepath" + "regexp" + "strconv" + "strings" + "time" +) + +// snapshotUIDRe validates that a pod UID is safe for use as a directory name. +var snapshotUIDRe = regexp.MustCompile(`^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$`) + +// snapshotStore manages a directory tree that mirrors the resctrl mon_data +// layout, augmenting it with .begin/.end counter files and symlinks to the +// live kernel counters. +// +// Layout for each pod: +// +// // +// created_at # RFC 3339 timestamp +// completed_at # RFC 3339 timestamp (written on stop) +// mon_data/ +// mon_PERF_PKG_00/ +// core_energy → /sys/fs/resctrl/.../core_energy (symlink) +// core_energy.begin # counter value at mon_group creation +// core_energy.end # counter value at mon_group removal +// mon_L3_00/ +// llc_occupancy → ... +// llc_occupancy.begin +// llc_occupancy.end +type snapshotStore struct { + dir string + ttl time.Duration +} + +func newSnapshotStore(dir string, ttl time.Duration) (*snapshotStore, error) { + if err := os.MkdirAll(dir, 0700); err != nil { + return nil, fmt.Errorf("failed to create snapshot directory %s: %w", dir, err) + } + // Tighten permissions in case the directory already existed with looser mode. + if err := os.Chmod(dir, 0700); err != nil { + return nil, fmt.Errorf("failed to chmod snapshot directory %s: %w", dir, err) + } + return &snapshotStore{dir: dir, ttl: ttl}, nil +} + +// podDir returns the validated directory path for a pod's snapshot tree. +func (s *snapshotStore) podDir(podUID string) (string, error) { + if !snapshotUIDRe.MatchString(podUID) { + return "", fmt.Errorf("invalid pod UID for snapshot directory: %q", podUID) + } + return filepath.Join(s.dir, podUID), nil +} + +// writeInitial creates the mirror directory tree with .begin counter files +// and symlinks to the live resctrl event files. The created_at file is +// written last as a commit marker. +func (s *snapshotStore) writeInitial(podUID, monGroupDir string, counters map[string]map[string]float64) error { + podDir, err := s.podDir(podUID) + if err != nil { + return err + } + + for domain, events := range counters { + if !isSafePathComponent(domain) { + log.Warnf("writeInitial: skipping invalid domain name %q", domain) + continue + } + domainDir := filepath.Join(podDir, "mon_data", domain) + if err := os.MkdirAll(domainDir, 0700); err != nil { + return fmt.Errorf("failed to create domain directory %s: %w", domainDir, err) + } + srcDomainDir := filepath.Join(monGroupDir, "mon_data", domain) + for event, val := range events { + if !isSafePathComponent(event) { + log.Warnf("writeInitial: skipping invalid event name %q", event) + continue + } + // Write .begin file. + if err := writeCounterFile(filepath.Join(domainDir, event+".begin"), val); err != nil { + return err + } + // Create symlink to live counter. + linkPath := filepath.Join(domainDir, event) + os.Remove(linkPath) // idempotent: remove stale symlink on re-create + if err := os.Symlink(filepath.Join(srcDomainDir, event), linkPath); err != nil { + log.Warnf("writeInitial: failed to create symlink %s: %v", linkPath, err) + } + } + } + + // Write created_at last as commit marker. + return os.WriteFile(filepath.Join(podDir, "created_at"), + []byte(time.Now().UTC().Format(time.RFC3339Nano)+"\n"), 0600) +} + +// writeFinal writes .end counter files and a completed_at timestamp. +// The completed_at file is written last as a commit marker so consumers +// know all .end files are present. +func (s *snapshotStore) writeFinal(podUID string, counters map[string]map[string]float64) error { + podDir, err := s.podDir(podUID) + if err != nil { + return err + } + + for domain, events := range counters { + if !isSafePathComponent(domain) { + log.Warnf("writeFinal: skipping invalid domain name %q", domain) + continue + } + domainDir := filepath.Join(podDir, "mon_data", domain) + if err := os.MkdirAll(domainDir, 0700); err != nil { + return fmt.Errorf("failed to create domain directory %s: %w", domainDir, err) + } + for event, val := range events { + if !isSafePathComponent(event) { + log.Warnf("writeFinal: skipping invalid event name %q", event) + continue + } + if err := writeCounterFile(filepath.Join(domainDir, event+".end"), val); err != nil { + return err + } + // Remove the symlink to the (now-removed) live counter. + // After .end is written the symlink is dangling and serves no purpose. + linkPath := filepath.Join(domainDir, event) + os.Remove(linkPath) + } + } + + // Write completed_at last as commit marker. + return os.WriteFile(filepath.Join(podDir, "completed_at"), + []byte(time.Now().UTC().Format(time.RFC3339Nano)+"\n"), 0600) +} + +// pruneCompleted removes snapshot directories for completed pods older than the TTL. +func (s *snapshotStore) pruneCompleted() { + entries, err := os.ReadDir(s.dir) + if err != nil { + log.Warnf("pruneCompleted: failed to read snapshot directory %s: %v", s.dir, err) + return + } + + cutoff := time.Now().UTC().Add(-s.ttl) + for _, entry := range entries { + if !entry.IsDir() || !snapshotUIDRe.MatchString(entry.Name()) { + continue + } + podDir := filepath.Join(s.dir, entry.Name()) + data, err := os.ReadFile(filepath.Join(podDir, "completed_at")) + if err != nil { + continue // not completed yet, or not a snapshot dir + } + ts, err := time.Parse(time.RFC3339Nano, strings.TrimSpace(string(data))) + if err != nil { + continue + } + if ts.Before(cutoff) { + if err := os.RemoveAll(podDir); err != nil { + log.Warnf("pruneCompleted: failed to remove %s: %v", podDir, err) + } else { + log.Debugf("pruneCompleted: removed expired snapshot %s", podDir) + } + } + } +} + +// writeCounterFile writes a single counter value in the same text format +// as the kernel's resctrl event files (one number per line, no trailing zeros). +func writeCounterFile(path string, val float64) error { + s := strconv.FormatFloat(val, 'f', -1, 64) + if err := os.WriteFile(path, []byte(s+"\n"), 0600); err != nil { + return fmt.Errorf("failed to write counter file %s: %w", path, err) + } + return nil +} + +// isSafePathComponent returns true if name is a single, non-empty filename +// component with no path separators or traversal sequences. +func isSafePathComponent(name string) bool { + return name != "" && + name != "." && + name != ".." && + !strings.ContainsAny(name, "/\\") +} diff --git a/cmd/plugins/resctrl-mon/snapshot_test.go b/cmd/plugins/resctrl-mon/snapshot_test.go new file mode 100644 index 000000000..95101efbc --- /dev/null +++ b/cmd/plugins/resctrl-mon/snapshot_test.go @@ -0,0 +1,266 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "os" + "path/filepath" + "strconv" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// mustPodDir is a test helper that calls podDir and fails on error. +func mustPodDir(t *testing.T, s *snapshotStore, podUID string) string { + t.Helper() + p, err := s.podDir(podUID) + require.NoError(t, err) + return p +} + +// readCounterFile reads and parses a counter value file in the test. +func readCounterFile(t *testing.T, path string) float64 { + t.Helper() + data, err := os.ReadFile(path) + require.NoError(t, err) + val, err := strconv.ParseFloat(strings.TrimSpace(string(data)), 64) + require.NoError(t, err) + return val +} + +func TestPodDir_RejectsTraversal(t *testing.T) { + store, err := newSnapshotStore(t.TempDir(), 5*time.Minute) + require.NoError(t, err) + + for _, bad := range []string{"../etc/passwd", "../../root", "not-a-uid", "", "a1b2c3d4-e5f6-7890-ABCD-ef1234567890"} { + _, err := store.podDir(bad) + assert.Error(t, err, "should reject %q", bad) + } +} + +func TestIsSafePathComponent(t *testing.T) { + for _, safe := range []string{"mon_PERF_PKG_00", "core_energy", "llc_occupancy", "mbm_total_bytes"} { + assert.True(t, isSafePathComponent(safe), "%q should be safe", safe) + } + for _, bad := range []string{"", ".", "..", "../etc", "foo/bar", "foo\\bar"} { + assert.False(t, isSafePathComponent(bad), "%q should be rejected", bad) + } +} + +func TestNewSnapshotStore_DirPermissions(t *testing.T) { + parent := t.TempDir() + dir := filepath.Join(parent, "restricted") + _, err := newSnapshotStore(dir, 5*time.Minute) + require.NoError(t, err) + + info, err := os.Stat(dir) + require.NoError(t, err) + assert.Equal(t, os.FileMode(0700), info.Mode().Perm()) +} + +func TestNewSnapshotStore_CreatesDir(t *testing.T) { + parent := t.TempDir() + dir := filepath.Join(parent, "sub", "dir") + + store, err := newSnapshotStore(dir, 5*time.Minute) + require.NoError(t, err) + assert.Equal(t, dir, store.dir) + + info, err := os.Stat(dir) + require.NoError(t, err) + assert.True(t, info.IsDir()) +} + +func TestWriteInitial(t *testing.T) { + dir := t.TempDir() + store, err := newSnapshotStore(dir, 5*time.Minute) + require.NoError(t, err) + + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + monGroupDir := "/sys/fs/resctrl/mon_groups/" + podUID + counters := map[string]map[string]float64{ + "mon_PERF_PKG_00": {"core_energy": 42.123, "activity": 100.5}, + "mon_L3_00": {"llc_occupancy": 12345}, + } + + err = store.writeInitial(podUID, monGroupDir, counters) + require.NoError(t, err) + + podDir := mustPodDir(t, store, podUID) + + // Verify .begin files. + assert.Equal(t, 42.123, readCounterFile(t, filepath.Join(podDir, "mon_data", "mon_PERF_PKG_00", "core_energy.begin"))) + assert.Equal(t, 100.5, readCounterFile(t, filepath.Join(podDir, "mon_data", "mon_PERF_PKG_00", "activity.begin"))) + assert.Equal(t, 12345.0, readCounterFile(t, filepath.Join(podDir, "mon_data", "mon_L3_00", "llc_occupancy.begin"))) + + // Verify symlinks point to resctrl. + target, err := os.Readlink(filepath.Join(podDir, "mon_data", "mon_PERF_PKG_00", "core_energy")) + require.NoError(t, err) + assert.Equal(t, filepath.Join(monGroupDir, "mon_data", "mon_PERF_PKG_00", "core_energy"), target) + + // Verify created_at exists and is valid. + data, err := os.ReadFile(filepath.Join(podDir, "created_at")) + require.NoError(t, err) + _, err = time.Parse(time.RFC3339Nano, strings.TrimSpace(string(data))) + assert.NoError(t, err) + + // Verify no completed_at yet. + _, err = os.Stat(filepath.Join(podDir, "completed_at")) + assert.True(t, os.IsNotExist(err)) + + // Verify .begin file permissions. + info, err := os.Stat(filepath.Join(podDir, "mon_data", "mon_PERF_PKG_00", "core_energy.begin")) + require.NoError(t, err) + assert.Equal(t, os.FileMode(0600), info.Mode().Perm()) +} + +func TestWriteFinal(t *testing.T) { + dir := t.TempDir() + store, err := newSnapshotStore(dir, 5*time.Minute) + require.NoError(t, err) + + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + initial := map[string]map[string]float64{ + "mon_PERF_PKG_00": {"core_energy": 42.123}, + } + final := map[string]map[string]float64{ + "mon_PERF_PKG_00": {"core_energy": 99.456}, + } + + err = store.writeInitial(podUID, "/fake/mon_groups/"+podUID, initial) + require.NoError(t, err) + + err = store.writeFinal(podUID, final) + require.NoError(t, err) + + podDir := mustPodDir(t, store, podUID) + + // Verify .begin preserved. + assert.Equal(t, 42.123, readCounterFile(t, filepath.Join(podDir, "mon_data", "mon_PERF_PKG_00", "core_energy.begin"))) + // Verify .end written. + assert.Equal(t, 99.456, readCounterFile(t, filepath.Join(podDir, "mon_data", "mon_PERF_PKG_00", "core_energy.end"))) + // Verify symlink removed (was dangling after mon_group removal). + _, err = os.Lstat(filepath.Join(podDir, "mon_data", "mon_PERF_PKG_00", "core_energy")) + assert.True(t, os.IsNotExist(err), "symlink should be removed after writeFinal") + // Verify completed_at exists. + data, err := os.ReadFile(filepath.Join(podDir, "completed_at")) + require.NoError(t, err) + _, err = time.Parse(time.RFC3339Nano, strings.TrimSpace(string(data))) + assert.NoError(t, err) +} + +func TestWriteFinal_NoInitial(t *testing.T) { + dir := t.TempDir() + store, err := newSnapshotStore(dir, 5*time.Minute) + require.NoError(t, err) + + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + final := map[string]map[string]float64{ + "mon_PERF_PKG_00": {"core_energy": 99.456}, + } + + err = store.writeFinal(podUID, final) + require.NoError(t, err) + + podDir := mustPodDir(t, store, podUID) + assert.Equal(t, 99.456, readCounterFile(t, filepath.Join(podDir, "mon_data", "mon_PERF_PKG_00", "core_energy.end"))) + + // No .begin file should exist. + _, err = os.Stat(filepath.Join(podDir, "mon_data", "mon_PERF_PKG_00", "core_energy.begin")) + assert.True(t, os.IsNotExist(err)) + + // completed_at should exist. + _, err = os.Stat(filepath.Join(podDir, "completed_at")) + assert.NoError(t, err) +} + +func TestPruneCompleted(t *testing.T) { + dir := t.TempDir() + store, err := newSnapshotStore(dir, 1*time.Millisecond) // very short TTL + require.NoError(t, err) + + // Create a completed snapshot. + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + err = store.writeInitial(podUID, "/fake/mon_groups/"+podUID, map[string]map[string]float64{ + "mon_PERF_PKG_00": {"core_energy": 1.0}, + }) + require.NoError(t, err) + err = store.writeFinal(podUID, map[string]map[string]float64{ + "mon_PERF_PKG_00": {"core_energy": 2.0}, + }) + require.NoError(t, err) + + // Create an active snapshot (no completed_at). + activeUID := "deadbeef-dead-beef-dead-beefdeadbeef" + err = store.writeInitial(activeUID, "/fake/mon_groups/"+activeUID, map[string]map[string]float64{ + "mon_PERF_PKG_00": {"core_energy": 5.0}, + }) + require.NoError(t, err) + + // Wait for TTL to expire. + time.Sleep(5 * time.Millisecond) + + store.pruneCompleted() + + // Completed should be removed. + _, err = os.Stat(mustPodDir(t, store, podUID)) + assert.True(t, os.IsNotExist(err)) + + // Active should still exist. + _, err = os.Stat(mustPodDir(t, store, activeUID)) + assert.NoError(t, err) +} + +func TestPruneCompleted_KeepsRecent(t *testing.T) { + dir := t.TempDir() + store, err := newSnapshotStore(dir, 1*time.Hour) // long TTL + require.NoError(t, err) + + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + err = store.writeInitial(podUID, "/fake/mon_groups/"+podUID, map[string]map[string]float64{ + "mon_PERF_PKG_00": {"core_energy": 1.0}, + }) + require.NoError(t, err) + err = store.writeFinal(podUID, map[string]map[string]float64{ + "mon_PERF_PKG_00": {"core_energy": 2.0}, + }) + require.NoError(t, err) + + store.pruneCompleted() + + // Should still exist (TTL not expired). + _, err = os.Stat(mustPodDir(t, store, podUID)) + assert.NoError(t, err) +} + +func TestWriteCounterFile_IntegerFormat(t *testing.T) { + f := filepath.Join(t.TempDir(), "test") + require.NoError(t, writeCounterFile(f, 3833856)) + data, err := os.ReadFile(f) + require.NoError(t, err) + assert.Equal(t, "3833856\n", string(data)) +} + +func TestWriteCounterFile_FloatFormat(t *testing.T) { + f := filepath.Join(t.TempDir(), "test") + require.NoError(t, writeCounterFile(f, 792.341167)) + data, err := os.ReadFile(f) + require.NoError(t, err) + assert.Equal(t, "792.341167\n", string(data)) +} diff --git a/cmd/plugins/resctrl-mon/state.go b/cmd/plugins/resctrl-mon/state.go new file mode 100644 index 000000000..f3373bc9f --- /dev/null +++ b/cmd/plugins/resctrl-mon/state.go @@ -0,0 +1,129 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import "sync" + +// podInfo tracks the mon_group directory and container set for a single pod. +type podInfo struct { + monGroupDir string + containers map[string]struct{} // container IDs + initialSnapshotDone bool +} + +// podState tracks all pods with active mon_groups. +type podState struct { + mu sync.Mutex + pods map[string]*podInfo // keyed by pod UID +} + +func newPodState() *podState { + return &podState{ + pods: make(map[string]*podInfo), + } +} + +// addPod registers a new pod with its mon_group directory. +// If the pod already exists, the existing entry is preserved. +func (s *podState) addPod(podUID, monGroupDir string) { + s.mu.Lock() + defer s.mu.Unlock() + if _, ok := s.pods[podUID]; ok { + return + } + s.pods[podUID] = &podInfo{ + monGroupDir: monGroupDir, + containers: make(map[string]struct{}), + } +} + +// addContainer adds a container ID to an existing pod's tracking. +func (s *podState) addContainer(podUID, containerID string) { + s.mu.Lock() + defer s.mu.Unlock() + if info, ok := s.pods[podUID]; ok { + info.containers[containerID] = struct{}{} + } +} + +// removeContainer removes a container ID from a pod's tracking. +func (s *podState) removeContainer(podUID, containerID string) { + s.mu.Lock() + defer s.mu.Unlock() + if info, ok := s.pods[podUID]; ok { + delete(info.containers, containerID) + } +} + +// removePod removes all tracking for a pod. +func (s *podState) removePod(podUID string) { + s.mu.Lock() + defer s.mu.Unlock() + delete(s.pods, podUID) +} + +// getMonGroupDir returns the mon_group directory for a pod, or empty string. +func (s *podState) getMonGroupDir(podUID string) string { + s.mu.Lock() + defer s.mu.Unlock() + if info, ok := s.pods[podUID]; ok { + return info.monGroupDir + } + return "" +} + +// podHasNoContainers returns true if the pod has no remaining containers. +func (s *podState) podHasNoContainers(podUID string) bool { + s.mu.Lock() + defer s.mu.Unlock() + if info, ok := s.pods[podUID]; ok { + return len(info.containers) == 0 + } + return true +} + +// hasPod returns true if the pod UID is being tracked. +func (s *podState) hasPod(podUID string) bool { + s.mu.Lock() + defer s.mu.Unlock() + _, ok := s.pods[podUID] + return ok +} + +// podCount returns the number of tracked pods. +func (s *podState) podCount() int { + s.mu.Lock() + defer s.mu.Unlock() + return len(s.pods) +} + +// setInitialSnapshotDone marks the initial counter snapshot as taken. +func (s *podState) setInitialSnapshotDone(podUID string) { + s.mu.Lock() + defer s.mu.Unlock() + if info, ok := s.pods[podUID]; ok { + info.initialSnapshotDone = true + } +} + +// isInitialSnapshotDone returns true if the initial snapshot was already taken. +func (s *podState) isInitialSnapshotDone(podUID string) bool { + s.mu.Lock() + defer s.mu.Unlock() + if info, ok := s.pods[podUID]; ok { + return info.initialSnapshotDone + } + return false +} diff --git a/deployment/helm/resctrl-mon/.helmignore b/deployment/helm/resctrl-mon/.helmignore new file mode 100644 index 000000000..bf4d580e7 --- /dev/null +++ b/deployment/helm/resctrl-mon/.helmignore @@ -0,0 +1,20 @@ +# Patterns to ignore when building packages. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/deployment/helm/resctrl-mon/Chart.yaml b/deployment/helm/resctrl-mon/Chart.yaml new file mode 100644 index 000000000..f1e3e0d70 --- /dev/null +++ b/deployment/helm/resctrl-mon/Chart.yaml @@ -0,0 +1,11 @@ +apiVersion: v2 +appVersion: unstable +description: | + The resctrl-mon NRI plugin creates per-pod resctrl monitoring groups + (mon_groups) for Application Energy Telemetry via Kepler passive mode. +name: nri-resctrl-mon +sources: + - https://github.com/containers/nri-plugins +home: https://github.com/containers/nri-plugins +type: application +version: v0.0.0 diff --git a/deployment/helm/resctrl-mon/README.md b/deployment/helm/resctrl-mon/README.md new file mode 100644 index 000000000..534eafd8a --- /dev/null +++ b/deployment/helm/resctrl-mon/README.md @@ -0,0 +1,123 @@ +# Resctrl-Mon Plugin + +This chart deploys the resctrl-mon Node Resource Interface (NRI) plugin. The +resctrl-mon NRI plugin creates per-pod resctrl monitoring groups (mon_groups) +to support Application Energy Telemetry (AET) via Kepler passive mode. + +## Prerequisites + +- Kubernetes 1.24+ +- Helm 3.0.0+ +- Intel CPU with RDT monitoring support (CMT/MBM and/or AET) +- resctrl filesystem mounted at `/sys/fs/resctrl` +- Container runtime: + - containerd: + - At least [containerd 1.7.0](https://github.com/containerd/containerd/releases/tag/v1.7.0) + release version to use the NRI feature. + + - Enable NRI feature by following + [these](https://github.com/containerd/containerd/blob/main/docs/NRI.md#enabling-nri-support-in-containerd) + detailed instructions. You can optionally enable the NRI in containerd + using the Helm chart during the chart installation simply by setting the + `nri.runtime.patchConfig` parameter. For instance, + + ```sh + helm install my-resctrl-mon nri-plugins/nri-resctrl-mon --set nri.runtime.patchConfig=true --namespace kube-system + ``` + + Enabling `nri.runtime.patchConfig` creates an init container to turn on + NRI feature in containerd and only after that proceed the plugin + installation. + + - CRI-O + - At least [v1.26.0](https://github.com/cri-o/cri-o/releases/tag/v1.26.0) + release version to use the NRI feature + - Enable NRI feature by following + [these](https://github.com/cri-o/cri-o/blob/main/docs/crio.conf.5.md#crionri-table) + detailed instructions. You can optionally enable the NRI in CRI-O using + the Helm chart during the chart installation simply by setting the + `nri.runtime.patchConfig` parameter. For instance, + + ```sh + helm install my-resctrl-mon nri-plugins/nri-resctrl-mon --namespace kube-system --set nri.runtime.patchConfig=true + ``` + +## Installing the Chart + +Path to the chart: `nri-resctrl-mon`. + +```sh +helm repo add nri-plugins https://containers.github.io/nri-plugins +helm install my-resctrl-mon nri-plugins/nri-resctrl-mon --namespace kube-system +``` + +The command above deploys resctrl-mon NRI plugin on the Kubernetes cluster +within the `kube-system` namespace with default configuration. To customize the +available parameters as described in the [Configuration options](#configuration-options) +below, you have two options: you can use the `--set` flag or create a custom +values.yaml file and provide it using the `-f` flag. For example: + +```sh +# Install the resctrl-mon plugin with custom values provided using the --set option +helm install my-resctrl-mon nri-plugins/nri-resctrl-mon --namespace kube-system --set nri.runtime.patchConfig=true +``` + +```sh +# Install the resctrl-mon plugin with custom values specified in a custom values.yaml file +cat < myPath/values.yaml +nri: + runtime: + patchConfig: true + plugin: + index: 90 + +tolerations: +- key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoSchedule" +EOF + +helm install my-resctrl-mon nri-plugins/nri-resctrl-mon --namespace kube-system -f myPath/values.yaml +``` + +## Uninstalling the Chart + +To uninstall the resctrl-mon plugin run the following command: + +```sh +helm delete my-resctrl-mon --namespace kube-system +``` + +## Security + +The DaemonSet runs with `hostPID: true` because the plugin must write +host-namespace PIDs into resctrl `tasks` files. Without host PID +visibility the kernel rejects the write (`ESRCH`). The container also +requires `SYS_ADMIN` and `DAC_OVERRIDE` capabilities to manage resctrl +`mon_group` directories. + +## Configuration options + +The tables below present an overview of the parameters available for users to +customize with their own values, along with the default values. + +| Name | Default | Description | +| ------------------------ | ----------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------- | +| `image.name` | [ghcr.io/containers/nri-plugins/nri-resctrl-mon](https://ghcr.io/containers/nri-plugins/nri-resctrl-mon) | container image name | +| `image.tag` | unstable | container image tag | +| `image.pullPolicy` | Always | image pull policy | +| `resources.cpu` | 10m | cpu resources for the Pod | +| `resources.memory` | 50Mi | memory quota for the Pod | +| `nri.runtime.config.pluginRegistrationTimeout` | "" | set NRI plugin registration timeout in NRI config of containerd or CRI-O | +| `nri.runtime.config.pluginRequestTimeout` | "" | set NRI plugin request timeout in NRI config of containerd or CRI-O | +| `nri.runtime.patchConfig` | false | patch NRI configuration in containerd or CRI-O | +| `nri.plugin.index` | 90 | NRI plugin index to register with | +| `initContainerImage.name` | [ghcr.io/containers/nri-plugins/nri-config-manager](https://ghcr.io/containers/nri-plugins/nri-config-manager) | init container image name | +| `initContainerImage.tag` | unstable | init container image tag | +| `initContainerImage.pullPolicy` | Always | init container image pull policy | +| `tolerations` | [] | specify taint toleration key, operator and effect | +| `affinity` | [] | specify node affinity | +| `nodeSelector` | [] | specify node selector labels | +| `podPriorityClassNodeCritical` | true | enable [marking Pod as node critical](https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/#marking-pod-as-critical) | +| `snapshotDir` | /run/nri-resctrl-mon | host path for counter snapshot JSON files | +| `snapshotTTL` | 5m | how long completed snapshots are kept before pruning | diff --git a/deployment/helm/resctrl-mon/templates/_helpers.tpl b/deployment/helm/resctrl-mon/templates/_helpers.tpl new file mode 100644 index 000000000..9b4239372 --- /dev/null +++ b/deployment/helm/resctrl-mon/templates/_helpers.tpl @@ -0,0 +1,16 @@ +{{/* +Common labels +*/}} +{{- define "nri-plugin.labels" -}} +helm.sh/chart: {{ .Chart.Name }}-{{ .Chart.Version }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{ include "nri-plugin.selectorLabels" . }} +{{- end -}} + +{{/* +Selector labels +*/}} +{{- define "nri-plugin.selectorLabels" -}} +app.kubernetes.io/name: nri-resctrl-mon +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end -}} diff --git a/deployment/helm/resctrl-mon/templates/configmap.yaml b/deployment/helm/resctrl-mon/templates/configmap.yaml new file mode 100644 index 000000000..3ad305503 --- /dev/null +++ b/deployment/helm/resctrl-mon/templates/configmap.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: nri-resctrl-mon-config.default + namespace: {{ .Release.Namespace }} + labels: + {{- include "nri-plugin.labels" . | nindent 4 }} +data: + config.yaml: | + resctrlPath: /sys/fs/resctrl + namespaces: [] + labelSelector: {} diff --git a/deployment/helm/resctrl-mon/templates/daemonset.yaml b/deployment/helm/resctrl-mon/templates/daemonset.yaml new file mode 100644 index 000000000..cfa895273 --- /dev/null +++ b/deployment/helm/resctrl-mon/templates/daemonset.yaml @@ -0,0 +1,117 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + {{- include "nri-plugin.labels" . | nindent 4 }} + name: nri-resctrl-mon + namespace: {{ .Release.Namespace }} +spec: + selector: + matchLabels: + {{- include "nri-plugin.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "nri-plugin.labels" . | nindent 8 }} + spec: + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + hostPID: true + nodeSelector: + kubernetes.io/os: "linux" + {{- with .Values.nodeSelector }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- if .Values.nri.runtime.patchConfig }} + initContainers: + - name: patch-runtime + {{- if (not (or (eq .Values.nri.runtime.config nil) (eq .Values.nri.runtime.config.pluginRegistrationTimeout ""))) }} + args: + - -nri-plugin-registration-timeout + - {{ .Values.nri.runtime.config.pluginRegistrationTimeout }} + - -nri-plugin-request-timeout + - {{ .Values.nri.runtime.config.pluginRequestTimeout }} + {{- end }} + image: {{ .Values.initContainerImage.name }}:{{ .Values.initContainerImage.tag | default .Chart.AppVersion }} + imagePullPolicy: {{ .Values.initContainerImage.pullPolicy }} + volumeMounts: + - name: containerd-config + mountPath: /etc/containerd + - name: crio-config + mountPath: /etc/crio/crio.conf.d + - name: dbus-socket + mountPath: /var/run/dbus/system_bus_socket + securityContext: + privileged: true + {{- end }} + containers: + - name: nri-resctrl-mon + command: + - nri-resctrl-mon + - --idx + - "{{ .Values.nri.plugin.index | int | printf "%02d" }}" + - --config + - /etc/nri/resctrl-mon/config.yaml + - -v + image: {{ .Values.image.name }}:{{ .Values.image.tag | default .Chart.AppVersion }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + resources: + requests: + cpu: {{ .Values.resources.cpu }} + memory: {{ .Values.resources.memory }} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + add: + - SYS_ADMIN + - DAC_OVERRIDE + volumeMounts: + - name: resctrl-mon-config-vol + mountPath: /etc/nri/resctrl-mon + - name: nrisockets + mountPath: /var/run/nri + - name: resctrlfs + mountPath: /sys/fs/resctrl + - name: snapshot-dir + mountPath: {{ .Values.snapshotDir | default "/run/nri-resctrl-mon" }} + {{- if .Values.podPriorityClassNodeCritical }} + priorityClassName: system-node-critical + {{- end }} + volumes: + - name: resctrl-mon-config-vol + configMap: + name: nri-resctrl-mon-config.default + - name: nrisockets + hostPath: + path: /var/run/nri + type: DirectoryOrCreate + - name: resctrlfs + hostPath: + path: /sys/fs/resctrl + type: Directory + - name: snapshot-dir + hostPath: + path: {{ .Values.snapshotDir | default "/run/nri-resctrl-mon" }} + type: DirectoryOrCreate + {{- if .Values.nri.runtime.patchConfig }} + - name: containerd-config + hostPath: + path: /etc/containerd/ + type: DirectoryOrCreate + - name: crio-config + hostPath: + path: /etc/crio/crio.conf.d/ + type: DirectoryOrCreate + - name: dbus-socket + hostPath: + path: /var/run/dbus/system_bus_socket + type: Socket + {{- end }} diff --git a/deployment/helm/resctrl-mon/values.schema.json b/deployment/helm/resctrl-mon/values.schema.json new file mode 100644 index 000000000..9fd52aa3d --- /dev/null +++ b/deployment/helm/resctrl-mon/values.schema.json @@ -0,0 +1,128 @@ +{ + "$schema": "http://json-schema.org/schema#", + "required": [ + "image", + "resources" + ], + "properties": { + "image": { + "type": "object", + "required": [ + "name", + "pullPolicy" + ], + "properties": { + "name": { + "type": "string" + }, + "tag": { + "type": "string" + }, + "pullPolicy": { + "type": "string", + "enum": ["Never", "Always", "IfNotPresent"] + } + } + }, + "initContainerImage": { + "type": "object", + "required": [ + "name", + "pullPolicy" + ], + "properties": { + "name": { + "type": "string" + }, + "tag": { + "type": "string" + }, + "pullPolicy": { + "type": "string", + "enum": ["Never", "Always", "IfNotPresent"] + } + } + }, + "resources": { + "type": "object", + "required": [ + "cpu", + "memory" + ], + "properties": { + "cpu": { + "type": "string" + }, + "memory": { + "type": "string" + } + } + }, + "nri": { + "type": "object", + "required": [ + "plugin", + "runtime" + ], + "properties": { + "plugin": { + "type": "object", + "required": [ + "index" + ], + "properties": { + "index": { + "type": "integer", + "minimum": 0, + "maximum": 99 + } + } + }, + "runtime": { + "type": "object", + "required": [ + "patchConfig" + ], + "properties": { + "patchConfig": { + "type": "boolean" + }, + "config": { + "type": "object", + "required": [ + "pluginRegistrationTimeout", + "pluginRequestTimeout" + ], + "properties": { + "pluginRegistrationTimeout": { + "type": "string", + "$comment": "allowed range is 5-30s", + "pattern": "^(([5-9])|([1-2][0-9])|(30))s$" + }, + "pluginRequestTimeout": { + "type": "string", + "$comment": "allowed range is 2-30s", + "pattern": "^(([2-9])|([1-2][0-9])|(30))s$" + } + } + } + } + } + } + }, + "podPriorityClassNodeCritical": { + "type": "boolean" + }, + "snapshotDir": { + "type": "string", + "description": "Host path for counter snapshot files", + "default": "/run/nri-resctrl-mon" + }, + "snapshotTTL": { + "type": "string", + "description": "Duration to keep completed snapshots (Go duration, e.g. 5m, 1h)", + "default": "5m", + "pattern": "^[0-9]+(s|m|h)$" + } + } +} diff --git a/deployment/helm/resctrl-mon/values.yaml b/deployment/helm/resctrl-mon/values.yaml new file mode 100644 index 000000000..ac3948f6a --- /dev/null +++ b/deployment/helm/resctrl-mon/values.yaml @@ -0,0 +1,69 @@ +# Default values for resctrl-mon. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. +--- +image: + name: ghcr.io/containers/nri-plugins/nri-resctrl-mon + # tag, if defined will use the given image tag, otherwise Chart.AppVersion will be used + #tag: unstable + pullPolicy: Always + +resources: + cpu: 10m + memory: 50Mi + +snapshotDir: /run/nri-resctrl-mon +snapshotTTL: 5m + +nri: + plugin: + index: 90 + runtime: + patchConfig: false +# config: +# pluginRegistrationTimeout: 5s +# pluginRequestTimeout: 2s + +initContainerImage: + name: ghcr.io/containers/nri-plugins/nri-config-manager + # If not defined Chart.AppVersion will be used + #tag: unstable + pullPolicy: Always + +tolerations: [] +# +# Example: +# +# tolerations: +# - key: "node-role.kubernetes.io/control-plane" +# operator: "Exists" +# effect: "NoSchedule" + +affinity: [] +# +# Example: +# +# affinity: +# nodeAffinity: +# requiredDuringSchedulingIgnoredDuringExecution: +# nodeSelectorTerms: +# - matchExpressions: +# - key: feature.node.kubernetes.io/cpu-rdt.mon +# operator: In +# values: +# - "true" + +nodeSelector: [] +# +# Example: +# +# nodeSelector: +# feature.node.kubernetes.io/cpu-rdt.mon: "true" + +# NRI plugins should be considered as part of the container runtime. +# By default we make them part of the system-node-critical priority +# class. This should mitigate the potential risk of a plugin getting +# evicted under heavy system load. It should also ensure that during +# autoscaling enough new nodes are brought up to leave room for the +# plugin on each new node. +podPriorityClassNodeCritical: true diff --git a/docs/deployment/helm/index.md b/docs/deployment/helm/index.md index 183dacfe0..d34d9d84e 100644 --- a/docs/deployment/helm/index.md +++ b/docs/deployment/helm/index.md @@ -52,5 +52,6 @@ template.md memory-qos.md memtierd.md sgx-epc.md +resctrl-mon.md resource-annotator.md ``` diff --git a/docs/deployment/helm/resctrl-mon.md b/docs/deployment/helm/resctrl-mon.md new file mode 100644 index 000000000..8e87b8257 --- /dev/null +++ b/docs/deployment/helm/resctrl-mon.md @@ -0,0 +1,2 @@ +```{include} ../../../deployment/helm/resctrl-mon/README.md +``` diff --git a/docs/index.md b/docs/index.md index d6e4a7a36..dcb17f837 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,6 +8,7 @@ caption: Contents introduction.md resource-policy/index.rst memory/index.md +monitoring/index.md deployment/index.md contributing.md Project GitHub repository diff --git a/docs/monitoring/index.md b/docs/monitoring/index.md new file mode 100644 index 000000000..33ef9f208 --- /dev/null +++ b/docs/monitoring/index.md @@ -0,0 +1,9 @@ +# Monitoring plugins + +```{toctree} +--- +maxdepth: 2 +caption: Contents +--- +resctrl-mon.md +``` diff --git a/docs/monitoring/resctrl-mon.md b/docs/monitoring/resctrl-mon.md new file mode 100644 index 000000000..96fb20735 --- /dev/null +++ b/docs/monitoring/resctrl-mon.md @@ -0,0 +1,263 @@ +# Resctrl-Mon NRI Plugin + +The resctrl-mon NRI plugin creates per-pod resctrl monitoring groups +(`mon_groups`) to support Kepler's passive mode for Application Energy +Telemetry (AET). + +When a container is created, the plugin assigns its init process to a +`mon_group` before the process starts executing. The Linux kernel then +propagates the RMID (Resource Monitoring ID) to all child processes +automatically, eliminating the fork race that affects userspace-based +approaches. + +## How It Works + +1. The container runtime creates a container process (paused). +2. The NRI `PostCreateContainer` hook fires. +3. The plugin creates a `mon_group` named with the pod's UUID under + the appropriate resctrl control group. +4. The NRI `StartContainer` hook fires with the container's init PID. +5. The plugin writes the init PID to the `mon_group`'s `tasks` file. + (If the PID is not yet available, `PostStartContainer` retries.) +6. The runtime starts the container. All child processes inherit the RMID. +7. Kepler scans the resctrl filesystem and reads monitoring data. +8. When the last container in a pod stops, the plugin removes the `mon_group`. + +The plugin DaemonSet runs with `hostPID: true` so that it can write +host-namespace PIDs to the resctrl `tasks` file. Without `hostPID`, +the kernel rejects the write with `ESRCH` because the PID does not +exist in the plugin's PID namespace. + +## Mon_Group Naming + +Mon_groups are named with the Kubernetes pod UID: + +``` +/sys/fs/resctrl/[/]mon_groups// +``` + +This enables Kepler to correlate monitoring data with Kubernetes metadata +by querying the K8s API using the pod UID extracted from the directory name. + +## Plugin Configuration + +Configuration is loaded from a YAML file specified with the `-config` flag +or pushed by the container runtime via NRI. + +```yaml +# Path to the resctrl filesystem. Override for testing. +resctrlPath: /sys/fs/resctrl + +# Namespace filter: only create mon_groups for pods in these namespaces. +# Empty list = all namespaces. +namespaces: [] + +# Pod label selector: only create mon_groups for pods matching these labels. +# Empty = all pods. +labelSelector: {} + +# Directory for counter snapshot files (see "Counter Snapshots" below). +snapshotDir: /run/nri-resctrl-mon + +# How long completed snapshots are kept before pruning (Go duration). +snapshotTTL: 5m +``` + +## Coexistence with Allocation Plugins + +If an NRI resource allocation plugin (balloons, topology-aware) is running, +it assigns containers to RDT classes via `SetLinuxRDTClass`. The resctrl-mon +plugin reads the effective RDT class from the NRI container spec and creates +`mon_groups` under the corresponding control group: + +``` +/sys/fs/resctrl//mon_groups// +``` + +The container keeps its CLOSID (allocation) and gets a distinct RMID +(monitoring). If no allocation plugin is active, `mon_groups` are created +under the root resctrl directory. + +## RMID Management + +RMID allocation is delegated entirely to the Linux kernel: + +- **Allocation**: `mkdir` on a `mon_group` directory assigns an RMID. If + none are available, the kernel returns `ENOSPC` and the plugin logs a + warning and skips the pod. +- **Deallocation**: `rmdir` releases the RMID. The kernel handles the + hardware recycling window. + +## Counter Snapshots + +Resctrl monitoring counters in a new `mon_group` do not start at zero — +the kernel seeds them with the current hardware register value. This +creates measurement gaps: + +1. **Start gap**: The delta between mon_group creation and the first + consumer scrape is missed. +2. **End gap**: The delta between the last scrape and mon_group removal + is lost. +3. **Missed pods**: Pods whose entire lifetime falls between two scrape + intervals are never observed. + +To close these gaps the plugin writes a directory tree to a configurable +host path (default `/run/nri-resctrl-mon/`) that mirrors the resctrl +`mon_data` layout: + +- **On first PID write** (`StartContainer`/`PostStartContainer`): the + plugin reads all `mon_data/` event files, writes `.begin` counter + files, and creates symlinks to the live resctrl event files. +- **On last container stop** (`StopContainer`, before `rmdir`): the + plugin reads the counters again and writes `.end` counter files. + +Consumers compute lifetime totals as: + +``` +lifetime = cat .end - cat .begin +``` + +### Snapshot Directory Layout + +Each pod gets a directory named by its UID: + +``` +/run/nri-resctrl-mon// + created_at # RFC 3339 timestamp + completed_at # RFC 3339 timestamp (appears on stop) + mon_data/ + mon_PERF_PKG_00/ + core_energy → /sys/fs/resctrl/.../core_energy (symlink) + core_energy.begin # 792.341167 + core_energy.end # 1285.102834 + activity → ... + activity.begin # 6009.826054 + activity.end # 12019.652108 + mon_L3_00/ + llc_occupancy → ... + llc_occupancy.begin # 3833856 + llc_occupancy.end # 4521984 + mbm_local_bytes.begin # 7912636416 + mbm_local_bytes.end # 15825272832 + ... +``` + +Counter files use the same one-number-per-file text format as the kernel. +Symlinks point to the live resctrl event files and become dangling after +the `mon_group` is removed — a natural signal that the pod has stopped +and `.end` files contain the final values. + +Consumers that already read `/sys/fs/resctrl` directly can discover and +use the snapshot data with no new parsing logic. If the snapshot directory +is absent, consumers fall back to their existing resctrl scraping path. + +Completed snapshots are pruned after `snapshotTTL` (default 5 minutes). +The TTL should be set longer than the consumer's scrape interval. + +### Consumer Mount Requirements + +The snapshot directory (`/run/nri-resctrl-mon`) can be mounted at any +container path — `.begin`, `.end`, `created_at`, and `completed_at` are +regular files that work regardless of mount layout. + +The symlinks to live resctrl counters use **absolute host paths** (e.g. +`/sys/fs/resctrl/mon_groups//mon_data/.../core_energy`). They +resolve correctly only if the consumer container mounts the resctrl +filesystem at the **same path** as the host: + +```yaml +volumeMounts: + - name: resctrlfs + mountPath: /sys/fs/resctrl # must match the host path + readOnly: true + - name: snapshots + mountPath: /run/nri-resctrl-mon # can be any path + readOnly: true +``` + +If the consumer mounts resctrl at a different path (e.g. +`/host/sys/fs/resctrl`), the symlinks will not resolve. This does not +affect the primary use case: computing lifetime totals from `.begin` and +`.end` files requires no symlink resolution. The symlinks are a +convenience for reading the live counter value mid-lifecycle. + +After a pod stops, its symlinks become dangling because the kernel +`mon_group` has been removed. This is expected and serves as a signal +that the `.end` files contain the final counter values. + +## Developer's Guide + +### Prerequisites + +- Containerd v1.7+ or CRI-O v1.26+ +- Enable NRI in /etc/containerd/config.toml: + + ```toml + [plugins."io.containerd.nri.v1.nri"] + disable = false + disable_connections = false + plugin_config_path = "/etc/nri/conf.d" + plugin_path = "/opt/nri/plugins" + plugin_registration_timeout = "5s" + plugin_request_timeout = "2s" + socket_path = "/var/run/nri/nri.sock" + ``` + +- Intel CPU with RDT monitoring support +- resctrl filesystem mounted at `/sys/fs/resctrl` + +### Build + +```bash +make PLUGINS=nri-resctrl-mon build-plugins +``` + +### Run + +```bash +./build/bin/nri-resctrl-mon -config sample-configs/nri-resctrl-mon.yaml -idx 90 -vv +``` + +### Manual Test + +Verify that `mon_groups` are created when pods start: + +```bash +# Start a test pod +kubectl run test-pod --image=busybox -- sleep 3600 + +# Check that a mon_group was created with the pod UID +POD_UID=$(kubectl get pod test-pod -o jsonpath='{.metadata.uid}') + +# Without an RDT allocation plugin, mon_groups are under the root class: +MON_GROUP_BASE=/sys/fs/resctrl/mon_groups +# With an allocation plugin that assigns an RDT class (e.g. BestEffort): +# MON_GROUP_BASE=/sys/fs/resctrl/BestEffort/mon_groups + +ls "$MON_GROUP_BASE/$POD_UID/" + +# Verify monitoring data is available +cat "$MON_GROUP_BASE/$POD_UID/mon_data/mon_L3_00/llc_occupancy" +``` + +### Debug + +```bash +go install github.com/go-delve/delve/cmd/dlv@latest +dlv exec build/bin/nri-resctrl-mon -- -config sample-configs/nri-resctrl-mon.yaml -idx 90 +(dlv) break plugin.PostCreateContainer +(dlv) continue +``` + +### Deploy + +Build an image, import it on the node, and deploy the plugin by +running the following in `nri-plugins`: + +```bash +rm -rf build +make clean +make PLUGINS=nri-resctrl-mon IMAGE_VERSION=devel images +ctr -n k8s.io images import build/images/nri-resctrl-mon-image-*.tar +kubectl create -f build/images/nri-resctrl-mon-deployment.yaml +``` diff --git a/sample-configs/nri-resctrl-mon.yaml b/sample-configs/nri-resctrl-mon.yaml new file mode 100644 index 000000000..52f63372c --- /dev/null +++ b/sample-configs/nri-resctrl-mon.yaml @@ -0,0 +1,5 @@ +resctrlPath: /sys/fs/resctrl +namespaces: [] +labelSelector: {} +snapshotDir: /run/nri-resctrl-mon +snapshotTTL: 5m