From 6447e72d7f550ea3f33a3e91192a3914e5801ceb Mon Sep 17 00:00:00 2001
From: Ievgen Bondarenko <sactransport2000@gmail.com>
Date: Tue, 19 May 2026 17:06:35 -0700
Subject: [PATCH] cgroupfs: drop registry entries on filesystem release

Commit 26ef5174081be0b4b1f750a97e75ee6bad5d5a53 added
CgroupRegistry.RemoveCgroup and called it from dir.RmDir so that
destroyed cgroup directories do not stay in
kernel.CgroupRegistry.cgroups across save/restore. That fix covers
rmdir, but two failure paths that end in Release still leak entries:

1. GetFilesystem error window. newCgroupInode at base.go:190 calls
   r.AddCgroup for the root cgroup before prepareInitialCgroup and
   r.Register run. On either failure the code calls rootD.DecRef and
   fs.VFSFilesystem().DecRef. Release then runs but currently skips
   ReleaseCgroupHierarchy and Unregister because fs.hierarchyID is
   still InvalidCgroupHierarchyID. The root cgroup id stays in the
   registry forever, so repeated mount failures accumulate entries
   without bound.

2. prepareInitialCgroup creates intermediate cgroup directories via
   newDirWithOwner, each of which calls AddCgroup. If
   prepareInitialCgroup fails partway, the ids of the directories it
   already created stay in the registry in the same way.

Walk fs.root in Release and call RemoveCgroup for each cgroupInode in
the subtree. RemoveCgroup is documented as a no-op for ids not in the
map (cgroup.go:556-558), so cgroups that RmDir already removed cost
nothing and the rmdir path stays the primary owner.

Signed-off-by: Ievgen Bondarenko <sactransport2000@gmail.com>
---
 pkg/sentry/fsimpl/cgroupfs/cgroupfs.go | 32 ++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
index e564d06888..c98fc0d37d 100644
--- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
+++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
@@ -468,6 +468,14 @@ func (fs *filesystem) Release(ctx context.Context) {
 	k := kernel.KernelFromContext(ctx)
 	r := k.CgroupRegistry()
 
+	// Drop any registry entries still held by this filesystem's cgroup
+	// inodes. RmDir already calls RemoveCgroup for cgroups that were
+	// unlinked through rmdir; the remaining entries are the root cgroup
+	// and, when GetFilesystem fails after newCgroupInode, any partially
+	// initialised children. RemoveCgroup is a no-op for ids not in the
+	// map. This pairs with the AddCgroup call in newCgroupInode (base.go).
+	fs.removeCgroupsFromRegistryLocked(r)
+
 	if fs.hierarchyID != kernel.InvalidCgroupHierarchyID {
 		k.ReleaseCgroupHierarchy(fs.hierarchyID)
 		r.Unregister(fs.hierarchyID)
@@ -481,6 +489,30 @@ func (fs *filesystem) Release(ctx context.Context) {
 	fs.Filesystem.Release(ctx)
 }
 
+// removeCgroupsFromRegistryLocked deregisters the cgroups under fs.root: it
+// calls r.RemoveCgroup for the root cgroupInode's id and, depth-first, for
+// each child dir that has a cgi. Ids RmDir already dropped are expected to be
+// no-ops in RemoveCgroup. Does nothing if fs.root is nil or not a cgroupInode.
+// NOTE(review): "Locked" implies a lock precondition; confirm which lock.
+func (fs *filesystem) removeCgroupsFromRegistryLocked(r *kernel.CgroupRegistry) {
+	if fs.root == nil {
+		return
+	}
+	var dropSubtree func(cg *cgroupInode)
+	dropSubtree = func(cg *cgroupInode) {
+		r.RemoveCgroup(cg.id)
+		cg.dir.forEachChildDir(func(sub *dir) {
+			if sub.cgi == nil {
+				return
+			}
+			dropSubtree(sub.cgi)
+		})
+	}
+	if top, ok := fs.root.Inode().(*cgroupInode); ok {
+		dropSubtree(top)
+	}
+}
+
 // MountOptions implements vfs.FilesystemImpl.MountOptions.
 func (fs *filesystem) MountOptions() string {
 	var cnames []string