From 6447e72d7f550ea3f33a3e91192a3914e5801ceb Mon Sep 17 00:00:00 2001 From: Ievgen Bondarenko Date: Tue, 19 May 2026 17:06:35 -0700 Subject: [PATCH] cgroupfs: drop registry entries on filesystem release Commit 26ef5174081be0b4b1f750a97e75ee6bad5d5a53 added CgroupRegistry.RemoveCgroup and called it from dir.RmDir so that destroyed cgroup directories do not stay in kernel.CgroupRegistry.cgroups across save/restore. That fix covers rmdir, but two release paths still leak: 1. GetFilesystem error window. newCgroupInode at base.go:190 calls r.AddCgroup for the root cgroup before prepareInitialCgroup and r.Register run. On either failure, the code calls rootD.DecRef and fs.VFSFilesystem().DecRef. Release then runs but currently skips ReleaseCgroupHierarchy and Unregister because fs.hierarchyID is still InvalidCgroupHierarchyID. The root cgroup id stays in the registry forever. Repeated mount failures accumulate an unbounded number of entries. 2. prepareInitialCgroup creates intermediate cgroup directories via newDirWithOwner, each of which calls AddCgroup. If prepareInitialCgroup fails partway, the ids of the directories it already created are left behind in the same way. Walk fs.root in Release and call RemoveCgroup for each cgroupInode in the subtree. RemoveCgroup is documented as a no-op for ids not in the map (cgroup.go:556-558), so revisiting cgroups that RmDir already removed is harmless and the rmdir path remains the primary place where entries are dropped. Signed-off-by: Ievgen Bondarenko --- pkg/sentry/fsimpl/cgroupfs/cgroupfs.go | 32 ++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go index e564d06888..c98fc0d37d 100644 --- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go +++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go @@ -468,6 +468,14 @@ func (fs *filesystem) Release(ctx context.Context) { k := kernel.KernelFromContext(ctx) r := k.CgroupRegistry() + // Drop any registry entries still held by this filesystem's cgroup + // inodes. 
RmDir already calls RemoveCgroup for cgroups that were + // unlinked through rmdir; the remaining entries are the root cgroup + // and, when GetFilesystem fails after newCgroupInode, any partially + // initialised children. RemoveCgroup is a no-op for ids not in the + // map. This undoes the AddCgroup call made by newCgroupInode in base.go. + fs.removeCgroupsFromRegistryLocked(r) + if fs.hierarchyID != kernel.InvalidCgroupHierarchyID { k.ReleaseCgroupHierarchy(fs.hierarchyID) r.Unregister(fs.hierarchyID) @@ -481,6 +489,30 @@ func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.Release(ctx) } +// removeCgroupsFromRegistryLocked removes from r the id of the root cgroupInode +// and of every cgroupInode reached through forEachChildDir below it; RemoveCgroup +// is a no-op for ids already removed via RmDir. It does nothing if fs.root is nil +// or its inode is not a *cgroupInode. NOTE(review): confirm which lock "Locked" refers to. +func (fs *filesystem) removeCgroupsFromRegistryLocked(r *kernel.CgroupRegistry) { + if fs.root == nil { + return + } + rootInode, ok := fs.root.Inode().(*cgroupInode) + if !ok { + return + } + var walk func(c *cgroupInode) + walk = func(c *cgroupInode) { + r.RemoveCgroup(c.id) + c.dir.forEachChildDir(func(child *dir) { + if child.cgi != nil { + walk(child.cgi) + } + }) + } + walk(rootInode) +} + // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { var cnames []string