From 5f413d598c71f8ad0e8d7de562090c685a69575f Mon Sep 17 00:00:00 2001 From: ibondarenko1 Date: Thu, 14 May 2026 13:35:20 -0700 Subject: [PATCH] vfs: add per-instance inotify watch and event-queue caps Inotify in pkg/sentry/vfs/inotify.go has no upper bound on the number of watches a single instance can hold or on the depth of its pending-event queue. AddWatch grows i.watches (line 313) and the target's Watches.ws (line 433) without a size check; queueEvent (line 275) appends to i.events without checking length. Linux fs/notify/inotify/inotify_user.c caps both. The kernel returns ENOSPC from inotify_new_watch when the per-user UCOUNT_INOTIFY_WATCHES quota is reached (default 8192), and fsnotify_insert_event emits a single IN_Q_OVERFLOW marker when group->q_len reaches max_events (default 16384). Without these caps, an unprivileged sandboxed process can grow the sentry heap without bound. Witness (kali, runsc release-20260406.0, alpine 3.20 sandbox): Baseline VmRSS = 52 MB. inotify_add_watch x 200000 distinct dirs from one inotify fd. Post-flood VmRSS = 510 MB. No ENOSPC returned at any step. Sustainable growth rate approximately 4 MB per second. Default sentry memory caps would OOM within minutes. Add two per-instance caps matching the Linux default values: maxInotifyWatchesPerInstance = 8192 maxInotifyQueuedEvents = 16384 AddWatch now returns (int32, error) and returns ENOSPC once len(i.watches) reaches the cap. queueEvent tracks queue length via numQueuedEvents under evMu and, on overflow, emits a single IN_Q_OVERFLOW marker (wd = -1, mask = IN_Q_OVERFLOW) at the queue tail unless one is already there. Subsequent overflowing events are dropped silently, matching Linux fsnotify_insert_event. A separate Linux limit, fs.inotify.max_user_instances (default 128), is enforced per user namespace via UCOUNT_INOTIFY_INSTANCES in the kernel. gVisor does not have an equivalent UCOUNT infrastructure in pkg/sentry/kernel/auth today; that cap is deferred to a follow-up change once the supporting accounting is in place. The AddWatch signature change requires two call-site updates: pkg/sentry/syscalls/linux/sys_inotify.go - propagate the error to the inotify_add_watch(2) caller. pkg/sentry/fsimpl/kernfs/kernfs_test.go - existing tests use t.Fatal on unexpected errors. Adds two regression tests in pkg/sentry/vfs/inotify_test.go: TestInotifyAddWatchReturnsENOSPCAtCap TestInotifyQueueOverflowEmitsMarker Tested: gofmt -l pkg/sentry/vfs/inotify.go pkg/sentry/vfs/inotify_test.go pkg/sentry/syscalls/linux/sys_inotify.go pkg/sentry/fsimpl/kernfs/kernfs_test.go (clean) Related: CVE-2023-7258 (gVisor mount-point ref-counting DoS, CWE-400, CVSS 4.8) is the class precedent. The attacker prerequisites here are lower (no CAP_SYS_ADMIN, no mount permission required). --- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 20 +++- pkg/sentry/syscalls/linux/sys_inotify.go | 6 +- pkg/sentry/vfs/BUILD | 1 + pkg/sentry/vfs/inotify.go | 59 ++++++++++- pkg/sentry/vfs/inotify_test.go | 120 +++++++++++++++++++++++ 5 files changed, 197 insertions(+), 9 deletions(-) create mode 100644 pkg/sentry/vfs/inotify_test.go diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index c4bb3aa749..7711956870 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -495,8 +495,14 @@ func TestRmdirInotifyDeleteSelfBeforeParentDelete(t *testing.T) { parentVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent")) defer parentVD.DecRef(sys.Ctx) childVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent/child")) - parentWD := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS) - childWD := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS) + parentWD, err := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS) + if err != nil { + t.Fatalf("AddWatch failed: %v", err) + } + childWD, err := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS) + if err != nil { + t.Fatalf("AddWatch failed: %v", err) + } childVD.DecRef(sys.Ctx) if err := sys.VFS.RmdirAt(ctx, sys.Creds, sys.PathOpAtRoot("parent/child")); err != nil { @@ -535,8 +541,14 @@ func TestRmdirInotifyWithOpenFDDefersDeleteSelf(t *testing.T) { parentVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent")) defer parentVD.DecRef(sys.Ctx) childVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent/child")) - parentWD := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS) - childWD := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS) + parentWD, err := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS) + if err != nil { + t.Fatalf("AddWatch failed: %v", err) + } + childWD, err := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS) + if err != nil { + t.Fatalf("AddWatch failed: %v", err) + } childVD.DecRef(sys.Ctx) childFD, err := sys.VFS.OpenAt(ctx, sys.Creds, sys.PathOpAtRoot("parent/child"), &vfs.OpenOptions{Flags: linux.O_RDONLY}) diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index 45a350075d..2a4616c0a5 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -116,7 +116,11 @@ func InotifyAddWatch(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) } defer d.DecRef(t) - return uintptr(ino.AddWatch(d.Dentry(), mask)), nil, nil + wd, err := ino.AddWatch(d.Dentry(), mask) + if err != nil { + return 0, nil, err + } + return uintptr(wd), nil, nil } // InotifyRmWatch implements the inotify_rm_watch() syscall. diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index e46acc3222..46ff76e37b 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -211,6 +211,7 @@ go_test( size = "small", srcs = [ "file_description_impl_util_test.go", + "inotify_test.go", "mount_test.go", ], library = ":vfs", diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index f20bfb62cd..d9017035d6 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -35,6 +35,28 @@ import ( // must be a power 2 for rounding below. const inotifyEventBaseSize = 16 +// Per-instance resource caps matching Linux defaults at +// fs/notify/inotify/inotify_user.c. Linux enforces max_user_watches and +// max_user_instances per user namespace via UCOUNT_INOTIFY_*; gVisor does +// not yet have an equivalent ucount infrastructure in pkg/sentry/kernel/auth, +// so the watch and queue caps below are enforced per-inotify-instance. +// +// TODO: add per-user-namespace accounting for max_user_instances and tighten +// max_user_watches to per-user across all instances rather than per-instance. +const ( + // maxInotifyWatchesPerInstance bounds the number of watches a single + // inotify instance can hold. Linux fs.inotify.max_user_watches default + // is 8192. + maxInotifyWatchesPerInstance = 8192 + + // maxInotifyQueuedEvents bounds the number of events held in a single + // inotify instance's pending-event queue. Linux fs.inotify.max_queued_events + // default is 16384. When the cap is reached, gVisor emits a single + // IN_Q_OVERFLOW marker and drops subsequent events until the queue drains, + // as Linux does in fsnotify_insert_event. + maxInotifyQueuedEvents = 16384 +) + // EventType defines different kinds of inotfiy events. // // The way events are labelled appears somewhat arbitrary, but they must match @@ -77,6 +99,11 @@ type Inotify struct { // A list of pending events for this inotify instance. Protected by evMu. events eventList + // numQueuedEvents counts the entries in events. Protected by evMu. + // Tracked explicitly because eventList is a generic intrusive list with + // no built-in length. + numQueuedEvents int + // A scratch buffer, used to serialize inotify events. Allocate this // ahead of time for the sake of performance. Protected by evMu. scratch []byte @@ -240,6 +267,7 @@ func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOpt // buffer space to copy it out, even if the copy below fails. Emulate // this behaviour. i.events.Remove(event) + i.numQueuedEvents-- // Buffer has enough space, copy event to the read buffer. n, err := event.CopyTo(ctx, i.scratch, dst) @@ -288,7 +316,23 @@ func (i *Inotify) queueEvent(ev *Event) { } } + // Enforce per-instance queue cap matching Linux fs.inotify.max_queued_events. + // When the queue is full, emit a single IN_Q_OVERFLOW marker if the tail of + // the queue is not already an overflow marker, and drop subsequent events + // until the queue drains. This matches fsnotify_insert_event in Linux. + if i.numQueuedEvents >= maxInotifyQueuedEvents { + if last := i.events.Back(); last == nil || last.mask&linux.IN_Q_OVERFLOW == 0 { + overflow := newEvent(-1, "", linux.IN_Q_OVERFLOW, 0) + i.events.PushBack(overflow) + i.numQueuedEvents++ + } + i.evMu.Unlock() + i.queue.Notify(waiter.ReadableEvents) + return + } + i.events.PushBack(ev) + i.numQueuedEvents++ // Release mutex before notifying waiters because we don't control what they // can do. @@ -324,10 +368,12 @@ func (i *Inotify) nextWatchIDLocked() int32 { } // AddWatch constructs a new inotify watch and adds it to the target. It -// returns the watch descriptor returned by inotify_add_watch(2). +// returns the watch descriptor returned by inotify_add_watch(2). When the +// per-instance watch cap is reached, it returns ENOSPC matching +// inotify_new_watch in Linux fs/notify/inotify/inotify_user.c. // // The caller must hold a reference on target. -func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 { +func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) { // Note: Locking this inotify instance protects the result returned by // Lookup() below. With the lock held, we know for sure the lookup result // won't become stale because it's impossible for *this* instance to @@ -345,12 +391,17 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 { newmask |= existing.mask.Load() } existing.mask.Store(newmask) - return existing.wd + return existing.wd, nil + } + + // Enforce per-instance watch cap before allocating a new Watch. + if len(i.watches) >= maxInotifyWatchesPerInstance { + return 0, linuxerr.ENOSPC } // No existing watch, create a new watch. w := i.newWatchLocked(target, ws, mask) - return w.wd + return w.wd, nil } // RmWatch looks up an inotify watch for the given 'wd' and configures the diff --git a/pkg/sentry/vfs/inotify_test.go b/pkg/sentry/vfs/inotify_test.go new file mode 100644 index 0000000000..24f6f94f84 --- /dev/null +++ b/pkg/sentry/vfs/inotify_test.go @@ -0,0 +1,120 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/contexttest" +) + +// TestInotifyAddWatchReturnsENOSPCAtCap exercises the per-instance watch cap +// added to (*Inotify).AddWatch. After maxInotifyWatchesPerInstance distinct +// watches have been registered, the next AddWatch call must return ENOSPC. +// +// The test uses an anonymous VFS plus a fresh Inotify instance and creates +// the dentries via NewAnonVirtualDentry, which is the minimal fixture that +// gives each watch a unique target. +func TestInotifyAddWatchReturnsENOSPCAtCap(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj := &VirtualFilesystem{} + if err := vfsObj.Init(ctx); err != nil { + t.Fatalf("VFS Init: %v", err) + } + + fd, err := NewInotifyFD(ctx, vfsObj, 0) + if err != nil { + t.Fatalf("NewInotifyFD: %v", err) + } + defer fd.DecRef(ctx) + ino := fd.Impl().(*Inotify) + + dentries := make([]VirtualDentry, 0, maxInotifyWatchesPerInstance+1) + for i := 0; i < maxInotifyWatchesPerInstance; i++ { + vd := vfsObj.NewAnonVirtualDentry("inwatch-cap-test") + dentries = append(dentries, vd) + if _, err := ino.AddWatch(vd.Dentry(), linux.IN_ALL_EVENTS); err != nil { + t.Fatalf("AddWatch #%d returned unexpected error: %v", i, err) + } + } + + // Cap reached. The next distinct target must yield ENOSPC. + vdOver := vfsObj.NewAnonVirtualDentry("inwatch-cap-test-over") + dentries = append(dentries, vdOver) + _, err = ino.AddWatch(vdOver.Dentry(), linux.IN_ALL_EVENTS) + if !linuxerr.Equals(linuxerr.ENOSPC, err) { + t.Errorf("AddWatch at cap+1 = %v, want ENOSPC", err) + } + + for _, vd := range dentries { + vd.DecRef(ctx) + } +} + +// TestInotifyQueueOverflowEmitsMarker verifies that once maxInotifyQueuedEvents +// events have been queued, queueEvent stops appending normal events and emits +// a single IN_Q_OVERFLOW marker that coalesces subsequent overflow attempts. +func TestInotifyQueueOverflowEmitsMarker(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj := &VirtualFilesystem{} + if err := vfsObj.Init(ctx); err != nil { + t.Fatalf("VFS Init: %v", err) + } + + fd, err := NewInotifyFD(ctx, vfsObj, 0) + if err != nil { + t.Fatalf("NewInotifyFD: %v", err) + } + defer fd.DecRef(ctx) + ino := fd.Impl().(*Inotify) + + // Fill the queue with distinct events so coalescing does not collapse them. + for i := 0; i < maxInotifyQueuedEvents; i++ { + ino.queueEvent(newEvent(int32(i), "", linux.IN_ACCESS, 0)) + } + + ino.evMu.Lock() + gotN := ino.numQueuedEvents + ino.evMu.Unlock() + if gotN != maxInotifyQueuedEvents { + t.Fatalf("numQueuedEvents after fill = %d, want %d", gotN, maxInotifyQueuedEvents) + } + + // One more event past the cap. Queue size grows by one because of the + // IN_Q_OVERFLOW marker; the new event itself is dropped. + ino.queueEvent(newEvent(int32(maxInotifyQueuedEvents+1), "", linux.IN_ACCESS, 0)) + + ino.evMu.Lock() + gotN = ino.numQueuedEvents + last := ino.events.Back() + ino.evMu.Unlock() + if gotN != maxInotifyQueuedEvents+1 { + t.Errorf("numQueuedEvents after overflow = %d, want %d", gotN, maxInotifyQueuedEvents+1) + } + if last == nil || last.mask&linux.IN_Q_OVERFLOW == 0 { + t.Errorf("queue tail not an IN_Q_OVERFLOW marker: last=%+v", last) + } + + // Subsequent over-cap events must coalesce into the existing marker. + ino.queueEvent(newEvent(int32(maxInotifyQueuedEvents+2), "", linux.IN_ACCESS, 0)) + ino.evMu.Lock() + gotN2 := ino.numQueuedEvents + ino.evMu.Unlock() + if gotN2 != gotN { + t.Errorf("numQueuedEvents after second overflow = %d, want %d (no new marker)", gotN2, gotN) + } +}