From 5f413d598c71f8ad0e8d7de562090c685a69575f Mon Sep 17 00:00:00 2001
From: ibondarenko1 <sactransport2000@gmail.com>
Date: Thu, 14 May 2026 13:35:20 -0700
Subject: [PATCH] vfs: add per-instance inotify watch and event-queue caps

Inotify in pkg/sentry/vfs/inotify.go has no upper bound on the number of
watches a single instance can hold or on the depth of its pending-event
queue. AddWatch grows i.watches (line 313) and the target's Watches.ws
(line 433) without a size check; queueEvent (line 275) appends to
i.events without checking length.

Linux fs/notify/inotify/inotify_user.c caps both. The kernel returns
ENOSPC from inotify_new_watch when the per-user UCOUNT_INOTIFY_WATCHES
quota is reached (default 8192), and fsnotify_insert_event emits a
single IN_Q_OVERFLOW marker when group->q_len reaches max_events
(default 16384). Without these caps, an unprivileged sandboxed process
can grow the sentry heap without bound.

Witness (kali, runsc release-20260406.0, alpine 3.20 sandbox):

  Baseline VmRSS = 52 MB.
  inotify_add_watch x 200000 distinct dirs from one inotify fd.
  Post-flood VmRSS = 510 MB. No ENOSPC returned at any step.
  Sustainable growth rate approximately 4 MB per second.
  Default sentry memory caps would OOM within minutes.

Add two per-instance caps matching the Linux default values:

  maxInotifyWatchesPerInstance = 8192
  maxInotifyQueuedEvents       = 16384

AddWatch now returns (int32, error) and returns ENOSPC once
len(i.watches) reaches the cap. queueEvent tracks queue length via
numQueuedEvents under evMu and, on overflow, emits a single
IN_Q_OVERFLOW marker (wd = -1, mask = IN_Q_OVERFLOW) at the queue
tail unless one is already there. Subsequent overflowing events are
dropped silently, matching Linux fsnotify_insert_event.

A separate Linux limit, fs.inotify.max_user_instances (default 128),
is enforced per user namespace via UCOUNT_INOTIFY_INSTANCES in the
kernel. gVisor does not have an equivalent UCOUNT infrastructure in
pkg/sentry/kernel/auth today; that cap is deferred to a follow-up
change once the supporting accounting is in place.

The AddWatch signature change requires two call-site updates:
  pkg/sentry/syscalls/linux/sys_inotify.go - propagate the error to
    the inotify_add_watch(2) caller.
  pkg/sentry/fsimpl/kernfs/kernfs_test.go - existing tests use t.Fatal
    on unexpected errors.

Adds two regression tests in pkg/sentry/vfs/inotify_test.go:
  TestInotifyAddWatchReturnsENOSPCAtCap
  TestInotifyQueueOverflowEmitsMarker

Tested:
  gofmt -l pkg/sentry/vfs/inotify.go pkg/sentry/vfs/inotify_test.go             pkg/sentry/syscalls/linux/sys_inotify.go             pkg/sentry/fsimpl/kernfs/kernfs_test.go
  (clean)

Related: CVE-2023-7258 (gVisor mount-point ref-counting DoS, CWE-400,
CVSS 4.8) is the class precedent. The attacker prerequisites here
are lower (no CAP_SYS_ADMIN, no mount permission required).
---
 pkg/sentry/fsimpl/kernfs/kernfs_test.go  |  20 +++-
 pkg/sentry/syscalls/linux/sys_inotify.go |   6 +-
 pkg/sentry/vfs/BUILD                     |   1 +
 pkg/sentry/vfs/inotify.go                |  59 ++++++++++-
 pkg/sentry/vfs/inotify_test.go           | 120 +++++++++++++++++++++++
 5 files changed, 197 insertions(+), 9 deletions(-)
 create mode 100644 pkg/sentry/vfs/inotify_test.go

diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index c4bb3aa749..7711956870 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -495,8 +495,14 @@ func TestRmdirInotifyDeleteSelfBeforeParentDelete(t *testing.T) {
 	parentVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent"))
 	defer parentVD.DecRef(sys.Ctx)
 	childVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent/child"))
-	parentWD := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS)
-	childWD := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS)
+	parentWD, err := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS)
+	if err != nil {
+		t.Fatalf("AddWatch failed: %v", err)
+	}
+	childWD, err := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS)
+	if err != nil {
+		t.Fatalf("AddWatch failed: %v", err)
+	}
 	childVD.DecRef(sys.Ctx)
 
 	if err := sys.VFS.RmdirAt(ctx, sys.Creds, sys.PathOpAtRoot("parent/child")); err != nil {
@@ -535,8 +541,14 @@ func TestRmdirInotifyWithOpenFDDefersDeleteSelf(t *testing.T) {
 	parentVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent"))
 	defer parentVD.DecRef(sys.Ctx)
 	childVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent/child"))
-	parentWD := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS)
-	childWD := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS)
+	parentWD, err := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS)
+	if err != nil {
+		t.Fatalf("AddWatch failed: %v", err)
+	}
+	childWD, err := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS)
+	if err != nil {
+		t.Fatalf("AddWatch failed: %v", err)
+	}
 	childVD.DecRef(sys.Ctx)
 
 	childFD, err := sys.VFS.OpenAt(ctx, sys.Creds, sys.PathOpAtRoot("parent/child"), &vfs.OpenOptions{Flags: linux.O_RDONLY})
diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go
index 45a350075d..2a4616c0a5 100644
--- a/pkg/sentry/syscalls/linux/sys_inotify.go
+++ b/pkg/sentry/syscalls/linux/sys_inotify.go
@@ -116,7 +116,11 @@ func InotifyAddWatch(t *kernel.Task, sysno uintptr, args arch.SyscallArguments)
 	}
 	defer d.DecRef(t)
 
-	return uintptr(ino.AddWatch(d.Dentry(), mask)), nil, nil
+	wd, err := ino.AddWatch(d.Dentry(), mask)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(wd), nil, nil
 }
 
 // InotifyRmWatch implements the inotify_rm_watch() syscall.
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index e46acc3222..46ff76e37b 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -211,6 +211,7 @@ go_test(
     size = "small",
     srcs = [
         "file_description_impl_util_test.go",
+        "inotify_test.go",
         "mount_test.go",
     ],
     library = ":vfs",
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
index f20bfb62cd..d9017035d6 100644
--- a/pkg/sentry/vfs/inotify.go
+++ b/pkg/sentry/vfs/inotify.go
@@ -35,6 +35,28 @@ import (
 // must be a power 2 for rounding below.
 const inotifyEventBaseSize = 16
 
+// Per-instance resource caps matching Linux defaults at
+// fs/notify/inotify/inotify_user.c. Linux enforces max_user_watches and
+// max_user_instances per user namespace via UCOUNT_INOTIFY_*; gVisor does
+// not yet have an equivalent ucount infrastructure in pkg/sentry/kernel/auth,
+// so the watch and queue caps below are enforced per-inotify-instance.
+//
+// TODO: add per-user-namespace accounting for max_user_instances and tighten
+// max_user_watches to per-user across all instances rather than per-instance.
+const (
+	// maxInotifyWatchesPerInstance bounds the number of watches a single
+	// inotify instance can hold. Linux fs.inotify.max_user_watches default
+	// is 8192.
+	maxInotifyWatchesPerInstance = 8192
+
+	// maxInotifyQueuedEvents bounds the number of events held in a single
+	// inotify instance's pending-event queue. Linux fs.inotify.max_queued_events
+	// default is 16384. When the cap is reached, gVisor emits a single
+	// IN_Q_OVERFLOW marker and drops subsequent events until the queue drains,
+	// as Linux does in fsnotify_insert_event.
+	maxInotifyQueuedEvents = 16384
+)
+
 // EventType defines different kinds of inotfiy events.
 //
 // The way events are labelled appears somewhat arbitrary, but they must match
@@ -77,6 +99,11 @@ type Inotify struct {
 	// A list of pending events for this inotify instance. Protected by evMu.
 	events eventList
 
+	// numQueuedEvents counts the entries in events. Protected by evMu.
+	// Tracked explicitly because eventList is a generic intrusive list with
+	// no built-in length.
+	numQueuedEvents int
+
 	// A scratch buffer, used to serialize inotify events. Allocate this
 	// ahead of time for the sake of performance. Protected by evMu.
 	scratch []byte
@@ -240,6 +267,7 @@ func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOpt
 		// buffer space to copy it out, even if the copy below fails. Emulate
 		// this behaviour.
 		i.events.Remove(event)
+		i.numQueuedEvents--
 
 		// Buffer has enough space, copy event to the read buffer.
 		n, err := event.CopyTo(ctx, i.scratch, dst)
@@ -288,7 +316,23 @@ func (i *Inotify) queueEvent(ev *Event) {
 		}
 	}
 
+	// Enforce per-instance queue cap matching Linux fs.inotify.max_queued_events.
+	// When the queue is full, emit a single IN_Q_OVERFLOW marker if the tail of
+	// the queue is not already an overflow marker, and drop subsequent events
+	// until the queue drains. This matches fsnotify_insert_event in Linux.
+	if i.numQueuedEvents >= maxInotifyQueuedEvents {
+		if last := i.events.Back(); last == nil || last.mask&linux.IN_Q_OVERFLOW == 0 {
+			overflow := newEvent(-1, "", linux.IN_Q_OVERFLOW, 0)
+			i.events.PushBack(overflow)
+			i.numQueuedEvents++
+		}
+		i.evMu.Unlock()
+		i.queue.Notify(waiter.ReadableEvents)
+		return
+	}
+
 	i.events.PushBack(ev)
+	i.numQueuedEvents++
 
 	// Release mutex before notifying waiters because we don't control what they
 	// can do.
@@ -324,10 +368,12 @@ func (i *Inotify) nextWatchIDLocked() int32 {
 }
 
 // AddWatch constructs a new inotify watch and adds it to the target. It
-// returns the watch descriptor returned by inotify_add_watch(2).
+// returns the watch descriptor returned by inotify_add_watch(2). When the
+// per-instance watch cap is reached, it returns ENOSPC matching
+// inotify_new_watch in Linux fs/notify/inotify/inotify_user.c.
 //
 // The caller must hold a reference on target.
-func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 {
+func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) {
 	// Note: Locking this inotify instance protects the result returned by
 	// Lookup() below. With the lock held, we know for sure the lookup result
 	// won't become stale because it's impossible for *this* instance to
@@ -345,12 +391,17 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 {
 			newmask |= existing.mask.Load()
 		}
 		existing.mask.Store(newmask)
-		return existing.wd
+		return existing.wd, nil
+	}
+
+	// Enforce per-instance watch cap before allocating a new Watch.
+	if len(i.watches) >= maxInotifyWatchesPerInstance {
+		return 0, linuxerr.ENOSPC
 	}
 
 	// No existing watch, create a new watch.
 	w := i.newWatchLocked(target, ws, mask)
-	return w.wd
+	return w.wd, nil
 }
 
 // RmWatch looks up an inotify watch for the given 'wd' and configures the
diff --git a/pkg/sentry/vfs/inotify_test.go b/pkg/sentry/vfs/inotify_test.go
new file mode 100644
index 0000000000..24f6f94f84
--- /dev/null
+++ b/pkg/sentry/vfs/inotify_test.go
@@ -0,0 +1,120 @@
+// Copyright 2026 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/errors/linuxerr"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+)
+
+// TestInotifyAddWatchReturnsENOSPCAtCap exercises the per-instance watch cap
+// added to (*Inotify).AddWatch. After maxInotifyWatchesPerInstance distinct
+// watches have been registered, the next AddWatch call must return ENOSPC.
+//
+// The test uses an anonymous VFS plus a fresh Inotify instance and creates
+// the dentries via NewAnonVirtualDentry, which is the minimal fixture that
+// gives each watch a unique target.
+func TestInotifyAddWatchReturnsENOSPCAtCap(t *testing.T) {
+	ctx := contexttest.Context(t)
+	vfsObj := &VirtualFilesystem{}
+	if err := vfsObj.Init(ctx); err != nil {
+		t.Fatalf("VFS Init: %v", err)
+	}
+
+	fd, err := NewInotifyFD(ctx, vfsObj, 0)
+	if err != nil {
+		t.Fatalf("NewInotifyFD: %v", err)
+	}
+	defer fd.DecRef(ctx)
+	ino := fd.Impl().(*Inotify)
+
+	dentries := make([]VirtualDentry, 0, maxInotifyWatchesPerInstance+1)
+	for i := 0; i < maxInotifyWatchesPerInstance; i++ {
+		vd := vfsObj.NewAnonVirtualDentry("inwatch-cap-test")
+		dentries = append(dentries, vd)
+		if _, err := ino.AddWatch(vd.Dentry(), linux.IN_ALL_EVENTS); err != nil {
+			t.Fatalf("AddWatch #%d returned unexpected error: %v", i, err)
+		}
+	}
+
+	// Cap reached. The next distinct target must yield ENOSPC.
+	vdOver := vfsObj.NewAnonVirtualDentry("inwatch-cap-test-over")
+	dentries = append(dentries, vdOver)
+	_, err = ino.AddWatch(vdOver.Dentry(), linux.IN_ALL_EVENTS)
+	if !linuxerr.Equals(linuxerr.ENOSPC, err) {
+		t.Errorf("AddWatch at cap+1 = %v, want ENOSPC", err)
+	}
+
+	for _, vd := range dentries {
+		vd.DecRef(ctx)
+	}
+}
+
+// TestInotifyQueueOverflowEmitsMarker verifies that once maxInotifyQueuedEvents
+// events have been queued, queueEvent stops appending normal events and emits
+// a single IN_Q_OVERFLOW marker that coalesces subsequent overflow attempts.
+func TestInotifyQueueOverflowEmitsMarker(t *testing.T) {
+	ctx := contexttest.Context(t)
+	vfsObj := &VirtualFilesystem{}
+	if err := vfsObj.Init(ctx); err != nil {
+		t.Fatalf("VFS Init: %v", err)
+	}
+
+	fd, err := NewInotifyFD(ctx, vfsObj, 0)
+	if err != nil {
+		t.Fatalf("NewInotifyFD: %v", err)
+	}
+	defer fd.DecRef(ctx)
+	ino := fd.Impl().(*Inotify)
+
+	// Fill the queue with distinct events so coalescing does not collapse them.
+	for i := 0; i < maxInotifyQueuedEvents; i++ {
+		ino.queueEvent(newEvent(int32(i), "", linux.IN_ACCESS, 0))
+	}
+
+	ino.evMu.Lock()
+	gotN := ino.numQueuedEvents
+	ino.evMu.Unlock()
+	if gotN != maxInotifyQueuedEvents {
+		t.Fatalf("numQueuedEvents after fill = %d, want %d", gotN, maxInotifyQueuedEvents)
+	}
+
+	// One more event past the cap. Queue size grows by one because of the
+	// IN_Q_OVERFLOW marker; the new event itself is dropped.
+	ino.queueEvent(newEvent(int32(maxInotifyQueuedEvents+1), "", linux.IN_ACCESS, 0))
+
+	ino.evMu.Lock()
+	gotN = ino.numQueuedEvents
+	last := ino.events.Back()
+	ino.evMu.Unlock()
+	if gotN != maxInotifyQueuedEvents+1 {
+		t.Errorf("numQueuedEvents after overflow = %d, want %d", gotN, maxInotifyQueuedEvents+1)
+	}
+	if last == nil || last.mask&linux.IN_Q_OVERFLOW == 0 {
+		t.Errorf("queue tail not an IN_Q_OVERFLOW marker: last=%+v", last)
+	}
+
+	// Subsequent over-cap events must coalesce into the existing marker.
+	ino.queueEvent(newEvent(int32(maxInotifyQueuedEvents+2), "", linux.IN_ACCESS, 0))
+	ino.evMu.Lock()
+	gotN2 := ino.numQueuedEvents
+	ino.evMu.Unlock()
+	if gotN2 != gotN {
+		t.Errorf("numQueuedEvents after second overflow = %d, want %d (no new marker)", gotN2, gotN)
+	}
+}