diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index c4bb3aa749..7711956870 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -495,8 +495,14 @@ func TestRmdirInotifyDeleteSelfBeforeParentDelete(t *testing.T) { parentVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent")) defer parentVD.DecRef(sys.Ctx) childVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent/child")) - parentWD := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS) - childWD := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS) + parentWD, err := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS) + if err != nil { + t.Fatalf("AddWatch failed: %v", err) + } + childWD, err := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS) + if err != nil { + t.Fatalf("AddWatch failed: %v", err) + } childVD.DecRef(sys.Ctx) if err := sys.VFS.RmdirAt(ctx, sys.Creds, sys.PathOpAtRoot("parent/child")); err != nil { @@ -535,8 +541,14 @@ func TestRmdirInotifyWithOpenFDDefersDeleteSelf(t *testing.T) { parentVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent")) defer parentVD.DecRef(sys.Ctx) childVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent/child")) - parentWD := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS) - childWD := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS) + parentWD, err := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS) + if err != nil { + t.Fatalf("AddWatch failed: %v", err) + } + childWD, err := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS) + if err != nil { + t.Fatalf("AddWatch failed: %v", err) + } childVD.DecRef(sys.Ctx) childFD, err := sys.VFS.OpenAt(ctx, sys.Creds, sys.PathOpAtRoot("parent/child"), &vfs.OpenOptions{Flags: linux.O_RDONLY}) diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index 45a350075d..2a4616c0a5 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -116,7 +116,11 
@@ func InotifyAddWatch(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) } defer d.DecRef(t) - return uintptr(ino.AddWatch(d.Dentry(), mask)), nil, nil + wd, err := ino.AddWatch(d.Dentry(), mask) + if err != nil { + return 0, nil, err + } + return uintptr(wd), nil, nil } // InotifyRmWatch implements the inotify_rm_watch() syscall. diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index f20bfb62cd..d9017035d6 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -35,6 +35,28 @@ import ( // must be a power 2 for rounding below. const inotifyEventBaseSize = 16 +// Per-instance resource caps matching Linux defaults at +// fs/notify/inotify/inotify_user.c. Linux enforces max_user_watches and +// max_user_instances per user namespace via UCOUNT_INOTIFY_*; gVisor does +// not yet have an equivalent ucount infrastructure in pkg/sentry/kernel/auth, +// so the watch and queue caps below are enforced per-inotify-instance. +// +// TODO: add per-user-namespace accounting for max_user_instances and tighten +// max_user_watches to per-user across all instances rather than per-instance. +const ( + // maxInotifyWatchesPerInstance bounds the number of watches a single + // inotify instance can hold. Linux fs.inotify.max_user_watches default + // is 8192. + maxInotifyWatchesPerInstance = 8192 + + // maxInotifyQueuedEvents bounds the number of events held in a single + // inotify instance's pending-event queue. Linux fs.inotify.max_queued_events + // default is 16384. When the cap is reached, gVisor emits a single + // IN_Q_OVERFLOW marker and drops subsequent events until the queue drains, + // as Linux does in fsnotify_insert_event. + maxInotifyQueuedEvents = 16384 +) + // EventType defines different kinds of inotfiy events. // // The way events are labelled appears somewhat arbitrary, but they must match @@ -77,6 +99,11 @@ type Inotify struct { // A list of pending events for this inotify instance. Protected by evMu. 
events eventList + // numQueuedEvents counts the entries in events. Protected by evMu. + // Tracked explicitly because eventList is a generic intrusive list with + // no built-in length. + numQueuedEvents int + // A scratch buffer, used to serialize inotify events. Allocate this // ahead of time for the sake of performance. Protected by evMu. scratch []byte @@ -240,6 +267,7 @@ func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOpt // buffer space to copy it out, even if the copy below fails. Emulate // this behaviour. i.events.Remove(event) + i.numQueuedEvents-- // Buffer has enough space, copy event to the read buffer. n, err := event.CopyTo(ctx, i.scratch, dst) @@ -288,7 +316,23 @@ func (i *Inotify) queueEvent(ev *Event) { } } + // Enforce per-instance queue cap matching Linux fs.inotify.max_queued_events. + // When the queue is full, emit a single IN_Q_OVERFLOW marker if the tail of + // the queue is not already an overflow marker, and drop subsequent events + // until the queue drains. This matches fsnotify_insert_event in Linux. + if i.numQueuedEvents >= maxInotifyQueuedEvents { + if last := i.events.Back(); last == nil || last.mask&linux.IN_Q_OVERFLOW == 0 { + overflow := newEvent(-1, "", linux.IN_Q_OVERFLOW, 0) + i.events.PushBack(overflow) + i.numQueuedEvents++ + } + i.evMu.Unlock() + i.queue.Notify(waiter.ReadableEvents) + return + } + i.events.PushBack(ev) + i.numQueuedEvents++ // Release mutex before notifying waiters because we don't control what they // can do. @@ -324,10 +368,12 @@ func (i *Inotify) nextWatchIDLocked() int32 { } // AddWatch constructs a new inotify watch and adds it to the target. It -// returns the watch descriptor returned by inotify_add_watch(2). +// returns the watch descriptor returned by inotify_add_watch(2). When the +// per-instance watch cap is reached, it returns ENOSPC matching +// inotify_new_watch in Linux fs/notify/inotify/inotify_user.c. // // The caller must hold a reference on target. 
-func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 { +func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) { // Note: Locking this inotify instance protects the result returned by // Lookup() below. With the lock held, we know for sure the lookup result // won't become stale because it's impossible for *this* instance to @@ -345,12 +391,17 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 { newmask |= existing.mask.Load() } existing.mask.Store(newmask) - return existing.wd + return existing.wd, nil + } + + // Enforce per-instance watch cap before allocating a new Watch. + if len(i.watches) >= maxInotifyWatchesPerInstance { + return 0, linuxerr.ENOSPC } // No existing watch, create a new watch. w := i.newWatchLocked(target, ws, mask) - return w.wd + return w.wd, nil } // RmWatch looks up an inotify watch for the given 'wd' and configures the diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index 5c01531850..1d8b6cb3b5 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -2692,6 +2692,102 @@ TEST(Inotify, KernfsBasic) { ASSERT_THAT(events, Are({Event(IN_OPEN, wd), Event(IN_ACCESS, wd)})); } +TEST(Inotify, WatchCapReturnsENOSPC) { + // gVisor caps per-instance watches at the lower-bound Linux default (8192). + // Linux caps per-user via /proc/sys/fs/inotify/max_user_watches; that + // sysctl ranges from 8192 on small-memory systems up to 1048576 on larger + // hosts. Read the configured cap and verify ENOSPC at cap+1 against + // whichever cap applies on this runner. The test is parameterised on the + // sysctl so it compares directly to whatever Linux is enforcing. + std::string contents; + ASSERT_NO_ERRNO(GetContents("/proc/sys/fs/inotify/max_user_watches", + &contents)); + const long cap = std::stol(contents); + // Linux runners with high max_user_watches make this test prohibitively + // slow to fill. 
Skip when the cap is beyond a sane test budget; the + // gVisor hard cap (8192) is always inside the budget so coverage holds + // there. + SKIP_IF(cap <= 0 || cap > 16384); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + std::vector<TempPath> subs; + subs.reserve(cap + 1); + for (long i = 0; i < cap; i++) { + auto sub = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path())); + ASSERT_THAT(inotify_add_watch(fd.get(), sub.path().c_str(), IN_ALL_EVENTS), + SyscallSucceeds()); + subs.push_back(std::move(sub)); + } + + auto over = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path())); + EXPECT_THAT(inotify_add_watch(fd.get(), over.path().c_str(), IN_ALL_EVENTS), + SyscallFailsWithErrno(ENOSPC)); +} + +TEST(Inotify, QueueOverflowEmitsMarker) { + // gVisor and Linux both cap queued inotify events at + // /proc/sys/fs/inotify/max_queued_events (Linux default 16384; gVisor + // hard-caps the per-instance queue at the same value). On overflow, both + // emit a single IN_Q_OVERFLOW marker (wd == -1) at the queue tail and drop + // subsequent events until reads drain the queue. + std::string contents; + ASSERT_NO_ERRNO(GetContents("/proc/sys/fs/inotify/max_queued_events", + &contents)); + const long max_q = std::stol(contents); + SKIP_IF(max_q <= 0 || max_q > 16384); + + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_CREATE)); + + // Generate distinct create events so adjacent-coalescing does not collapse + // them. Push past the cap so the kernel must emit IN_Q_OVERFLOW.
+ for (long i = 0; i < max_q + 16; i++) { + const std::string name = absl::StrCat(root.path(), "/q", i); + ASSERT_NO_ERRNO_AND_VALUE( + Open(name, O_CREAT | O_EXCL | O_WRONLY, 0644)); + } + + const std::vector<Event> events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + bool saw_overflow = false; + for (const auto& ev : events) { + if (ev.mask & IN_Q_OVERFLOW) { + EXPECT_EQ(ev.wd, -1); + saw_overflow = true; + } + } + EXPECT_TRUE(saw_overflow) + << "expected IN_Q_OVERFLOW marker after queue overflow"; + // The drained event count never exceeds the cap plus the overflow marker. + EXPECT_LE(events.size(), static_cast<size_t>(max_q + 1)); + + // After the overflow marker is consumed by DrainEvents, the queue should + // accept new events again. Trigger one more create and verify the next + // drain returns at least one IN_CREATE event on wd. + const std::string recovery_name = absl::StrCat(root.path(), "/recovered"); + ASSERT_NO_ERRNO_AND_VALUE( + Open(recovery_name, O_CREAT | O_EXCL | O_WRONLY, 0644)); + const std::vector<Event> recovered = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + bool saw_post_overflow_event = false; + for (const auto& ev : recovered) { + if (ev.wd == wd && (ev.mask & IN_CREATE)) { + saw_post_overflow_event = true; + break; + } + } + EXPECT_TRUE(saw_post_overflow_event) + << "queue did not accept events after IN_Q_OVERFLOW was drained"; +} + } // namespace } // namespace testing } // namespace gvisor