Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions pkg/sentry/fsimpl/kernfs/kernfs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -495,8 +495,14 @@ func TestRmdirInotifyDeleteSelfBeforeParentDelete(t *testing.T) {
parentVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent"))
defer parentVD.DecRef(sys.Ctx)
childVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent/child"))
parentWD := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS)
childWD := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS)
parentWD, err := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS)
if err != nil {
t.Fatalf("AddWatch failed: %v", err)
}
childWD, err := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS)
if err != nil {
t.Fatalf("AddWatch failed: %v", err)
}
childVD.DecRef(sys.Ctx)

if err := sys.VFS.RmdirAt(ctx, sys.Creds, sys.PathOpAtRoot("parent/child")); err != nil {
Expand Down Expand Up @@ -535,8 +541,14 @@ func TestRmdirInotifyWithOpenFDDefersDeleteSelf(t *testing.T) {
parentVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent"))
defer parentVD.DecRef(sys.Ctx)
childVD := sys.GetDentryOrDie(sys.PathOpAtRoot("parent/child"))
parentWD := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS)
childWD := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS)
parentWD, err := ino.AddWatch(parentVD.Dentry(), linux.IN_ALL_EVENTS)
if err != nil {
t.Fatalf("AddWatch failed: %v", err)
}
childWD, err := ino.AddWatch(childVD.Dentry(), linux.IN_ALL_EVENTS)
if err != nil {
t.Fatalf("AddWatch failed: %v", err)
}
childVD.DecRef(sys.Ctx)

childFD, err := sys.VFS.OpenAt(ctx, sys.Creds, sys.PathOpAtRoot("parent/child"), &vfs.OpenOptions{Flags: linux.O_RDONLY})
Expand Down
6 changes: 5 additions & 1 deletion pkg/sentry/syscalls/linux/sys_inotify.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,11 @@ func InotifyAddWatch(t *kernel.Task, sysno uintptr, args arch.SyscallArguments)
}
defer d.DecRef(t)

return uintptr(ino.AddWatch(d.Dentry(), mask)), nil, nil
wd, err := ino.AddWatch(d.Dentry(), mask)
if err != nil {
return 0, nil, err
}
return uintptr(wd), nil, nil
}

// InotifyRmWatch implements the inotify_rm_watch() syscall.
Expand Down
1 change: 1 addition & 0 deletions pkg/sentry/vfs/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ go_test(
size = "small",
srcs = [
"file_description_impl_util_test.go",
"inotify_test.go",
"mount_test.go",
],
library = ":vfs",
Expand Down
59 changes: 55 additions & 4 deletions pkg/sentry/vfs/inotify.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,28 @@ import (
// must be a power of 2 for rounding below.
const inotifyEventBaseSize = 16

// Resource limits applied to each inotify instance. The values mirror the
// Linux defaults referenced from fs/notify/inotify/inotify_user.c.
//
// Linux accounts max_user_watches and max_user_instances per user namespace
// through UCOUNT_INOTIFY_*. gVisor has no equivalent ucount infrastructure in
// pkg/sentry/kernel/auth yet, so both limits below are scoped to a single
// inotify instance instead.
//
// TODO: add per-user-namespace accounting for max_user_instances and tighten
// max_user_watches to per-user across all instances rather than per-instance.
const (
	// maxInotifyQueuedEvents is the number of pending events one inotify
	// instance may hold before further events are dropped; it corresponds
	// to the default of Linux's fs.inotify.max_queued_events (16384). Once
	// the limit is hit a single IN_Q_OVERFLOW marker is appended and later
	// events are discarded until the queue drains, following
	// fsnotify_insert_event in Linux. The marker itself is queued on top of
	// the limit, so the list can briefly hold one entry more than this
	// value.
	maxInotifyQueuedEvents = 16384

	// maxInotifyWatchesPerInstance is the number of watches one inotify
	// instance may hold; it corresponds to the default of Linux's
	// fs.inotify.max_user_watches (8192).
	maxInotifyWatchesPerInstance = 8192
)

// EventType defines different kinds of inotify events.
//
// The way events are labelled appears somewhat arbitrary, but they must match
Expand Down Expand Up @@ -77,6 +99,11 @@ type Inotify struct {
// A list of pending events for this inotify instance. Protected by evMu.
events eventList

// numQueuedEvents counts the entries in events. Protected by evMu.
// Tracked explicitly because eventList is a generic intrusive list with
// no built-in length.
numQueuedEvents int

// A scratch buffer, used to serialize inotify events. Allocate this
// ahead of time for the sake of performance. Protected by evMu.
scratch []byte
Expand Down Expand Up @@ -240,6 +267,7 @@ func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOpt
// buffer space to copy it out, even if the copy below fails. Emulate
// this behaviour.
i.events.Remove(event)
i.numQueuedEvents--

// Buffer has enough space, copy event to the read buffer.
n, err := event.CopyTo(ctx, i.scratch, dst)
Expand Down Expand Up @@ -288,7 +316,23 @@ func (i *Inotify) queueEvent(ev *Event) {
}
}

// Enforce per-instance queue cap matching Linux fs.inotify.max_queued_events.
// When the queue is full, emit a single IN_Q_OVERFLOW marker if the tail of
// the queue is not already an overflow marker, and drop subsequent events
// until the queue drains. This matches fsnotify_insert_event in Linux.
if i.numQueuedEvents >= maxInotifyQueuedEvents {
if last := i.events.Back(); last == nil || last.mask&linux.IN_Q_OVERFLOW == 0 {
overflow := newEvent(-1, "", linux.IN_Q_OVERFLOW, 0)
i.events.PushBack(overflow)
i.numQueuedEvents++
}
i.evMu.Unlock()
i.queue.Notify(waiter.ReadableEvents)
return
}

i.events.PushBack(ev)
i.numQueuedEvents++

// Release mutex before notifying waiters because we don't control what they
// can do.
Expand Down Expand Up @@ -324,10 +368,12 @@ func (i *Inotify) nextWatchIDLocked() int32 {
}

// AddWatch constructs a new inotify watch and adds it to the target. It
// returns the watch descriptor returned by inotify_add_watch(2).
// returns the watch descriptor returned by inotify_add_watch(2). When the
// per-instance watch cap is reached, it returns ENOSPC matching
// inotify_new_watch in Linux fs/notify/inotify/inotify_user.c.
//
// The caller must hold a reference on target.
func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 {
func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) {
// Note: Locking this inotify instance protects the result returned by
// Lookup() below. With the lock held, we know for sure the lookup result
// won't become stale because it's impossible for *this* instance to
Expand All @@ -345,12 +391,17 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 {
newmask |= existing.mask.Load()
}
existing.mask.Store(newmask)
return existing.wd
return existing.wd, nil
}

// Enforce per-instance watch cap before allocating a new Watch.
if len(i.watches) >= maxInotifyWatchesPerInstance {
return 0, linuxerr.ENOSPC
}

// No existing watch, create a new watch.
w := i.newWatchLocked(target, ws, mask)
return w.wd
return w.wd, nil
}

// RmWatch looks up an inotify watch for the given 'wd' and configures the
Expand Down
120 changes: 120 additions & 0 deletions pkg/sentry/vfs/inotify_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// Copyright 2026 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
"testing"

"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
)

// TestInotifyAddWatchReturnsENOSPCAtCap exercises the per-instance watch cap
// enforced by (*Inotify).AddWatch. It registers maxInotifyWatchesPerInstance
// watches on distinct targets and then checks that:
//
//   - a watch on one further distinct target is rejected with ENOSPC, and
//   - re-adding a target that is already watched still succeeds and returns
//     the original watch descriptor, because AddWatch handles the
//     existing-watch case before it consults the cap.
//
// The test uses an anonymous VFS plus a fresh Inotify instance and creates
// the dentries via NewAnonVirtualDentry, which is the minimal fixture that
// gives each watch a unique target.
func TestInotifyAddWatchReturnsENOSPCAtCap(t *testing.T) {
	ctx := contexttest.Context(t)
	vfsObj := &VirtualFilesystem{}
	if err := vfsObj.Init(ctx); err != nil {
		t.Fatalf("VFS Init: %v", err)
	}

	fd, err := NewInotifyFD(ctx, vfsObj, 0)
	if err != nil {
		t.Fatalf("NewInotifyFD: %v", err)
	}
	defer fd.DecRef(ctx)
	ino := fd.Impl().(*Inotify)

	// Drop every dentry reference on the way out, including when a Fatalf
	// below aborts the test early. A single deferred closure is used rather
	// than one defer per iteration; it runs before the fd.DecRef deferred
	// above, preserving the release order of a trailing cleanup loop.
	dentries := make([]VirtualDentry, 0, maxInotifyWatchesPerInstance+1)
	defer func() {
		for _, vd := range dentries {
			vd.DecRef(ctx)
		}
	}()

	var firstWD int32
	for i := 0; i < maxInotifyWatchesPerInstance; i++ {
		vd := vfsObj.NewAnonVirtualDentry("inwatch-cap-test")
		dentries = append(dentries, vd)
		wd, err := ino.AddWatch(vd.Dentry(), linux.IN_ALL_EVENTS)
		if err != nil {
			t.Fatalf("AddWatch #%d returned unexpected error: %v", i, err)
		}
		if i == 0 {
			firstWD = wd
		}
	}

	// Cap reached. The next distinct target must yield ENOSPC.
	vdOver := vfsObj.NewAnonVirtualDentry("inwatch-cap-test-over")
	dentries = append(dentries, vdOver)
	_, err = ino.AddWatch(vdOver.Dentry(), linux.IN_ALL_EVENTS)
	if !linuxerr.Equals(linuxerr.ENOSPC, err) {
		t.Errorf("AddWatch at cap+1 = %v, want ENOSPC", err)
	}

	// Updating a watch that already exists allocates nothing, so it must
	// keep working at the cap and hand back the same descriptor.
	gotWD, err := ino.AddWatch(dentries[0].Dentry(), linux.IN_ALL_EVENTS)
	if err != nil {
		t.Errorf("AddWatch on already-watched target at cap returned unexpected error: %v", err)
	} else if gotWD != firstWD {
		t.Errorf("AddWatch on already-watched target at cap = wd %d, want %d", gotWD, firstWD)
	}
}

// TestInotifyQueueOverflowEmitsMarker checks queueEvent's behavior at the
// maxInotifyQueuedEvents cap: the first event past the cap is dropped and
// replaced by one IN_Q_OVERFLOW marker at the tail of the queue, and any
// further event past the cap is dropped without adding a second marker.
func TestInotifyQueueOverflowEmitsMarker(t *testing.T) {
	ctx := contexttest.Context(t)
	vfsObj := &VirtualFilesystem{}
	if initErr := vfsObj.Init(ctx); initErr != nil {
		t.Fatalf("VFS Init: %v", initErr)
	}

	inotifyFD, err := NewInotifyFD(ctx, vfsObj, 0)
	if err != nil {
		t.Fatalf("NewInotifyFD: %v", err)
	}
	defer inotifyFD.DecRef(ctx)
	ino := inotifyFD.Impl().(*Inotify)

	// enqueue queues one IN_ACCESS event carrying the given watch
	// descriptor and an empty name.
	enqueue := func(wd int32) {
		ino.queueEvent(newEvent(wd, "", linux.IN_ACCESS, 0))
	}
	// queueLen reads the queued-event counter under evMu.
	queueLen := func() int {
		ino.evMu.Lock()
		defer ino.evMu.Unlock()
		return ino.numQueuedEvents
	}

	// Every event gets its own watch descriptor so that coalescing cannot
	// merge neighbours while the queue is being filled.
	for wd := int32(0); wd < maxInotifyQueuedEvents; wd++ {
		enqueue(wd)
	}
	if filled := queueLen(); filled != maxInotifyQueuedEvents {
		t.Fatalf("numQueuedEvents after fill = %d, want %d", filled, maxInotifyQueuedEvents)
	}

	// First event beyond the cap: the event is discarded, but the queue
	// grows by one because the IN_Q_OVERFLOW marker is appended.
	enqueue(maxInotifyQueuedEvents + 1)

	ino.evMu.Lock()
	afterOverflow := ino.numQueuedEvents
	tail := ino.events.Back()
	ino.evMu.Unlock()
	if afterOverflow != maxInotifyQueuedEvents+1 {
		t.Errorf("numQueuedEvents after overflow = %d, want %d", afterOverflow, maxInotifyQueuedEvents+1)
	}
	if tail == nil || tail.mask&linux.IN_Q_OVERFLOW == 0 {
		t.Errorf("queue tail not an IN_Q_OVERFLOW marker: last=%+v", tail)
	}

	// Second event beyond the cap: the marker is already at the tail, so
	// the queue length must stay where it was.
	enqueue(maxInotifyQueuedEvents + 2)
	if afterSecond := queueLen(); afterSecond != afterOverflow {
		t.Errorf("numQueuedEvents after second overflow = %d, want %d (no new marker)", afterSecond, afterOverflow)
	}
}