Skip to content

Commit 22c9862

Browse files
ejortegau and Claude committed
smartconnpool: add per-pool waiter cap to limit queries waiting for a connection (#827)
This is backport of vitessio#19811 Adds a configurable limit on the number of queries that can wait for a connection from each pool. When the cap is reached, new requests are rejected immediately with RESOURCE_EXHAUSTED instead of queueing unboundedly. New flags (Default 0 = unlimited): --queryserver-config-query-pool-waiter-cap --queryserver-config-stream-pool-waiter-cap --queryserver-config-txpool-waiter-cap How it works Cap enforcement lives inside waitlist.waitForConn using a double-checked locking pattern: a lockless atomic pre-check rejects most over-cap requests early, and a strict check under the mutex guarantees correctness. WaitCount is now incremented via a callback (onWait) when the wait actually begins (before mutex acquisition), not after completion. A new WaiterCapRejected counter (exposed as {name}WaiterCapRejected) tracks how many requests were rejected due to the cap. New metrics Metric Type Description {pool}WaiterCapRejected Counter Requests rejected because the waiter cap was reached --------- Signed-off-by: Eduardo Ortega <5791035+ejortegau@users.noreply.github.com> Co-authored-by: Claude <svc-devxp-claude@slack-corp.com>
1 parent dd41012 commit 22c9862

8 files changed

Lines changed: 127 additions & 9 deletions

File tree

go/flags/endtoend/vtcombo.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,18 +298,21 @@ Flags:
298298
--queryserver-config-pool-size int query server read pool size, connection pool is used by regular queries (non streaming, not in a transaction) (default 16)
299299
--queryserver-config-query-cache-memory int query server query cache size in bytes, maximum amount of memory to be used for caching. vttablet analyzes every incoming query and generate a query plan, these plans are being cached in a lru cache. This config controls the capacity of the lru cache. (default 33554432)
300300
--queryserver-config-query-pool-timeout duration query server query pool timeout, it is how long vttablet waits for a connection from the query pool. If set to 0 (default) then the overall query timeout is used instead.
301+
--queryserver-config-query-pool-waiter-cap uint query server query pool waiter cap is the maximum number of queries allowed to wait for a connection from the pool. If set to 0 (default) then there is no limit.
301302
--queryserver-config-query-timeout duration query server query timeout, this is the query timeout in vttablet side. If a query takes more than this timeout, it will be killed. (default 30s)
302303
--queryserver-config-schema-change-signal query server schema signal, will signal connected vtgates that schema has changed whenever this is detected. VTGates will need to have -schema_change_signal enabled for this to work (default true)
303304
--queryserver-config-schema-reload-time duration query server schema reload time, how often vttablet reloads schemas from underlying MySQL instance. vttablet keeps table schemas in its own memory and periodically refreshes it from MySQL. This config controls the reload time. (default 30m0s)
304305
--queryserver-config-stream-buffer-size int query server stream buffer size, the maximum number of bytes sent from vttablet for each stream call. It's recommended to keep this value in sync with vtgate's stream_buffer_size. (default 32768)
305306
--queryserver-config-stream-pool-size int query server stream connection pool size, stream pool is used by stream queries: queries that return results to client in a streaming fashion (default 200)
306307
--queryserver-config-stream-pool-timeout duration query server stream pool timeout, it is how long vttablet waits for a connection from the stream pool. If set to 0 (default) then there is no timeout.
308+
--queryserver-config-stream-pool-waiter-cap uint query server stream pool waiter cap is the maximum number of streaming queries allowed to wait for a connection from the pool. If set to 0 (default) then there is no limit.
307309
--queryserver-config-strict-table-acl only allow queries that pass table acl checks
308310
--queryserver-config-terse-errors prevent bind vars from escaping in client error messages
309311
--queryserver-config-transaction-cap int query server transaction cap is the maximum number of transactions allowed to happen at any given point of a time for a single vttablet. E.g. by setting transaction cap to 100, there are at most 100 transactions will be processed by a vttablet and the 101th transaction will be blocked (and fail if it cannot get connection within specified timeout) (default 20)
310312
--queryserver-config-transaction-timeout duration query server transaction timeout, a transaction will be killed if it takes longer than this value (default 30s)
311313
--queryserver-config-truncate-error-len int truncate errors sent to client if they are longer than this value (0 means do not truncate)
312314
--queryserver-config-txpool-timeout duration query server transaction pool timeout, it is how long vttablet waits if tx pool is full (default 1s)
315+
--queryserver-config-txpool-waiter-cap uint query server transaction pool waiter cap is the maximum number of transactions allowed to wait for a connection from the pool. If set to 0 (default) then there is no limit.
313316
--queryserver-config-warn-result-size int query server result size warning threshold, warn if number of rows returned from vttablet for non-streaming queries exceeds this
314317
--queryserver-enable-settings-pool Enable pooling of connections with modified system settings (default true)
315318
--queryserver-enable-views Enable views support in vttablet.

go/flags/endtoend/vttablet.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,18 +288,21 @@ Flags:
288288
--queryserver-config-pool-size int query server read pool size, connection pool is used by regular queries (non streaming, not in a transaction) (default 16)
289289
--queryserver-config-query-cache-memory int query server query cache size in bytes, maximum amount of memory to be used for caching. vttablet analyzes every incoming query and generate a query plan, these plans are being cached in a lru cache. This config controls the capacity of the lru cache. (default 33554432)
290290
--queryserver-config-query-pool-timeout duration query server query pool timeout, it is how long vttablet waits for a connection from the query pool. If set to 0 (default) then the overall query timeout is used instead.
291+
--queryserver-config-query-pool-waiter-cap uint query server query pool waiter cap is the maximum number of queries allowed to wait for a connection from the pool. If set to 0 (default) then there is no limit.
291292
--queryserver-config-query-timeout duration query server query timeout, this is the query timeout in vttablet side. If a query takes more than this timeout, it will be killed. (default 30s)
292293
--queryserver-config-schema-change-signal query server schema signal, will signal connected vtgates that schema has changed whenever this is detected. VTGates will need to have -schema_change_signal enabled for this to work (default true)
293294
--queryserver-config-schema-reload-time duration query server schema reload time, how often vttablet reloads schemas from underlying MySQL instance. vttablet keeps table schemas in its own memory and periodically refreshes it from MySQL. This config controls the reload time. (default 30m0s)
294295
--queryserver-config-stream-buffer-size int query server stream buffer size, the maximum number of bytes sent from vttablet for each stream call. It's recommended to keep this value in sync with vtgate's stream_buffer_size. (default 32768)
295296
--queryserver-config-stream-pool-size int query server stream connection pool size, stream pool is used by stream queries: queries that return results to client in a streaming fashion (default 200)
296297
--queryserver-config-stream-pool-timeout duration query server stream pool timeout, it is how long vttablet waits for a connection from the stream pool. If set to 0 (default) then there is no timeout.
298+
--queryserver-config-stream-pool-waiter-cap uint query server stream pool waiter cap is the maximum number of streaming queries allowed to wait for a connection from the pool. If set to 0 (default) then there is no limit.
297299
--queryserver-config-strict-table-acl only allow queries that pass table acl checks
298300
--queryserver-config-terse-errors prevent bind vars from escaping in client error messages
299301
--queryserver-config-transaction-cap int query server transaction cap is the maximum number of transactions allowed to happen at any given point of a time for a single vttablet. E.g. by setting transaction cap to 100, there are at most 100 transactions will be processed by a vttablet and the 101th transaction will be blocked (and fail if it cannot get connection within specified timeout) (default 20)
300302
--queryserver-config-transaction-timeout duration query server transaction timeout, a transaction will be killed if it takes longer than this value (default 30s)
301303
--queryserver-config-truncate-error-len int truncate errors sent to client if they are longer than this value (0 means do not truncate)
302304
--queryserver-config-txpool-timeout duration query server transaction pool timeout, it is how long vttablet waits if tx pool is full (default 1s)
305+
--queryserver-config-txpool-waiter-cap uint query server transaction pool waiter cap is the maximum number of transactions allowed to wait for a connection from the pool. If set to 0 (default) then there is no limit.
303306
--queryserver-config-warn-result-size int query server result size warning threshold, warn if number of rows returned from vttablet for non-streaming queries exceeds this
304307
--queryserver-enable-settings-pool Enable pooling of connections with modified system settings (default true)
305308
--queryserver-enable-views Enable views support in vttablet.

go/pools/smartconnpool/pool.go

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package smartconnpool
1818

1919
import (
2020
"context"
21+
"errors"
2122
"math/rand/v2"
2223
"sync"
2324
"sync/atomic"
@@ -39,6 +40,9 @@ var (
3940
// ErrConnPoolClosed is returned when trying to get a connection from a closed conn pool
4041
ErrConnPoolClosed = vterrors.New(vtrpcpb.Code_INTERNAL, "connection pool is closed")
4142

43+
// ErrPoolWaiterCapReached is returned when the waiter cap has been reached
44+
ErrPoolWaiterCapReached = vterrors.New(vtrpcpb.Code_RESOURCE_EXHAUSTED, "connection pool waiter cap reached")
45+
4246
// PoolCloseTimeout is how long to wait for all connections to be returned to the pool during close
4347
PoolCloseTimeout = 10 * time.Second
4448
)
@@ -52,6 +56,7 @@ type Metrics struct {
5256
idleClosed atomic.Int64
5357
diffSetting atomic.Int64
5458
resetSetting atomic.Int64
59+
waiterCapRejected atomic.Int64
5560
}
5661

5762
func (m *Metrics) MaxLifetimeClosed() int64 {
@@ -86,6 +91,10 @@ func (m *Metrics) ResetSettingCount() int64 {
8691
return m.resetSetting.Load()
8792
}
8893

94+
func (m *Metrics) WaiterCapRejected() int64 {
95+
return m.waiterCapRejected.Load()
96+
}
97+
8998
type Connector[C Connection] func(ctx context.Context) (C, error)
9099
type RefreshCheck func() (bool, error)
91100

@@ -94,6 +103,7 @@ type Config[C Connection] struct {
94103
IdleTimeout time.Duration
95104
MaxLifetime time.Duration
96105
RefreshInterval time.Duration
106+
MaxWaiters uint
97107
LogWait func(time.Time)
98108
}
99109

@@ -144,6 +154,9 @@ type ConnPool[C Connection] struct {
144154
refreshInterval atomic.Int64
145155
// logWait is called every time a client must block waiting for a connection
146156
logWait func(time.Time)
157+
// maxWaiters is the maximum number of clients that can be waiting for a connection;
158+
// 0 means unlimited
159+
maxWaiters uint
147160
}
148161

149162
Metrics Metrics
@@ -159,7 +172,14 @@ func NewPool[C Connection](config *Config[C]) *ConnPool[C] {
159172
pool.config.idleTimeout.Store(config.IdleTimeout.Nanoseconds())
160173
pool.config.refreshInterval.Store(config.RefreshInterval.Nanoseconds())
161174
pool.config.logWait = config.LogWait
175+
pool.config.maxWaiters = config.MaxWaiters
162176
pool.wait.init()
177+
pool.wait.onWait = func() {
178+
pool.Metrics.waitCount.Add(1)
179+
}
180+
pool.wait.onWaiterCapReached = func() {
181+
pool.Metrics.waiterCapRejected.Add(1)
182+
}
163183

164184
return pool
165185
}
@@ -353,8 +373,7 @@ func (pool *ConnPool[D]) RefreshInterval() time.Duration {
353373
return time.Duration(pool.config.refreshInterval.Load())
354374
}
355375

356-
func (pool *ConnPool[C]) recordWait(start time.Time) {
357-
pool.Metrics.waitCount.Add(1)
376+
func (pool *ConnPool[C]) recordWaitDuration(start time.Time) {
358377
pool.Metrics.waitTime.Add(time.Since(start).Nanoseconds())
359378
if pool.config.logWait != nil {
360379
pool.config.logWait(start)
@@ -571,11 +590,14 @@ func (pool *ConnPool[C]) get(ctx context.Context) (*Pooled[C], error) {
571590
return nil, ErrConnPoolClosed
572591
}
573592

574-
conn, err = pool.wait.waitForConn(ctx, nil, *closeChan)
593+
conn, err = pool.wait.waitForConn(ctx, nil, *closeChan, pool.config.maxWaiters)
575594
if err != nil {
595+
if errors.Is(err, ErrPoolWaiterCapReached) {
596+
return nil, err
597+
}
576598
return nil, ErrTimeout
577599
}
578-
pool.recordWait(start)
600+
pool.recordWaitDuration(start)
579601
}
580602
// no connections available and no connections to wait for (pool is closed)
581603
if conn == nil {
@@ -634,11 +656,14 @@ func (pool *ConnPool[C]) getWithSetting(ctx context.Context, setting *Setting) (
634656
return nil, ErrConnPoolClosed
635657
}
636658

637-
conn, err = pool.wait.waitForConn(ctx, setting, *closeChan)
659+
conn, err = pool.wait.waitForConn(ctx, setting, *closeChan, pool.config.maxWaiters)
638660
if err != nil {
661+
if errors.Is(err, ErrPoolWaiterCapReached) {
662+
return nil, err
663+
}
639664
return nil, ErrTimeout
640665
}
641-
pool.recordWait(start)
666+
pool.recordWaitDuration(start)
642667
}
643668
// no connections available and no connections to wait for (pool is closed)
644669
if conn == nil {
@@ -873,6 +898,9 @@ func (pool *ConnPool[C]) RegisterStats(stats *servenv.Exporter, name string) {
873898
stats.NewCounterFunc(name+"GetSetting", "Tablet server conn pool get with setting count", func() int64 {
874899
return pool.Metrics.GetSettingCount()
875900
})
901+
stats.NewCounterFunc(name+"WaiterCapRejected", "Tablet server conn pool waiter cap rejected", func() int64 {
902+
return pool.Metrics.WaiterCapRejected()
903+
})
876904
stats.NewCounterFunc(name+"DiffSetting", "Number of times pool applied different setting", func() int64 {
877905
return pool.Metrics.DiffSettingCount()
878906
})

go/pools/smartconnpool/waitlist.go

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,21 +39,60 @@ type waitlist[C Connection] struct {
3939
nodes sync.Pool
4040
mu sync.Mutex
4141
list list.List[waiter[C]]
42+
// onWait is called when a client gets to the point in which it is waiting for a connection - or the mutex that it needs to grab to wait for a connection.
43+
onWait func()
44+
// onWaiterCapReached is called when the waitlist has reached its maximum capacity.
45+
onWaiterCapReached func()
4246
}
4347

4448
// waitForConn blocks until a connection with the given Setting is returned by another client,
4549
// or until the given context expires.
50+
// If maxWaiters is > 0 and the waitlist already has that many waiters, it returns
51+
// ErrPoolWaiterCapReached immediately without blocking.
4652
// The returned connection may _not_ have the requested Setting. This function can
4753
// also return a `nil` connection even if our context has expired, if the pool has
4854
// forced an expiration of all waiters in the waitlist.
49-
func (wl *waitlist[C]) waitForConn(ctx context.Context, setting *Setting, closeChan <-chan struct{}) (*Pooled[C], error) {
55+
func (wl *waitlist[C]) waitForConn(ctx context.Context, setting *Setting, closeChan <-chan struct{}, maxWaiters uint) (*Pooled[C], error) {
5056
elem := wl.nodes.Get().(*list.Element[waiter[C]])
5157
defer wl.nodes.Put(elem)
5258

5359
elem.Value = waiter[C]{conn: elem.Value.conn, setting: setting}
5460

61+
// Fast path: reject early using an atomic read of the list length to avoid
62+
// contending on the mutex under high query rates. This is racy — the count
63+
// can change between this check and the lock acquisition — so we re-check
64+
// under the lock below for correctness. Still, we expect to reject most
65+
// requests early here when under a heavy load.
66+
//
67+
// We do this here rather than further upstream (e.g. in ConnPool.Get) because
68+
// callers only reach waitForConn after exhausting all other options (idle
69+
// connections, new connections, settings stacks). There is no point in checking
70+
// there when those requests can still get a connection without waiting. The cap
71+
// is just for waiting.
72+
if wl.aboveWaiterCap(maxWaiters) {
73+
if wl.onWaiterCapReached != nil {
74+
wl.onWaiterCapReached()
75+
}
76+
return nil, ErrPoolWaiterCapReached
77+
}
78+
79+
// If we reach this point, we are waiting, at the very least on the mutex, likely
80+
// on the connection. So call onWait which takes care of recording the wait.
81+
if wl.onWait != nil {
82+
wl.onWait()
83+
}
84+
5585
wl.mu.Lock()
56-
// add ourselves as a waiter at the end of the waitlist
86+
// Strict check: the list length may have changed since the lockless check
87+
// above, so we verify again while holding the lock to guarantee the cap is
88+
// never exceeded.
89+
if wl.aboveWaiterCap(maxWaiters) {
90+
wl.mu.Unlock()
91+
if wl.onWaiterCapReached != nil {
92+
wl.onWaiterCapReached()
93+
}
94+
return nil, ErrPoolWaiterCapReached
95+
}
5796
wl.list.PushBackValue(elem)
5897
wl.mu.Unlock()
5998

@@ -110,6 +149,10 @@ func (wl *waitlist[C]) waitForConn(ctx context.Context, setting *Setting, closeC
110149
}
111150
}
112151

152+
func (wl *waitlist[C]) aboveWaiterCap(maxWaiters uint) bool {
153+
return maxWaiters > 0 && wl.list.Len() >= int(maxWaiters)
154+
}
155+
113156
func (wl *waitlist[C]) maybeStarvingCount() (maybeStarving int) {
114157
if wl.list.Len() == 0 {
115158
return

go/pools/smartconnpool/waitlist_test.go

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ func TestWaitlistPoolCloseWithMultipleWaiters(t *testing.T) {
4040

4141
for i := 0; i < waiterCount; i++ {
4242
go func() {
43-
_, err := wait.waitForConn(ctx, nil, poolClose)
43+
_, err := wait.waitForConn(ctx, nil, poolClose, 0)
4444

4545
if err != nil {
4646
expireCount.Add(1)
@@ -68,3 +68,34 @@ func TestWaitlistPoolCloseWithMultipleWaiters(t *testing.T) {
6868

6969
assert.Equal(t, int32(waiterCount), expireCount.Load())
7070
}
71+
72+
func TestWaitlistWaiterCap(t *testing.T) {
73+
wl := waitlist[*TestConn]{}
74+
wl.init()
75+
76+
poolClose := make(chan struct{})
77+
78+
const maxWaiters = 3
79+
80+
errs := make(chan error, maxWaiters)
81+
for i := 1; i <= maxWaiters; i++ {
82+
go func() {
83+
_, err := wl.waitForConn(context.Background(), nil, poolClose, maxWaiters)
84+
errs <- err
85+
}()
86+
87+
assert.Eventually(t, func() bool {
88+
return wl.waiting() == i
89+
}, time.Second, 5*time.Millisecond)
90+
}
91+
92+
_, err := wl.waitForConn(context.Background(), nil, poolClose, maxWaiters)
93+
assert.ErrorIs(t, err, ErrPoolWaiterCapReached)
94+
assert.Equal(t, maxWaiters, wl.waiting())
95+
96+
close(poolClose)
97+
98+
for i := 0; i < maxWaiters; i++ {
99+
assert.NotErrorIs(t, <-errs, ErrPoolWaiterCapReached)
100+
}
101+
}

go/vt/vttablet/tabletserver/connpool/pool.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ func NewPool(env tabletenv.Env, name string, cfg tabletenv.ConnPoolConfig) *Pool
7171
IdleTimeout: cfg.IdleTimeout,
7272
MaxLifetime: cfg.MaxLifetime,
7373
RefreshInterval: mysqlctl.PoolDynamicHostnameResolution,
74+
MaxWaiters: cfg.MaxWaiters,
7475
}
7576

7677
if name != "" {

0 commit comments

Comments (0)