From 95849c9f8c3fc711ce7f0d91aa802433b07e4bb8 Mon Sep 17 00:00:00 2001 From: Taeknology <20297177+Taeknology@users.noreply.github.com> Date: Tue, 19 May 2026 18:03:56 +0900 Subject: [PATCH] Deflake diskless timeout rdb pipe test by tolerating either timeout-disconnect branch The "diskless timeout replicas drop during rdb pipe" subcase in tests/integration/replication.tcl waits for a "(full sync)" timeout-disconnect log message. On test-macos-latest the disconnect can surface on the "(streaming sync)" branch instead, because the RDB child exits and the parent clears server.rdb_child_type (src/rdb.c:3698) in the same serverCron tick as the disconnect loop (src/replication.c:5348), closing the (full sync) window before that disconnect can fire. The timed-out replica is by then already promoted via replicaPutOnline(), so it falls into the (streaming sync) branch (src/replication.c:5357-5364). Linux CI does not hit this because SIGSTOP back-pressure keeps the RDB child blocked on the pipe write long enough that backgroundSaveDoneHandler does not run in the same tick. Accept either timeout-disconnect message and assert exactly one such disconnect occurred, so the test still rejects a regression that emits zero or multiple timeouts. This follows the existing catch+fallback convention already used in the same test for "all" and "slow" subcases (replication.tcl:1011-1036). Also addresses CodeRabbit feedback by adding a descriptive failure message to the `assert_equal 1 [count_log_message ...]` guard so a regression that emits zero or multiple timeout-disconnect log lines surfaces with context instead of a bare `Expected '1' to be equal to '0'`. Fixes #3686. 
Signed-off-by: Taeknology <20297177+Taeknology@users.noreply.github.com> --- tests/integration/replication.tcl | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index 7690d1fd49d..25b49bd1135 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -973,8 +973,30 @@ start_server {tags {"repl external:skip"} overrides {save ""}} { # Let one replica hit repl-timeout while the slow reader # is paused, then restore a generous timeout so the # remaining replica can finish the streamed RDB. + # + # The disconnect can land in either of two branches in + # replication.c serverCron (see src/replication.c around + # the "Disconnecting timedout replica" emitters): + # - "(full sync)" WAIT_BGSAVE_END + rdb_child_type == SOCKET + # - "(streaming sync)" REPLICA_STATE_ONLINE + # On some platforms (notably macOS CI) the RDB child can + # exit and clear rdb_child_type in the same serverCron + # tick as the disconnect check, closing the (full sync) + # window; the timed-out replica is by then already + # promoted via replicaPutOnline() and the disconnect + # surfaces on the (streaming sync) path instead. Both + # are legitimate timeout-driven disconnects. $master config set repl-timeout 2 - wait_for_log_messages -2 {"*Disconnecting timedout replica (full sync)*"} $loglines 100 100 + if {[catch { + wait_for_log_messages -2 {"*Disconnecting timedout replica (full sync)*"} $loglines 100 100 + }]} { + wait_for_log_messages -2 {"*Disconnecting timedout replica (streaming sync)*"} $loglines 100 100 + } + # Guard against silently broadening the assertion: the + # slow replica must time out exactly once across both + # branches in this subcase. 
+ assert_equal 1 [count_log_message -2 "Disconnecting timedout replica"] \ + "expected exactly one 'Disconnecting timedout replica' log entry (full sync or streaming sync) for the slow replica" $master config set repl-timeout 60 }