From 95849c9f8c3fc711ce7f0d91aa802433b07e4bb8 Mon Sep 17 00:00:00 2001
From: Taeknology <20297177+Taeknology@users.noreply.github.com>
Date: Tue, 19 May 2026 18:03:56 +0900
Subject: [PATCH] Deflake diskless timeout rdb pipe test by tolerating either
 timeout-disconnect branch

The "diskless timeout replicas drop during rdb pipe" subcase in
tests/integration/replication.tcl waits for a "(full sync)" timeout
disconnect log. On test-macos-latest the disconnect can surface on the
"(streaming sync)" branch instead, because the RDB child exits and
clears server.rdb_child_type (src/rdb.c:3698) in the same serverCron
tick as the disconnect loop (src/replication.c:5348), closing the
(full sync) window before that branch can fire. The timed-out replica
is by then already promoted via replicaPutOnline(), so it falls into
the (streaming sync) branch (src/replication.c:5357-5364). Linux CI does
not hit this because SIGSTOP back-pressure keeps the RDB child blocked
on the pipe write long enough that backgroundSaveDoneHandler does not
run in the same tick.

Accept either timeout-disconnect message and assert exactly one such
disconnect occurred, so the test still rejects a regression that emits
zero or multiple timeouts. This follows the catch+fallback convention
already used in the same test for the "all" and "slow" subcases
(replication.tcl:1011-1036).

Also address CodeRabbit feedback by adding a descriptive failure
message to the `assert_equal 1 [count_log_message ...]` guard so a
regression that emits zero or multiple timeout-disconnect log lines
surfaces with context instead of a bare `Expected '1' to be equal
to '0'`.

Fixes #3686.

Signed-off-by: Taeknology <20297177+Taeknology@users.noreply.github.com>
---
 tests/integration/replication.tcl | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl
index 7690d1fd49d..25b49bd1135 100644
--- a/tests/integration/replication.tcl
+++ b/tests/integration/replication.tcl
@@ -973,8 +973,30 @@ start_server {tags {"repl external:skip"} overrides {save ""}} {
                         # Let one replica hit repl-timeout while the slow reader
                         # is paused, then restore a generous timeout so the
                         # remaining replica can finish the streamed RDB.
+                        #
+                        # The disconnect can land in either of two branches of the
+                        # replica timeout check run from serverCron (see the
+                        # "Disconnecting timedout replica" emitters in src/replication.c):
+                        #   - "(full sync)"      WAIT_BGSAVE_END + rdb_child_type == SOCKET
+                        #   - "(streaming sync)" REPLICA_STATE_ONLINE
+                        # On some platforms (notably macOS CI) the RDB child can
+                        # exit and clear rdb_child_type in the same serverCron
+                        # tick as the disconnect check, closing the (full sync)
+                        # window; the timed-out replica is by then already
+                        # promoted via replicaPutOnline() and the disconnect
+                        # surfaces on the (streaming sync) path instead. Both
+                        # are legitimate timeout-driven disconnects.
                         $master config set repl-timeout 2
-                        wait_for_log_messages -2 {"*Disconnecting timedout replica (full sync)*"} $loglines 100 100
+                        if {[catch {
+                            wait_for_log_messages -2 {"*Disconnecting timedout replica (full sync)*"} $loglines 100 100
+                        }]} {
+                            wait_for_log_messages -2 {"*Disconnecting timedout replica (streaming sync)*"} $loglines 100 100
+                        }
+                        # Guard against silently broadening the assertion: the
+                        # slow replica must time out exactly once across both
+                        # branches in this subcase.
+                        assert_equal 1 [count_log_message -2 "Disconnecting timedout replica"] \
+                            "expected exactly one 'Disconnecting timedout replica' log entry (full sync or streaming sync) for the slow replica"
                         $master config set repl-timeout 60
                     }