Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 74 additions & 25 deletions tests/unit/cluster/clusterscan.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -582,30 +582,79 @@ start_cluster 2 0 {tags {external:skip cluster}} {
}
}

# CLUSTERSCAN CLUSTERDOWN test - separate cluster to test unassigned slots
start_cluster 2 0 {tags {external:skip cluster}} {
test "CLUSTERSCAN returns CLUSTERDOWN for unassigned slot" {
# This test covers the case when a slot is not served by any node.
# When a cursor pointing to that slot is used we would get -CLUSTERDOWN
# This helps with error handling rather than a crash or silent failure.
set cursor_slot_0 ""
set slot0_owner -1
foreach n {0 1} {
if {[catch {R $n clusterscan 0-{06S}-0 SLOT 0} res] == 0} {
set cursor_slot_0 [lindex $res 0]
Copy link
Copy Markdown
Member Author

@enjoy-binbin enjoy-binbin May 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the old code this call actually returns {0 {}}, so cursor_slot_0 is 0. As a result, the wait condition is really checking `R 0 clusterscan 0` (the plain start cursor) instead of a cursor that points at slot 0.

I'm wondering if we should return "cluster down" for this. See the other PR (#3675) for more info

127.0.0.1:30001> clusterscan 0
1) "0-{06S}-0"
2) (empty array)
127.0.0.1:30001> clusterscan 0-{06S}-0
(error) CLUSTERDOWN The cluster is down

set slot0_owner $n
break
}
}

R $slot0_owner CLUSTER DELSLOTS 0
set other_node [expr {1 - $slot0_owner}]
catch {R $other_node CLUSTER DELSLOTS 0}

wait_for_condition 1000 50 {
[catch {R $slot0_owner clusterscan $cursor_slot_0} res] && [string match "*CLUSTERDOWN*" $res]
} else {
fail "Expected CLUSTERDOWN error"
}
start_cluster 3 0 {tags {external:skip cluster}} {
    test "CLUSTERSCAN returns correct errors on cluster down and unassigned slots" {
        # Two fixed cursors are used throughout this test.
        # Hashtag reference: {06S} -> slot 0 -> R0, {6ZJ} -> slot 16383 -> R2.
        set slot0_cursor "0-{06S}-0"
        set last_slot_cursor "0-{6ZJ}-0"

        # Case 1: pause node 0 so the cluster goes into FAIL state.
        # cluster-require-full-coverage is still "yes" (the default), so every
        # CLUSTERSCAN on the surviving nodes is expected to fail with
        # CLUSTERDOWN, whichever slot the cursor points at.
        pause_process [srv 0 pid]
        wait_for_cluster_state fail
        foreach node_id {1 2} {
            foreach scan_cursor [list $slot0_cursor $last_slot_cursor] {
                assert_error {CLUSTERDOWN The cluster is down} {R $node_id clusterscan $scan_cursor}
            }
        }

        # Case 2: node 0 is still paused, but the full-coverage requirement is
        # turned off on the surviving nodes. They report the cluster as "ok"
        # even though node 0's slots are unreachable.
        foreach node_id {1 2} {
            R $node_id config set cluster-require-full-coverage no
        }
        wait_for_cluster_state ok
        # Slot 0 is still assigned to node 0, so the other nodes redirect.
        foreach node_id {1 2} {
            assert_error {MOVED 0 *} {R $node_id clusterscan $slot0_cursor}
        }
        # Slot 16383 is assigned to node 2: node 1 redirects, while node 2
        # serves the scan itself.
        assert_error {MOVED 16383 *} {R 1 clusterscan $last_slot_cursor}
        assert_equal [R 2 clusterscan $last_slot_cursor] {0 {}}

        # Put the full-coverage requirement back and resume node 0.
        foreach node_id {1 2} {
            R $node_id config set cluster-require-full-coverage yes
        }
        resume_process [srv 0 pid]
        wait_for_cluster_state ok

        # Case 3: remove slot 0 from every node so that it is unassigned.
        # With full-coverage=yes the cluster goes into FAIL state.
        # A cursor for the unassigned slot should report "Hash slot not
        # served"; a cursor for a slot that is still assigned should report
        # "The cluster is down".
        R 0 CLUSTER DELSLOTS 0
        # The DELSLOTS calls on the other nodes are best effort, so their
        # errors are ignored.
        foreach node_id {1 2} {
            catch {R $node_id CLUSTER DELSLOTS 0}
        }
        wait_for_cluster_state fail

        # Unassigned slot -> the more specific error, on every node.
        foreach node_id {0 1 2} {
            assert_error {CLUSTERDOWN Hash slot not served} {R $node_id clusterscan $slot0_cursor}
        }

        # Slot 16383 is still assigned, but the cluster is in FAIL state.
        foreach node_id {0 1 2} {
            assert_error {CLUSTERDOWN The cluster is down} {R $node_id clusterscan $last_slot_cursor}
        }

        # Case 4: turn the full-coverage requirement off again while slot 0 is
        # still unassigned. The cluster reports "ok", yet slot 0 has no owner.
        # A cursor for slot 0 should keep reporting "Hash slot not served";
        # a cursor for an assigned but remote slot should now get MOVED.
        foreach node_id {0 1 2} {
            R $node_id config set cluster-require-full-coverage no
        }
        wait_for_cluster_state ok

        # Slot 0: unassigned -> "Hash slot not served" regardless of node.
        foreach node_id {0 1 2} {
            assert_error {CLUSTERDOWN Hash slot not served} {R $node_id clusterscan $slot0_cursor}
        }

        # Slot 16383: nodes that do not own it redirect to the owner ...
        foreach node_id {0 1} {
            assert_error {MOVED 16383 *} {R $node_id clusterscan $last_slot_cursor}
        }
        # ... and node 2, which owns it, serves the scan itself.
        assert_equal [R 2 clusterscan $last_slot_cursor] {0 {}}
    }
}
Loading