Skip to content

Commit 728956c

Browse files
authored
Fix timing issues in flaky scaledown tests (#189)
Terminate a clusterclient directly instead of first closing its client socket. This prevents hiredis-cluster from successfully reconnecting to the server before the server has shut down. Closing simulated-redis's listener socket early improves stability. Add a check for alternative output, depending on timing, in the following testcases: cluster-scale-down-test.sh, dbsize-to-all-nodes-during-scaledown-test.sh, dbsize-to-all-nodes-during-scaledown-test-async.sh
1 parent 77bd2bb commit 728956c

4 files changed

+45
-15
lines changed

Diff for: tests/scripts/cluster-scale-down-test.sh

+12-5
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/bin/sh
1+
#!/bin/bash
22
#
33
# To verify the clients behaviour in a cluster scaledown scenario.
44
# The testcase will send commands, all targeting hash slot 12182, while removing
@@ -64,7 +64,6 @@ EXPECT CONNECT
6464
EXPECT ["GET", "{foo}1"]
6565
SEND "bar1"
6666
# Forced close. The next command "GET {foo}2" will fail.
67-
CLOSE
6867
EOF
6968
server2=$!
7069

@@ -97,12 +96,20 @@ if [ $clientexit -ne 0 ]; then
9796
exit $clientexit
9897
fi
9998

100-
# Check the output from clusterclient
101-
expected="bar1
99+
# Check the output from clusterclient, which depends on timing.
100+
# Client sends the command 'GET {foo}2' just after nodeid2 closes its socket.
101+
expected1="bar1
102102
error: Server closed the connection
103103
bar3"
104104

105-
echo "$expected" | diff -u - "$testname.out" || exit 99
105+
# Client sends the command 'GET {foo}2' just before nodeid2 closes its socket.
106+
expected2="bar1
107+
error: Connection reset by peer
108+
bar3"
109+
110+
diff -u "$testname.out" <(echo "$expected1") || \
111+
diff -u "$testname.out" <(echo "$expected2") || \
112+
exit 99
106113

107114
# Clean up
108115
rm "$testname.out"

Diff for: tests/scripts/dbsize-to-all-nodes-during-scaledown-test-async.sh

+17-5
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ EXPECT ["DBSIZE"]
3131
SEND 10
3232
EXPECT ["DBSIZE"]
3333
SEND 11
34-
# The second command to node2 fails which triggers a slotmap update.
34+
# The second command to node #2 fails which triggers a slotmap update.
3535
EXPECT ["CLUSTER", "SLOTS"]
3636
SEND [[0, 16383, ["127.0.0.1", 7401, "nodeid7401"]]]
3737
EXPECT ["DBSIZE"]
@@ -45,7 +45,7 @@ timeout 5s ./simulated-redis.pl -p 7402 -d --sigcont $syncpid2 <<'EOF' &
4545
EXPECT CONNECT
4646
EXPECT ["DBSIZE"]
4747
SEND 20
48-
CLOSE
48+
# Forced close. The second command to this node should trigger a slotmap update.
4949
EOF
5050
server2=$!
5151

@@ -81,14 +81,26 @@ if [ $clientexit -ne 0 ]; then
8181
exit $clientexit
8282
fi
8383

84-
# Check the output from clusterclient
85-
expected="10
84+
# Check the output from clusterclient, which depends on timing.
85+
# Client sends the second 'DBSIZE' to node #2 just after node #2 closes its socket.
86+
expected1="10
8687
20
8788
error: Server closed the connection
8889
11
8990
12"
9091

91-
echo "$expected" | diff -u - "$testname.out" || exit 99
92+
# Client sends the second 'DBSIZE' to node #2 just before node #2 closes its socket.
93+
expected2="10
94+
20
95+
error: Connection reset by peer
96+
11
97+
12"
98+
99+
# The reply "11" from node #1 can come before or after the socket error from node #2.
100+
# Therefore, we sort before comparing.
101+
diff -u <(echo "$expected1" | sort) <(sort "$testname.out") || \
102+
diff -u <(echo "$expected2" | sort) <(sort "$testname.out") || \
103+
exit 99
92104

93105
# Clean up
94106
rm "$testname.out"

Diff for: tests/scripts/dbsize-to-all-nodes-during-scaledown-test.sh

+15-5
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ EXPECT ["DBSIZE"]
3333
SEND 11
3434
EXPECT ["DBSIZE"]
3535
SEND 12
36-
# The second command to node2 fails which triggers a slotmap update pipelined
36+
# The second command to node #2 fails which triggers a slotmap update pipelined
3737
# onto the 3rd DBSIZE to this node.
3838
EXPECT ["CLUSTER", "SLOTS"]
3939
SEND [[0, 16383, ["127.0.0.1", 7401, "nodeid7401"]]]
@@ -46,7 +46,7 @@ timeout 5s ./simulated-redis.pl -p 7402 -d --sigcont $syncpid2 <<'EOF' &
4646
EXPECT CONNECT
4747
EXPECT ["DBSIZE"]
4848
SEND 20
49-
CLOSE
49+
# Forced close. The second command to this node should trigger a slotmap update.
5050
EOF
5151
server2=$!
5252

@@ -82,14 +82,24 @@ if [ $clientexit -ne 0 ]; then
8282
exit $clientexit
8383
fi
8484

85-
# Check the output from clusterclient
86-
expected="10
85+
# Check the output from clusterclient, which depends on timing.
86+
# Client sends the second 'DBSIZE' to node #2 just after node #2 closes its socket.
87+
expected1="10
8788
20
8889
11
8990
error: Server closed the connection
9091
12"
9192

92-
echo "$expected" | diff -u - "$testname.out" || exit 99
93+
# Client sends the second 'DBSIZE' to node #2 just before node #2 closes its socket.
94+
expected2="10
95+
20
96+
11
97+
error: Connection reset by peer
98+
12"
99+
100+
diff -u "$testname.out" <(echo "$expected1") || \
101+
diff -u "$testname.out" <(echo "$expected2") || \
102+
exit 99
93103

94104
# Clean up
95105
rm "$testname.out"

Diff for: tests/scripts/simulated-redis.pl

+1
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ END
166166
unexpected($port, "event: $_");
167167
}
168168
}
169+
close $listener;
169170
print "(port $port) Done.\n" if $debug;
170171
exit;
171172

0 commit comments

Comments
 (0)