
Commit ac7d3e9

Merge pull request #19451 from fuweid/add-downgrade-status
*: support DowngradeInfo field in maintenance.Status API
2 parents 49f34c9 + 65159a2

File tree: 10 files changed (+733, -349 lines)

Documentation/dev-guide/apispec/swagger/rpc.swagger.json

Lines changed: 17 additions & 0 deletions
@@ -2135,6 +2135,19 @@
         }
       }
     },
+    "etcdserverpbDowngradeInfo": {
+      "type": "object",
+      "properties": {
+        "enabled": {
+          "type": "boolean",
+          "description": "enabled indicates whether the cluster is enabled to downgrade."
+        },
+        "targetVersion": {
+          "type": "string",
+          "description": "targetVersion is the target downgrade version."
+        }
+      }
+    },
     "etcdserverpbDowngradeRequest": {
       "type": "object",
       "properties": {
@@ -2840,6 +2853,10 @@
           "type": "string",
           "format": "int64",
           "title": "dbSizeQuota is the configured etcd storage quota in bytes (the value passed to etcd instance by flag --quota-backend-bytes)"
+        },
+        "downgradeInfo": {
+          "$ref": "#/definitions/etcdserverpbDowngradeInfo",
+          "description": "downgradeInfo indicates if there is downgrade process."
         }
       }
     },
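Through the gRPC gateway this schema serializes as plain JSON. A trimmed Status response carrying the new object might look roughly like the following excerpt (all values illustrative; int64 fields arrive as strings, per the "format": "int64" convention above):

{
  "version": "3.6.0",
  "dbSizeQuota": "2147483648",
  "downgradeInfo": {
    "enabled": true,
    "targetVersion": "3.5.0"
  }
}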

api/etcdserverpb/rpc.pb.go

Lines changed: 610 additions & 323 deletions
Some generated files are not rendered by default.

api/etcdserverpb/rpc.proto

Lines changed: 9 additions & 0 deletions
@@ -1198,6 +1198,15 @@ message StatusResponse {
   string storageVersion = 11 [(versionpb.etcd_version_field)="3.6"];
   // dbSizeQuota is the configured etcd storage quota in bytes (the value passed to etcd instance by flag --quota-backend-bytes)
   int64 dbSizeQuota = 12 [(versionpb.etcd_version_field)="3.6"];
+  // downgradeInfo indicates if there is downgrade process.
+  DowngradeInfo downgradeInfo = 13 [(versionpb.etcd_version_field)="3.6"];
+}
+
+message DowngradeInfo {
+  // enabled indicates whether the cluster is enabled to downgrade.
+  bool enabled = 1;
+  // targetVersion is the target downgrade version.
+  string targetVersion = 2;
 }

 message AuthEnableRequest {
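On the client side, the new field simply rides along with the existing Maintenance Status call. A minimal sketch, not part of this commit, assuming a 3.6 server at an illustrative local endpoint:

package main

import (
    "context"
    "fmt"
    "log"
    "time"

    clientv3 "go.etcd.io/etcd/client/v3"
)

func main() {
    cli, err := clientv3.New(clientv3.Config{
        Endpoints:   []string{"127.0.0.1:2379"}, // illustrative endpoint
        DialTimeout: 3 * time.Second,
    })
    if err != nil {
        log.Fatal(err)
    }
    defer cli.Close()

    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
    defer cancel()

    resp, err := cli.Status(ctx, "127.0.0.1:2379")
    if err != nil {
        log.Fatal(err)
    }
    // The generated getters are nil-safe, so this also behaves sanely
    // against servers that predate the field.
    fmt.Println("downgrade enabled:", resp.DowngradeInfo.GetEnabled())
    fmt.Println("downgrade target:", resp.DowngradeInfo.GetTargetVersion())
}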

etcdctl/ctlv3/command/printer.go

Lines changed: 4 additions & 1 deletion
@@ -17,6 +17,7 @@ package command
 import (
     "errors"
     "fmt"
+    "strconv"
     "strings"

     "github.com/dustin/go-humanize"
@@ -220,7 +221,7 @@ func makeEndpointHealthTable(healthList []epHealth) (hdr []string, rows [][]stri
 func makeEndpointStatusTable(statusList []epStatus) (hdr []string, rows [][]string) {
     hdr = []string{
         "endpoint", "ID", "version", "storage version", "db size", "in use", "percentage not in use", "quota", "is leader", "is learner", "raft term",
-        "raft index", "raft applied index", "errors",
+        "raft index", "raft applied index", "errors", "downgrade target version", "downgrade enabled",
     }
     for _, status := range statusList {
         rows = append(rows, []string{
@@ -238,6 +239,8 @@ func makeEndpointStatusTable(statusList []epStatus) (hdr []string, rows [][]stri
             fmt.Sprint(status.Resp.RaftIndex),
             fmt.Sprint(status.Resp.RaftAppliedIndex),
             fmt.Sprint(strings.Join(status.Resp.Errors, ", ")),
+            status.Resp.DowngradeInfo.GetTargetVersion(),
+            strconv.FormatBool(status.Resp.DowngradeInfo.GetEnabled()),
         })
     }
     return hdr, rows

etcdctl/ctlv3/command/printer_fields.go

Lines changed: 2 additions & 0 deletions
@@ -203,6 +203,8 @@ func (p *fieldsPrinter) EndpointStatus(eps []epStatus) {
         fmt.Println(`"RaftAppliedIndex" :`, ep.Resp.RaftAppliedIndex)
         fmt.Println(`"Errors" :`, ep.Resp.Errors)
         fmt.Printf("\"Endpoint\" : %q\n", ep.Ep)
+        fmt.Printf("\"DowngradeTargetVersion\" : %q\n", ep.Resp.DowngradeInfo.GetTargetVersion())
+        fmt.Println(`"DowngradeEnabled" :`, ep.Resp.DowngradeInfo.GetEnabled())
         fmt.Println()
     }
 }
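Given the Printf/Println calls above, each endpoint's block from `etcdctl endpoint status -w fields` should now end roughly like this (values illustrative):

"RaftAppliedIndex" : 10
"Errors" : []
"Endpoint" : "127.0.0.1:2379"
"DowngradeTargetVersion" : "3.5.0"
"DowngradeEnabled" : true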

scripts/etcd_version_annotations.txt

Lines changed: 4 additions & 0 deletions
@@ -150,6 +150,9 @@ etcdserverpb.DeleteRangeResponse: "3.0"
 etcdserverpb.DeleteRangeResponse.deleted: ""
 etcdserverpb.DeleteRangeResponse.header: ""
 etcdserverpb.DeleteRangeResponse.prev_kvs: "3.1"
+etcdserverpb.DowngradeInfo: ""
+etcdserverpb.DowngradeInfo.enabled: ""
+etcdserverpb.DowngradeInfo.targetVersion: ""
 etcdserverpb.DowngradeRequest: "3.5"
 etcdserverpb.DowngradeRequest.CANCEL: ""
 etcdserverpb.DowngradeRequest.DowngradeAction: "3.5"
@@ -382,6 +385,7 @@ etcdserverpb.StatusResponse: "3.0"
 etcdserverpb.StatusResponse.dbSize: ""
 etcdserverpb.StatusResponse.dbSizeInUse: "3.4"
 etcdserverpb.StatusResponse.dbSizeQuota: "3.6"
+etcdserverpb.StatusResponse.downgradeInfo: "3.6"
 etcdserverpb.StatusResponse.errors: "3.4"
 etcdserverpb.StatusResponse.header: ""
 etcdserverpb.StatusResponse.isLearner: "3.4"
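(For context: this file records the etcd version associated with each protobuf message and field, and is expected to stay in sync with the `versionpb.etcd_version_field` options in rpc.proto, which is why the new field appears here as well as in the .proto change above.)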

server/etcdserver/api/v3rpc/maintenance.go

Lines changed: 7 additions & 0 deletions
@@ -262,10 +262,17 @@ func (ms *maintenanceServer) Status(ctx context.Context, ar *pb.StatusRequest) (
         DbSizeInUse:   ms.bg.Backend().SizeInUse(),
         IsLearner:     ms.cs.IsLearner(),
         DbSizeQuota:   ms.cg.Config().QuotaBackendBytes,
+        DowngradeInfo: &pb.DowngradeInfo{Enabled: false},
     }
     if storageVersion := ms.vs.GetStorageVersion(); storageVersion != nil {
         resp.StorageVersion = storageVersion.String()
     }
+    if downgradeInfo := ms.vs.GetDowngradeInfo(); downgradeInfo != nil {
+        resp.DowngradeInfo = &pb.DowngradeInfo{
+            Enabled:       downgradeInfo.Enabled,
+            TargetVersion: downgradeInfo.TargetVersion,
+        }
+    }
     if resp.Leader == raft.None {
         resp.Errors = append(resp.Errors, errors.ErrNoLeader.Error())
     }
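Worth noting as a design choice: the handler seeds DowngradeInfo with Enabled set to false and only overwrites it when ms.vs.GetDowngradeInfo() reports an active downgrade, so every StatusResponse from a 3.6 server carries an explicit non-nil DowngradeInfo rather than leaving clients to distinguish "disabled" from "unset".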

tests/e2e/cluster_downgrade_test.go

Lines changed: 21 additions & 22 deletions
@@ -25,6 +25,7 @@ import (
     "github.com/stretchr/testify/assert"
     "github.com/stretchr/testify/require"

+    pb "go.etcd.io/etcd/api/v3/etcdserverpb"
     "go.etcd.io/etcd/api/v3/version"
     "go.etcd.io/etcd/client/pkg/v3/fileutil"
     "go.etcd.io/etcd/client/pkg/v3/types"
@@ -51,6 +52,10 @@ func TestDowngradeUpgradeClusterOf1(t *testing.T) {
     testDowngradeUpgrade(t, 1, 1, false, noCancellation)
 }

+func TestDowngradeUpgrade2InClusterOf3(t *testing.T) {
+    testDowngradeUpgrade(t, 2, 3, false, noCancellation)
+}
+
 func TestDowngradeUpgradeClusterOf3(t *testing.T) {
     testDowngradeUpgrade(t, 3, 3, false, noCancellation)
 }
@@ -128,6 +133,9 @@ func testDowngradeUpgrade(t *testing.T, numberOfMembersToDowngrade int, clusterS
         time.Sleep(etcdserver.HealthInterval)
     }

+    t.Log("Downgrade should be disabled")
+    e2e.ValidateDowngradeInfo(t, epc, &pb.DowngradeInfo{Enabled: false})
+
     t.Log("Adding member to test membership, but a learner avoid breaking quorum")
     resp, err := cc.MemberAddAsLearner(context.Background(), "fake1", []string{"http://127.0.0.1:1001"})
     require.NoError(t, err)
@@ -150,6 +158,10 @@ func testDowngradeUpgrade(t *testing.T, numberOfMembersToDowngrade int, clusterS
         return // No need to perform downgrading, end the test here
     }
     e2e.DowngradeEnable(t, epc, lastVersion)
+
+    t.Log("Downgrade should be enabled")
+    e2e.ValidateDowngradeInfo(t, epc, &pb.DowngradeInfo{Enabled: true, TargetVersion: lastClusterVersion.String()})
+
     if triggerCancellation == cancelRightAfterEnable {
         t.Logf("Cancelling downgrade right after enabling (no node is downgraded yet)")
         e2e.DowngradeCancel(t, epc)
@@ -165,7 +177,7 @@ func testDowngradeUpgrade(t *testing.T, numberOfMembersToDowngrade int, clusterS
     err = e2e.DowngradeUpgradeMembersByID(t, nil, epc, membersToChange, currentVersion, lastClusterVersion)
     require.NoError(t, err)
     if len(membersToChange) == len(epc.Procs) {
-        e2e.AssertProcessLogs(t, leader(t, epc), "the cluster has been downgraded")
+        e2e.AssertProcessLogs(t, epc.Procs[epc.WaitLeader(t)], "the cluster has been downgraded")
     }

     t.Log("Downgrade complete")
@@ -202,6 +214,14 @@ func testDowngradeUpgrade(t *testing.T, numberOfMembersToDowngrade int, clusterS
     require.NoError(t, err)
     t.Log("Upgrade complete")

+    if triggerCancellation == noCancellation && numberOfMembersToDowngrade < clusterSize {
+        t.Log("Downgrade should be still enabled")
+        e2e.ValidateDowngradeInfo(t, epc, &pb.DowngradeInfo{Enabled: true, TargetVersion: lastClusterVersion.String()})
+    } else {
+        t.Log("Downgrade should be disabled")
+        e2e.ValidateDowngradeInfo(t, epc, &pb.DowngradeInfo{Enabled: false})
+    }
+
     afterMembers, afterKV = getMembersAndKeys(t, cc)
     assert.Equal(t, beforeKV.Kvs, afterKV.Kvs)
     assert.Equal(t, beforeMembers.Members, afterMembers.Members)
@@ -224,27 +244,6 @@ func newCluster(t *testing.T, clusterSize int, snapshotCount uint64) *e2e.EtcdPr
     return epc
 }

-func leader(t *testing.T, epc *e2e.EtcdProcessCluster) e2e.EtcdProcess {
-    ctx, cancel := context.WithTimeout(context.Background(), time.Second*10)
-    defer cancel()
-    for i := 0; i < len(epc.Procs); i++ {
-        endpoints := epc.Procs[i].EndpointsGRPC()
-        cli, err := clientv3.New(clientv3.Config{
-            Endpoints:   endpoints,
-            DialTimeout: 3 * time.Second,
-        })
-        require.NoError(t, err)
-        defer cli.Close()
-        resp, err := cli.Status(ctx, endpoints[0])
-        require.NoError(t, err)
-        if resp.Header.GetMemberId() == resp.Leader {
-            return epc.Procs[i]
-        }
-    }
-    t.Fatal("Leader not found")
-    return nil
-}
-
 func generateSnapshot(t *testing.T, snapshotCount uint64, cc *e2e.EtcdctlV3) {
     ctx, cancel := context.WithCancel(context.Background())
     defer cancel()

tests/framework/e2e/downgrade.go

Lines changed: 52 additions & 3 deletions
@@ -27,6 +27,7 @@ import (
     "github.com/stretchr/testify/require"
     "go.uber.org/zap"

+    pb "go.etcd.io/etcd/api/v3/etcdserverpb"
     "go.etcd.io/etcd/api/v3/version"
     "go.etcd.io/etcd/tests/v3/framework/testutils"
 )
@@ -46,7 +47,6 @@ func DowngradeEnable(t *testing.T, epc *EtcdProcessCluster, ver *semver.Version)
             Server:  OffsetMinor(ver, 1).String(),
             Storage: ver.String(),
         })
-        AssertProcessLogs(t, epc.Procs[i], "The server is ready to downgrade")
     }

     t.Log("Cluster is ready for downgrade")
@@ -82,6 +82,51 @@ func DowngradeCancel(t *testing.T, epc *EtcdProcessCluster) {
     t.Log("Cluster downgrade cancellation is completed")
 }

+func ValidateDowngradeInfo(t *testing.T, clus *EtcdProcessCluster, expected *pb.DowngradeInfo) {
+    cfg := clus.Cfg
+
+    for i := 0; i < len(clus.Procs); i++ {
+        member := clus.Procs[i]
+        mc := member.Etcdctl()
+        mName := member.Config().Name
+
+        testutils.ExecuteWithTimeout(t, 1*time.Minute, func() {
+            for {
+                statuses, err := mc.Status(context.Background())
+                if err != nil {
+                    cfg.Logger.Warn("failed to get member status and retrying",
+                        zap.Error(err),
+                        zap.String("member", mName))
+
+                    time.Sleep(time.Second)
+                    continue
+                }
+
+                require.Lenf(t, statuses, 1, "member %s", mName)
+                got := (*pb.StatusResponse)(statuses[0]).GetDowngradeInfo()
+
+                if got.GetEnabled() == expected.GetEnabled() && got.GetTargetVersion() == expected.GetTargetVersion() {
+                    cfg.Logger.Info("DowngradeInfo match", zap.String("member", mName))
+                    break
+                }
+
+                cfg.Logger.Warn("DowngradeInfo didn't match retrying",
+                    zap.String("member", mName),
+                    zap.Dict("expected",
+                        zap.Bool("Enabled", expected.GetEnabled()),
+                        zap.String("TargetVersion", expected.GetTargetVersion()),
+                    ),
+                    zap.Dict("got",
+                        zap.Bool("Enabled", got.GetEnabled()),
+                        zap.String("TargetVersion", got.GetTargetVersion()),
+                    ),
+                )
+                time.Sleep(time.Second)
+            }
+        })
+    }
+}
+
 func DowngradeUpgradeMembers(t *testing.T, lg *zap.Logger, clus *EtcdProcessCluster, numberOfMembersToChange int, currentVersion, targetVersion *semver.Version) error {
     membersToChange := rand.Perm(len(clus.Procs))[:numberOfMembersToChange]
     t.Logf("Elect members for operations on members: %v", membersToChange)
@@ -100,7 +145,6 @@ func DowngradeUpgradeMembersByID(t *testing.T, lg *zap.Logger, clus *EtcdProcess
         opString = "downgrading"
         newExecPath = BinPath.EtcdLastRelease
     }
-
     for _, memberID := range membersToChange {
         member := clus.Procs[memberID]
         if member.Config().ExecPath == newExecPath {
@@ -117,11 +161,16 @@ func DowngradeUpgradeMembersByID(t *testing.T, lg *zap.Logger, clus *EtcdProcess
             return err
         }
     }
+
+    clusterVersion := targetVersion.String()
+    if !isDowngrade && len(membersToChange) != len(clus.Procs) {
+        clusterVersion = currentVersion.String()
+    }
     lg.Info("Validating versions")
     for _, memberID := range membersToChange {
         member := clus.Procs[memberID]
         ValidateVersion(t, clus.Cfg, member, version.Versions{
-            Cluster: targetVersion.String(),
+            Cluster: clusterVersion,
             Server:  targetVersion.String(),
         })
     }
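ValidateDowngradeInfo polls each member's Status until the reported DowngradeInfo matches the expectation or the one-minute timeout fires. For orientation, a hedged sketch of how a test might drive it; validateDowngradeEnabledEverywhere is a hypothetical wrapper, and the version math mirrors the e2e test above:

// Hypothetical helper, not part of this commit. Assumes the imports used by
// the e2e framework (semver, fmt, pb, e2e) and a running cluster epc.
func validateDowngradeEnabledEverywhere(t *testing.T, epc *e2e.EtcdProcessCluster, lastVersion *semver.Version) {
    // The cluster version carries only major.minor, so pin patch to .0,
    // mirroring lastClusterVersion in cluster_downgrade_test.go above.
    lastClusterVersion := semver.New(fmt.Sprintf("%d.%d.0", lastVersion.Major, lastVersion.Minor))

    e2e.DowngradeEnable(t, epc, lastVersion)
    e2e.ValidateDowngradeInfo(t, epc, &pb.DowngradeInfo{
        Enabled:       true,
        TargetVersion: lastClusterVersion.String(),
    })
}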

tests/robustness/failpoint/cluster.go

Lines changed: 7 additions & 0 deletions
@@ -232,6 +232,13 @@ func (f memberDowngradeUpgrade) Inject(ctx context.Context, t *testing.T, lg *za
     if err != nil {
         return nil, err
     }
+
+    // NOTE: By default, the leader can cancel the downgrade once all members
+    // have reached the target version. However, determining the final stable
+    // cluster version after an upgrade can be challenging. To ensure stability,
+    // we should wait for leader to cancel downgrade process.
+    e2e.AssertProcessLogs(t, clus.Procs[clus.WaitLeader(t)], "the cluster has been downgraded")
+
     // partial upgrade the cluster
     numberOfMembersToUpgrade := rand.Int()%len(clus.Procs) + 1
     err = e2e.DowngradeUpgradeMembers(t, lg, clus, numberOfMembersToUpgrade, lastVersion, currentVersion)
