Skip to content

Commit 7bdf489

Browse files
scheduler: fix the recovery time of slow store (#9388)
close #9384 Signed-off-by: Ryan Leung <[email protected]> Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com>
1 parent 29ead01 commit 7bdf489

File tree

2 files changed

+144
-5
lines changed

2 files changed

+144
-5
lines changed

pkg/schedule/schedulers/evict_slow_store.go

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ type evictSlowStoreSchedulerConfig struct {
4747
cluster *core.BasicCluster
4848
// Last timestamp of the chosen slow store for eviction.
4949
lastSlowStoreCaptureTS time.Time
50+
isRecovered bool
5051
// Duration gap for recovering the candidate, unit: s.
5152
RecoveryDurationGap uint64 `json:"recovery-duration"`
5253
EvictedStores []uint64 `json:"evict-stores"`
@@ -119,6 +120,21 @@ func (conf *evictSlowStoreSchedulerConfig) setStoreAndPersist(id uint64) error {
119120
return conf.save()
120121
}
121122

123+
func (conf *evictSlowStoreSchedulerConfig) tryUpdateRecoverStatus(isRecovered bool) error {
124+
conf.RLock()
125+
if conf.isRecovered == isRecovered {
126+
conf.RUnlock()
127+
return nil
128+
}
129+
conf.RUnlock()
130+
131+
conf.Lock()
132+
defer conf.Unlock()
133+
conf.lastSlowStoreCaptureTS = time.Now()
134+
conf.isRecovered = isRecovered
135+
return conf.save()
136+
}
137+
122138
func (conf *evictSlowStoreSchedulerConfig) clearAndPersist() (oldID uint64, err error) {
123139
oldID = conf.evictStore()
124140
conf.Lock()
@@ -299,14 +315,31 @@ func (s *evictSlowStoreScheduler) Schedule(cluster sche.SchedulerCluster, _ bool
299315
// slow node next time.
300316
log.Info("slow store has been removed",
301317
zap.Uint64("store-id", store.GetID()))
302-
} else if store.GetSlowScore() <= slowStoreRecoverThreshold && s.conf.readyForRecovery() {
318+
s.cleanupEvictLeader(cluster)
319+
return nil, nil
320+
}
321+
// recover slow store if its score is below the threshold.
322+
if store.GetSlowScore() <= slowStoreRecoverThreshold {
323+
if err := s.conf.tryUpdateRecoverStatus(true); err != nil {
324+
log.Info("evict-slow-store-scheduler persist config failed", zap.Uint64("store-id", store.GetID()), zap.Error(err))
325+
return nil, nil
326+
}
327+
328+
if !s.conf.readyForRecovery() {
329+
return nil, nil
330+
}
331+
303332
log.Info("slow store has been recovered",
304333
zap.Uint64("store-id", store.GetID()))
305-
} else {
306-
return s.schedulerEvictLeader(cluster), nil
334+
s.cleanupEvictLeader(cluster)
335+
return nil, nil
307336
}
308-
s.cleanupEvictLeader(cluster)
309-
return nil, nil
337+
// If the slow store is still slow or slow again, we can continue to evict leaders from it.
338+
if err := s.conf.tryUpdateRecoverStatus(false); err != nil {
339+
log.Info("evict-slow-store-scheduler persist config failed", zap.Uint64("store-id", store.GetID()), zap.Error(err))
340+
return nil, nil
341+
}
342+
return s.schedulerEvictLeader(cluster), nil
310343
}
311344

312345
var slowStore *core.StoreInfo

pkg/schedule/schedulers/evict_slow_store_test.go

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ package schedulers
1717
import (
1818
"context"
1919
"testing"
20+
"time"
2021

2122
"github.com/stretchr/testify/require"
2223
"github.com/stretchr/testify/suite"
@@ -207,3 +208,108 @@ func TestEvictSlowStoreBatch(t *testing.T) {
207208
re.Equal(5, persistValue.Batch)
208209
re.NoError(failpoint.Disable("github.com/tikv/pd/pkg/schedule/schedulers/transientRecoveryGap"))
209210
}
211+
212+
// TestRecoveryTime verifies that the evict-slow-store scheduler measures the
// recovery-duration window from the store's most recent slow/recovered state
// change: a store that turns slow again mid-window must restart the window,
// and eviction is lifted only after the store has stayed recovered for a full
// RecoveryDurationGap.
func TestRecoveryTime(t *testing.T) {
	re := require.New(t)
	cancel, _, tc, oc := prepareSchedulersTest()
	defer cancel()

	// Add stores 1, 2, 3 with different leader counts
	tc.AddLeaderStore(1, 10)
	tc.AddLeaderStore(2, 0)
	tc.AddLeaderStore(3, 0)

	// Add regions with leader in store 1
	for i := range 10 {
		tc.AddLeaderRegion(uint64(i), 1, 2, 3)
	}

	storage := storage.NewStorageWithMemoryBackend()
	es, err := CreateScheduler(types.EvictSlowStoreScheduler, oc, storage,
		ConfigSliceDecoder(types.EvictSlowStoreScheduler, []string{}), nil)
	re.NoError(err)
	bs, err := CreateScheduler(types.BalanceLeaderScheduler, oc, storage,
		ConfigSliceDecoder(types.BalanceLeaderScheduler, []string{}), nil)
	re.NoError(err)

	// Use a 1-second recovery gap so the window can be crossed with short sleeps.
	var recoveryTimeInSec uint64 = 1
	recoveryTime := 1 * time.Second
	es.(*evictSlowStoreScheduler).conf.RecoveryDurationGap = recoveryTimeInSec

	// Mark store 1 as slow
	storeInfo := tc.GetStore(1)
	slowStore := storeInfo.Clone(func(store *core.StoreInfo) {
		store.GetStoreStats().SlowScore = 100
	})
	tc.PutStore(slowStore)

	// Verify store is marked for eviction
	ops, _ := es.Schedule(tc, false)
	re.NotEmpty(ops)
	re.Equal(types.EvictSlowStoreScheduler.String(), ops[0].Desc())
	re.Equal(uint64(1), es.(*evictSlowStoreScheduler).conf.evictStore())

	// Store recovers from being slow
	time.Sleep(recoveryTime)
	recoveredStore := storeInfo.Clone(func(store *core.StoreInfo) {
		store.GetStoreStats().SlowScore = 0
	})
	tc.PutStore(recoveredStore)

	// Should not recover immediately due to recovery time window
	// (balance-leader producing no ops implies leaders are still evicted).
	for range 10 {
		// trigger recovery check
		es.Schedule(tc, false)
		ops, _ = bs.Schedule(tc, false)
		re.Empty(ops)
		re.Equal(uint64(1), es.(*evictSlowStoreScheduler).conf.evictStore())
	}

	// Store is slow again before recovery time is over
	time.Sleep(recoveryTime / 2)
	slowStore = storeInfo.Clone(func(store *core.StoreInfo) {
		store.GetStoreStats().SlowScore = 100
	})
	tc.PutStore(slowStore)
	time.Sleep(recoveryTime / 2)
	// Should not recover due to recovery time window recalculation
	for range 10 {
		// trigger recovery check
		es.Schedule(tc, false)
		ops, _ = bs.Schedule(tc, false)
		re.Empty(ops)
		re.Equal(uint64(1), es.(*evictSlowStoreScheduler).conf.evictStore())
	}

	// Store recovers from being slow
	time.Sleep(recoveryTime)
	recoveredStore = storeInfo.Clone(func(store *core.StoreInfo) {
		store.GetStoreStats().SlowScore = 0
	})
	tc.PutStore(recoveredStore)

	// Should not recover immediately due to recovery time window
	for range 10 {
		// trigger recovery check
		es.Schedule(tc, false)
		ops, _ = bs.Schedule(tc, false)
		re.Empty(ops)
		re.Equal(uint64(1), es.(*evictSlowStoreScheduler).conf.evictStore())
	}

	// Should now recover: the store stayed recovered for a full window.
	time.Sleep(recoveryTime)
	// trigger recovery check
	es.Schedule(tc, false)

	ops, _ = bs.Schedule(tc, false)
	re.Empty(ops)
	re.Empty(es.(*evictSlowStoreScheduler).conf.evictStore())

	// Verify persistence: the cleared eviction and recovered state survived
	// the round-trip through storage.
	var persistValue evictSlowStoreSchedulerConfig
	err = es.(*evictSlowStoreScheduler).conf.load(&persistValue)
	re.NoError(err)
	re.Zero(persistValue.evictStore())
	re.True(persistValue.readyForRecovery())
}

0 commit comments

Comments
 (0)