From 47381cd17d1f913aeff162b47b7a6afb15a666e0 Mon Sep 17 00:00:00 2001 From: hillium Date: Thu, 22 Jan 2026 11:47:44 +0800 Subject: [PATCH 01/18] br: add support for restoring to the last available timestamp Signed-off-by: hillium --- br/pkg/restore/log_client/client.go | 9 ++++ br/pkg/restore/log_client/id_map.go | 84 ++++++++++++++++++++++++++++- br/pkg/task/restore.go | 10 ++++ br/pkg/task/stream.go | 1 + 4 files changed, 102 insertions(+), 2 deletions(-) diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index 1175cb813eeec..4720d0ca9da5f 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -215,6 +215,7 @@ type LogClient struct { upstreamClusterID uint64 restoreID uint64 + lastRestore bool // the query to insert rows into table `gc_delete_range`, lack of ts. deleteRangeQuery []*stream.PreDelRangeQuery @@ -232,6 +233,14 @@ func (rc *LogClient) SetRestoreID(restoreID uint64) { rc.restoreID = restoreID } +func (rc *LogClient) SetRestoreToLast(restoreToLast bool) { + rc.lastRestore = restoreToLast +} + +func (rc *LogClient) LastOne() bool { + return rc.lastRestore +} + type restoreStatistics struct { // restoreSSTKVSize is the total size (Original KV length) of KV pairs restored from SST files. restoreSSTKVSize uint64 diff --git a/br/pkg/restore/log_client/id_map.go b/br/pkg/restore/log_client/id_map.go index 0ad5bb92ee9d0..49d7a13c55308 100644 --- a/br/pkg/restore/log_client/id_map.go +++ b/br/pkg/restore/log_client/id_map.go @@ -161,11 +161,42 @@ func (rc *LogClient) loadSchemasMap( if checkpointStorage := rc.tryGetCheckpointStorage(logCheckpointMetaManager); checkpointStorage != nil { log.Info("checkpoint storage is specified, load pitr id map from the checkpoint storage.") dbMaps, err := rc.loadSchemasMapFromStorage(ctx, checkpointStorage, restoredTS) - return dbMaps, errors.Trace(err) + if err != nil { + return nil, errors.Trace(err) + } + if len(dbMaps) > 0 { + return dbMaps, nil + } } if rc.pitrIDMapTableExists() { dbMaps, err := rc.loadSchemasMapFromTable(ctx, restoredTS) - return dbMaps, errors.Trace(err) + if err != nil { + return nil, errors.Trace(err) + } + if len(dbMaps) > 0 { + return dbMaps, nil + } + // If we are loading the base map for a previous restore (restoredTS != rc.restoreTS), + // fall back to the latest restore_id for this restoredTS. 
+ if restoredTS != rc.restoreTS && rc.pitrIDMapHasRestoreIDColumn() { + dbMaps, fallbackRestoreID, err := rc.loadSchemasMapFromTableLatestRestoreID(ctx, restoredTS) + if err != nil { + return nil, errors.Trace(err) + } + if len(dbMaps) > 0 { + log.Info("load pitr id map from latest restore_id for previous segment", + zap.Uint64("restored_ts", restoredTS), + zap.Uint64("restore_id", fallbackRestoreID), + zap.Uint64("current_restore_id", rc.restoreID)) + return dbMaps, nil + } + } + if rc.storage != nil { + log.Info("fallback to log backup storage for pitr id map", zap.Uint64("restored_ts", restoredTS)) + dbMaps, err := rc.loadSchemasMapFromStorage(ctx, rc.storage, restoredTS) + return dbMaps, errors.Trace(err) + } + return nil, nil } log.Info("the table mysql.tidb_pitr_id_map does not exist, maybe the cluster version is old.") dbMaps, err := rc.loadSchemasMapFromStorage(ctx, rc.storage, restoredTS) @@ -252,3 +283,52 @@ func (rc *LogClient) loadSchemasMapFromTable( return backupMeta.GetDbMaps(), nil } + +func (rc *LogClient) loadSchemasMapFromTableLatestRestoreID( + ctx context.Context, + restoredTS uint64, +) ([]*backuppb.PitrDBMap, uint64, error) { + getPitrIDMapSQL := "SELECT restore_id, segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %? ORDER BY restore_id DESC, segment_id;" + args := []any{restoredTS, rc.upstreamClusterID} + + execCtx := rc.unsafeSession.GetSessionCtx().GetRestrictedSQLExecutor() + rows, _, errSQL := execCtx.ExecRestrictedSQL( + kv.WithInternalSourceType(ctx, kv.InternalTxnBR), + nil, + getPitrIDMapSQL, + args..., + ) + if errSQL != nil { + return nil, 0, errors.Annotatef(errSQL, "failed to get pitr id map from mysql.tidb_pitr_id_map") + } + if len(rows) == 0 { + log.Info("pitr id map does not exist", zap.Uint64("restored ts", restoredTS)) + return nil, 0, nil + } + + targetRestoreID := rows[0].GetUint64(0) + expectedSegmentID := uint64(0) + metaData := make([]byte, 0, len(rows)*PITRIdMapBlockSize) + for _, row := range rows { + restoreID := row.GetUint64(0) + if restoreID != targetRestoreID { + break + } + elementID := row.GetUint64(1) + if expectedSegmentID != elementID { + return nil, 0, errors.Errorf("the part(segment_id = %d) of pitr id map is lost", expectedSegmentID) + } + d := row.GetBytes(2) + if len(d) == 0 { + return nil, 0, errors.Errorf("get the empty part(segment_id = %d) of pitr id map", expectedSegmentID) + } + metaData = append(metaData, d...) + expectedSegmentID += 1 + } + backupMeta := &backuppb.BackupMeta{} + if err := backupMeta.Unmarshal(metaData); err != nil { + return nil, 0, errors.Trace(err) + } + + return backupMeta.GetDbMaps(), targetRestoreID, nil +} diff --git a/br/pkg/task/restore.go b/br/pkg/task/restore.go index a35e3a557b1d0..ef8390c743e7f 100644 --- a/br/pkg/task/restore.go +++ b/br/pkg/task/restore.go @@ -104,6 +104,8 @@ const ( // FlagStreamStartTS and FlagStreamRestoreTS is used for log restore timestamp range. FlagStreamStartTS = "start-ts" FlagStreamRestoreTS = "restored-ts" + // FlagStreamLast is used for log restore, represents restore to the last available TS. + FlagStreamLast = "last" // FlagStreamFullBackupStorage is used for log restore, represents the full backup storage. FlagStreamFullBackupStorage = "full-backup-storage" // FlagPiTRBatchCount and FlagPiTRBatchSize are used for restore log with batch method. 
@@ -277,6 +279,8 @@ type RestoreConfig struct { RestoreTS uint64 `json:"restore-ts" toml:"restore-ts"` // whether RestoreTS was explicitly specified by user vs auto-detected IsRestoredTSUserSpecified bool `json:"-" toml:"-"` + // LastRestore represents whether restore is the last one. + LastRestore bool `json:"last" toml:"last"` // rewriteTS is the rewritten timestamp of meta kvs. RewriteTS uint64 `json:"-" toml:"-"` tiflashRecorder *tiflashrec.TiFlashRecorder `json:"-" toml:"-"` @@ -382,6 +386,7 @@ func DefineStreamRestoreFlags(command *cobra.Command) { "support TSO or datetime, e.g. '400036290571534337' or '2018-05-11 01:42:23+0800'") command.Flags().String(FlagStreamRestoreTS, "", "the point of restore, used for log restore.\n"+ "support TSO or datetime, e.g. '400036290571534337' or '2018-05-11 01:42:23+0800'") + command.Flags().Bool(FlagStreamLast, true, "restore to the last available commit timestamp") command.Flags().String(FlagStreamFullBackupStorage, "", "specify the backup full storage. "+ "fill it if want restore full backup before restore log.") command.Flags().Uint32(FlagPiTRBatchCount, defaultPiTRBatchCount, "specify the batch count to restore log.") @@ -409,6 +414,11 @@ func (cfg *RestoreConfig) ParseStreamRestoreFlags(flags *pflag.FlagSet) error { // check if RestoreTS was explicitly specified by user cfg.IsRestoredTSUserSpecified = flags.Changed(FlagStreamRestoreTS) + cfg.LastRestore, err = flags.GetBool(FlagStreamLast) + if err != nil { + return errors.Trace(err) + } + if cfg.FullBackupStorage, err = flags.GetString(FlagStreamFullBackupStorage); err != nil { return errors.Trace(err) } diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index 5108b4fd1e017..f5ef2876d0cbf 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -1841,6 +1841,7 @@ func createLogClient(ctx context.Context, g glue.Glue, cfg *RestoreConfig, mgr * } client.SetCrypter(&cfg.CipherInfo) client.SetUpstreamClusterID(cfg.UpstreamClusterID) + client.SetRestoreToLast(cfg.LastRestore) err = client.InitClients(ctx, u, cfg.logCheckpointMetaManager, cfg.sstCheckpointMetaManager, uint(cfg.PitrConcurrency), cfg.ConcurrencyPerStore.Value) if err != nil { From c61038b4a7b2e0e5a6b4cf42fe20339e4f2345c0 Mon Sep 17 00:00:00 2001 From: hillium Date: Thu, 22 Jan 2026 14:37:42 +0800 Subject: [PATCH 02/18] persist tiflash replica Signed-off-by: hillium --- br/pkg/restore/log_client/tiflash_items.go | 154 +++++++++++++++++++++ br/pkg/task/restore.go | 4 + br/pkg/task/stream.go | 50 ++++++- 3 files changed, 203 insertions(+), 5 deletions(-) create mode 100644 br/pkg/restore/log_client/tiflash_items.go diff --git a/br/pkg/restore/log_client/tiflash_items.go b/br/pkg/restore/log_client/tiflash_items.go new file mode 100644 index 0000000000000..a5c47b1eb883d --- /dev/null +++ b/br/pkg/restore/log_client/tiflash_items.go @@ -0,0 +1,154 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package logclient + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/pingcap/errors" + "github.com/pingcap/log" + "github.com/pingcap/tidb/br/pkg/checkpoint" + "github.com/pingcap/tidb/pkg/meta/model" + "github.com/pingcap/tidb/pkg/objstore/storeapi" + "go.uber.org/zap" +) + +const pitrTiFlashItemsDir = "pitr_tiflash_items" + +type pitrTiFlashItems struct { + Items map[int64]model.TiFlashReplicaInfo `json:"items"` +} + +func PitrTiFlashItemsFilename(clusterID, restoredTS uint64) string { + return fmt.Sprintf("%s/pitr_tiflash_items.cluster_id:%d.restored_ts:%d", pitrTiFlashItemsDir, clusterID, restoredTS) +} + +func (rc *LogClient) loadTiFlashItemsFromStorage( + ctx context.Context, + storage storeapi.Storage, + restoredTS uint64, +) (map[int64]model.TiFlashReplicaInfo, bool, error) { + clusterID := rc.GetClusterID(ctx) + fileName := PitrTiFlashItemsFilename(clusterID, restoredTS) + exists, err := storage.FileExists(ctx, fileName) + if err != nil { + return nil, false, errors.Annotatef(err, "failed to check tiflash items file %s", fileName) + } + if !exists { + return nil, false, nil + } + + raw, err := storage.ReadFile(ctx, fileName) + if err != nil { + return nil, false, errors.Annotatef(err, "failed to read tiflash items file %s", fileName) + } + + var payload pitrTiFlashItems + if err := json.Unmarshal(raw, &payload); err != nil { + return nil, false, errors.Annotatef(err, "failed to unmarshal tiflash items file %s", fileName) + } + if payload.Items == nil { + payload.Items = map[int64]model.TiFlashReplicaInfo{} + } + log.Info("loaded pitr tiflash items", zap.String("file", fileName), zap.Int("item-count", len(payload.Items))) + return payload.Items, true, nil +} + +func (rc *LogClient) saveTiFlashItemsToStorage( + ctx context.Context, + storage storeapi.Storage, + restoredTS uint64, + items map[int64]model.TiFlashReplicaInfo, +) error { + clusterID := rc.GetClusterID(ctx) + fileName := PitrTiFlashItemsFilename(clusterID, restoredTS) + if items == nil { + items = map[int64]model.TiFlashReplicaInfo{} + } + payload := pitrTiFlashItems{Items: items} + raw, err := json.Marshal(&payload) + if err != nil { + return errors.Trace(err) + } + log.Info("saving pitr tiflash items", zap.String("file", fileName), zap.Int("item-count", len(items))) + if err := storage.WriteFile(ctx, fileName, raw); err != nil { + return errors.Annotatef(err, "failed to save tiflash items file %s", fileName) + } + return nil +} + +func (rc *LogClient) loadTiFlashItems( + ctx context.Context, + restoredTS uint64, + logCheckpointMetaManager checkpoint.LogMetaManagerT, +) (map[int64]model.TiFlashReplicaInfo, error) { + if checkpointStorage := rc.tryGetCheckpointStorage(logCheckpointMetaManager); checkpointStorage != nil { + items, found, err := rc.loadTiFlashItemsFromStorage(ctx, checkpointStorage, restoredTS) + if err != nil { + return nil, errors.Trace(err) + } + if found { + return items, nil + } + } + if rc.storage == nil { + return nil, nil + } + items, found, err := rc.loadTiFlashItemsFromStorage(ctx, rc.storage, restoredTS) + if err != nil { + return nil, errors.Trace(err) + } + if !found { + return nil, nil + } + return items, nil +} + +func (rc *LogClient) saveTiFlashItems( + ctx context.Context, + restoredTS uint64, + items map[int64]model.TiFlashReplicaInfo, + logCheckpointMetaManager checkpoint.LogMetaManagerT, +) error { + storage := rc.tryGetCheckpointStorage(logCheckpointMetaManager) + if storage == nil { + storage = rc.storage + } + if storage == nil { + return errors.New("no storage 
available for persisting tiflash items") + } + return errors.Trace(rc.saveTiFlashItemsToStorage(ctx, storage, restoredTS, items)) +} + +// LoadTiFlashRecorderItems loads persisted TiFlash recorder items for a segment. +func (rc *LogClient) LoadTiFlashRecorderItems( + ctx context.Context, + restoredTS uint64, + logCheckpointMetaManager checkpoint.LogMetaManagerT, +) (map[int64]model.TiFlashReplicaInfo, error) { + return rc.loadTiFlashItems(ctx, restoredTS, logCheckpointMetaManager) +} + +// SaveTiFlashRecorderItems persists TiFlash recorder items for the next segment. +func (rc *LogClient) SaveTiFlashRecorderItems( + ctx context.Context, + restoredTS uint64, + items map[int64]model.TiFlashReplicaInfo, + logCheckpointMetaManager checkpoint.LogMetaManagerT, +) error { + return rc.saveTiFlashItems(ctx, restoredTS, items, logCheckpointMetaManager) +} diff --git a/br/pkg/task/restore.go b/br/pkg/task/restore.go index ef8390c743e7f..7b1c222ddb0fc 100644 --- a/br/pkg/task/restore.go +++ b/br/pkg/task/restore.go @@ -281,6 +281,8 @@ type RestoreConfig struct { IsRestoredTSUserSpecified bool `json:"-" toml:"-"` // LastRestore represents whether restore is the last one. LastRestore bool `json:"last" toml:"last"` + // whether LastRestore was explicitly specified by user vs default + IsLastRestoreUserSpecified bool `json:"-" toml:"-"` // rewriteTS is the rewritten timestamp of meta kvs. RewriteTS uint64 `json:"-" toml:"-"` tiflashRecorder *tiflashrec.TiFlashRecorder `json:"-" toml:"-"` @@ -414,6 +416,8 @@ func (cfg *RestoreConfig) ParseStreamRestoreFlags(flags *pflag.FlagSet) error { // check if RestoreTS was explicitly specified by user cfg.IsRestoredTSUserSpecified = flags.Changed(FlagStreamRestoreTS) + // check if LastRestore was explicitly specified by user + cfg.IsLastRestoreUserSpecified = flags.Changed(FlagStreamLast) cfg.LastRestore, err = flags.GetBool(FlagStreamLast) if err != nil { return errors.Trace(err) diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index f5ef2876d0cbf..f82efd1d2551f 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -1372,6 +1372,12 @@ func RunStreamRestore( if err := checkLogRange(cfg.StartTS, cfg.RestoreTS, logInfo.logMinTS, logInfo.logMaxTS); err != nil { return errors.Trace(err) } + if cfg.LastRestore && !cfg.IsLastRestoreUserSpecified && cfg.IsRestoredTSUserSpecified && cfg.RestoreTS < logInfo.logMaxTS { + log.Info("restore-ts is before log max and --last not specified; treating as non-final segment", + zap.Uint64("restore-ts", cfg.RestoreTS), + zap.Uint64("log-max-ts", logInfo.logMaxTS)) + cfg.LastRestore = false + } // register task if needed // will potentially override restoredTS @@ -1620,6 +1626,9 @@ func restoreStream( if err := buildAndSaveIDMapIfNeeded(ctx, client, cfg); err != nil { return errors.Trace(err) } + if err := loadTiFlashRecorderItemsIfNeeded(ctx, client, cfg); err != nil { + return errors.Trace(err) + } // build schema replace schemasReplace, err := buildSchemaReplace(client, cfg) @@ -1795,11 +1804,19 @@ func restoreStream( } if cfg.tiflashRecorder != nil { - sqls := cfg.tiflashRecorder.GenerateAlterTableDDLs(mgr.GetDomain().InfoSchema()) - log.Info("Generating SQLs for restoring TiFlash Replica", - zap.Strings("sqls", sqls)) - if err := client.ResetTiflashReplicas(ctx, sqls, g); err != nil { - return errors.Annotate(err, "failed to reset tiflash replicas") + if !cfg.LastRestore { + if err := client.SaveTiFlashRecorderItems(ctx, cfg.RestoreTS, cfg.tiflashRecorder.GetItems(), cfg.logCheckpointMetaManager); err != nil { + 
return errors.Annotate(err, "failed to persist tiflash items for next segment") + } + log.Info("skip restoring TiFlash Replica until last segment", + zap.Uint64("restored-ts", cfg.RestoreTS)) + } else { + sqls := cfg.tiflashRecorder.GenerateAlterTableDDLs(mgr.GetDomain().InfoSchema()) + log.Info("Generating SQLs for restoring TiFlash Replica", + zap.Strings("sqls", sqls)) + if err := client.ResetTiflashReplicas(ctx, sqls, g); err != nil { + return errors.Annotate(err, "failed to reset tiflash replicas") + } } } @@ -2244,6 +2261,29 @@ func buildAndSaveIDMapIfNeeded(ctx context.Context, client *logclient.LogClient, return nil } +func loadTiFlashRecorderItemsIfNeeded(ctx context.Context, client *logclient.LogClient, cfg *LogRestoreConfig) error { + if cfg.tiflashRecorder == nil { + return nil + } + if len(cfg.FullBackupStorage) != 0 { + return nil + } + + items, err := client.LoadTiFlashRecorderItems(ctx, cfg.StartTS, cfg.logCheckpointMetaManager) + if err != nil { + return errors.Trace(err) + } + if items == nil { + log.Info("no tiflash items found for previous segment", zap.Uint64("start-ts", cfg.StartTS)) + return nil + } + cfg.tiflashRecorder.Load(items) + log.Info("loaded tiflash items for previous segment", + zap.Uint64("start-ts", cfg.StartTS), + zap.Int("item-count", len(items))) + return nil +} + func getCurrentTSFromCheckpointOrPD(ctx context.Context, mgr *conn.Mgr, cfg *LogRestoreConfig) (uint64, error) { if cfg.checkpointTaskInfo != nil && cfg.checkpointTaskInfo.Metadata != nil { // reuse the checkpoint task's rewrite ts From 2732cb8af91d7a4b05f36d5c16aa3a59b648ad22 Mon Sep 17 00:00:00 2001 From: hillium Date: Thu, 22 Jan 2026 16:29:51 +0800 Subject: [PATCH 03/18] handle ingest ddl Signed-off-by: hillium --- br/pkg/restore/ingestrec/ingest_recorder.go | 54 +++++++ br/pkg/restore/log_client/ingest_items.go | 167 ++++++++++++++++++++ br/pkg/task/stream.go | 58 ++++++- 3 files changed, 277 insertions(+), 2 deletions(-) create mode 100644 br/pkg/restore/log_client/ingest_items.go diff --git a/br/pkg/restore/ingestrec/ingest_recorder.go b/br/pkg/restore/ingestrec/ingest_recorder.go index f406d36b0d680..7419d9b340eac 100644 --- a/br/pkg/restore/ingestrec/ingest_recorder.go +++ b/br/pkg/restore/ingestrec/ingest_recorder.go @@ -247,3 +247,57 @@ func (i *IngestRecorder) IterateForeignKeys(f func(*ForeignKeyRecord) error) err } return nil } + +// ExportItems returns a snapshot of ingest items keyed by table ID and index ID. +func (i *IngestRecorder) ExportItems() map[int64]map[int64]bool { + items := make(map[int64]map[int64]bool, len(i.items)) + for tableID, indexes := range i.items { + if len(indexes) == 0 { + continue + } + tableItems := make(map[int64]bool, len(indexes)) + for indexID, info := range indexes { + if info == nil { + continue + } + tableItems[indexID] = info.IsPrimary + } + if len(tableItems) > 0 { + items[tableID] = tableItems + } + } + return items +} + +// MergeItems merges the provided ingest items into the recorder. 
+func (i *IngestRecorder) MergeItems(items map[int64]map[int64]bool) { + if len(items) == 0 { + return + } + if i.items == nil { + i.items = make(map[int64]map[int64]*IngestIndexInfo) + } + for tableID, indexMap := range items { + if len(indexMap) == 0 { + continue + } + tableIndexes, exists := i.items[tableID] + if !exists { + tableIndexes = make(map[int64]*IngestIndexInfo, len(indexMap)) + i.items[tableID] = tableIndexes + } + for indexID, isPrimary := range indexMap { + info, exists := tableIndexes[indexID] + if !exists { + tableIndexes[indexID] = &IngestIndexInfo{ + IsPrimary: isPrimary, + Updated: false, + } + continue + } + if isPrimary && !info.IsPrimary { + info.IsPrimary = true + } + } + } +} diff --git a/br/pkg/restore/log_client/ingest_items.go b/br/pkg/restore/log_client/ingest_items.go new file mode 100644 index 0000000000000..56b695c12f08e --- /dev/null +++ b/br/pkg/restore/log_client/ingest_items.go @@ -0,0 +1,167 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package logclient + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/pingcap/errors" + "github.com/pingcap/log" + "github.com/pingcap/tidb/br/pkg/checkpoint" + "github.com/pingcap/tidb/pkg/objstore/storeapi" + "go.uber.org/zap" +) + +const pitrIngestItemsDir = "pitr_ingest_items" + +type pitrIngestItems struct { + Items map[int64]map[int64]bool `json:"items"` +} + +func PitrIngestItemsFilename(clusterID, restoredTS uint64) string { + return fmt.Sprintf("%s/pitr_ingest_items.cluster_id:%d.restored_ts:%d", pitrIngestItemsDir, clusterID, restoredTS) +} + +func countIngestItems(items map[int64]map[int64]bool) int { + total := 0 + for _, indexMap := range items { + total += len(indexMap) + } + return total +} + +func (rc *LogClient) loadIngestItemsFromStorage( + ctx context.Context, + storage storeapi.Storage, + restoredTS uint64, +) (map[int64]map[int64]bool, bool, error) { + clusterID := rc.GetClusterID(ctx) + fileName := PitrIngestItemsFilename(clusterID, restoredTS) + exists, err := storage.FileExists(ctx, fileName) + if err != nil { + return nil, false, errors.Annotatef(err, "failed to check ingest items file %s", fileName) + } + if !exists { + return nil, false, nil + } + + raw, err := storage.ReadFile(ctx, fileName) + if err != nil { + return nil, false, errors.Annotatef(err, "failed to read ingest items file %s", fileName) + } + + var payload pitrIngestItems + if err := json.Unmarshal(raw, &payload); err != nil { + return nil, false, errors.Annotatef(err, "failed to unmarshal ingest items file %s", fileName) + } + if payload.Items == nil { + payload.Items = map[int64]map[int64]bool{} + } + log.Info("loaded pitr ingest items", + zap.String("file", fileName), + zap.Int("table-count", len(payload.Items)), + zap.Int("index-count", countIngestItems(payload.Items))) + return payload.Items, true, nil +} + +func (rc *LogClient) saveIngestItemsToStorage( + ctx context.Context, + storage storeapi.Storage, + restoredTS uint64, + items map[int64]map[int64]bool, +) 
error { + clusterID := rc.GetClusterID(ctx) + fileName := PitrIngestItemsFilename(clusterID, restoredTS) + if items == nil { + items = map[int64]map[int64]bool{} + } + payload := pitrIngestItems{Items: items} + raw, err := json.Marshal(&payload) + if err != nil { + return errors.Trace(err) + } + log.Info("saving pitr ingest items", + zap.String("file", fileName), + zap.Int("table-count", len(items)), + zap.Int("index-count", countIngestItems(items))) + if err := storage.WriteFile(ctx, fileName, raw); err != nil { + return errors.Annotatef(err, "failed to save ingest items file %s", fileName) + } + return nil +} + +func (rc *LogClient) loadIngestItems( + ctx context.Context, + restoredTS uint64, + logCheckpointMetaManager checkpoint.LogMetaManagerT, +) (map[int64]map[int64]bool, error) { + if checkpointStorage := rc.tryGetCheckpointStorage(logCheckpointMetaManager); checkpointStorage != nil { + items, found, err := rc.loadIngestItemsFromStorage(ctx, checkpointStorage, restoredTS) + if err != nil { + return nil, errors.Trace(err) + } + if found { + return items, nil + } + } + if rc.storage == nil { + return nil, nil + } + items, found, err := rc.loadIngestItemsFromStorage(ctx, rc.storage, restoredTS) + if err != nil { + return nil, errors.Trace(err) + } + if !found { + return nil, nil + } + return items, nil +} + +func (rc *LogClient) saveIngestItems( + ctx context.Context, + restoredTS uint64, + items map[int64]map[int64]bool, + logCheckpointMetaManager checkpoint.LogMetaManagerT, +) error { + storage := rc.tryGetCheckpointStorage(logCheckpointMetaManager) + if storage == nil { + storage = rc.storage + } + if storage == nil { + return errors.New("no storage available for persisting ingest items") + } + return errors.Trace(rc.saveIngestItemsToStorage(ctx, storage, restoredTS, items)) +} + +// LoadIngestRecorderItems loads persisted ingest recorder items for a segment. +func (rc *LogClient) LoadIngestRecorderItems( + ctx context.Context, + restoredTS uint64, + logCheckpointMetaManager checkpoint.LogMetaManagerT, +) (map[int64]map[int64]bool, error) { + return rc.loadIngestItems(ctx, restoredTS, logCheckpointMetaManager) +} + +// SaveIngestRecorderItems persists ingest recorder items for the next segment. 
+func (rc *LogClient) SaveIngestRecorderItems( + ctx context.Context, + restoredTS uint64, + items map[int64]map[int64]bool, + logCheckpointMetaManager checkpoint.LogMetaManagerT, +) error { + return rc.saveIngestItems(ctx, restoredTS, items, logCheckpointMetaManager) +} diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index f82efd1d2551f..138ff9aef0b31 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -1499,6 +1499,7 @@ func restoreStream( checkpointTotalSize uint64 currentTS uint64 extraFields []zapcore.Field + ingestItemsForNextSeg map[int64]map[int64]bool mu sync.Mutex startTime = time.Now() ) @@ -1635,6 +1636,9 @@ func restoreStream( if err != nil { return errors.Trace(err) } + if err := loadIngestRecorderItemsIfNeeded(ctx, client, cfg, schemasReplace.GetIngestRecorder()); err != nil { + return errors.Trace(err) + } importModeSwitcher := restore.NewImportModeSwitcher(mgr.GetPDClient(), cfg.Config.SwitchModeInterval, mgr.GetTLSConfig()) @@ -1687,6 +1691,9 @@ func restoreStream( rewriteRules := buildRewriteRules(schemasReplace) ingestRecorder := schemasReplace.GetIngestRecorder() + if !cfg.LastRestore { + ingestItemsForNextSeg = ingestRecorder.ExportItems() + } if err := rangeFilterFromIngestRecorder(ingestRecorder, rewriteRules); err != nil { return errors.Trace(err) } @@ -1799,8 +1806,18 @@ func restoreStream( } // index ingestion is not captured by regular log backup, so we need to manually ingest again - if err = client.RepairIngestIndex(ctx, ingestRecorder, cfg.logCheckpointMetaManager, g); err != nil { - return errors.Annotate(err, "failed to repair ingest index") + if cfg.LastRestore { + if err = client.RepairIngestIndex(ctx, ingestRecorder, cfg.logCheckpointMetaManager, g); err != nil { + return errors.Annotate(err, "failed to repair ingest index") + } + } else { + if len(ingestItemsForNextSeg) > 0 { + if err := client.SaveIngestRecorderItems(ctx, cfg.RestoreTS, ingestItemsForNextSeg, cfg.logCheckpointMetaManager); err != nil { + return errors.Annotate(err, "failed to persist ingest items for next segment") + } + } + log.Info("skip repairing ingest index until last segment", + zap.Uint64("restored-ts", cfg.RestoreTS)) } if cfg.tiflashRecorder != nil { @@ -2284,6 +2301,43 @@ func loadTiFlashRecorderItemsIfNeeded(ctx context.Context, client *logclient.Log return nil } +func countIngestRecorderItems(items map[int64]map[int64]bool) int { + total := 0 + for _, indexMap := range items { + total += len(indexMap) + } + return total +} + +func loadIngestRecorderItemsIfNeeded( + ctx context.Context, + client *logclient.LogClient, + cfg *LogRestoreConfig, + recorder *ingestrec.IngestRecorder, +) error { + if recorder == nil { + return nil + } + if len(cfg.FullBackupStorage) != 0 { + return nil + } + + items, err := client.LoadIngestRecorderItems(ctx, cfg.StartTS, cfg.logCheckpointMetaManager) + if err != nil { + return errors.Trace(err) + } + if items == nil { + log.Info("no ingest items found for previous segment", zap.Uint64("start-ts", cfg.StartTS)) + return nil + } + recorder.MergeItems(items) + log.Info("loaded ingest items for previous segment", + zap.Uint64("start-ts", cfg.StartTS), + zap.Int("table-count", len(items)), + zap.Int("index-count", countIngestRecorderItems(items))) + return nil +} + func getCurrentTSFromCheckpointOrPD(ctx context.Context, mgr *conn.Mgr, cfg *LogRestoreConfig) (uint64, error) { if cfg.checkpointTaskInfo != nil && cfg.checkpointTaskInfo.Metadata != nil { // reuse the checkpoint task's rewrite ts From 
5bcfd84671d1b856f2ff930b4c02d6f281de5b89 Mon Sep 17 00:00:00 2001 From: hillium Date: Fri, 23 Jan 2026 14:12:37 +0800 Subject: [PATCH 04/18] tidy up codebase Signed-off-by: hillium --- br/pkg/checkpoint/manager.go | 9 + br/pkg/checkpoint/pitr_items.go | 219 ++++++++++++++++++++ br/pkg/checkpoint/pitr_items_manager.go | 172 +++++++++++++++ br/pkg/checkpoint/storage.go | 1 + br/pkg/restore/ingestrec/ingest_recorder.go | 9 + br/pkg/restore/log_client/ingest_items.go | 147 +++---------- br/pkg/restore/log_client/tiflash_items.go | 130 ++---------- br/pkg/task/stream.go | 14 +- 8 files changed, 462 insertions(+), 239 deletions(-) create mode 100644 br/pkg/checkpoint/pitr_items.go create mode 100644 br/pkg/checkpoint/pitr_items_manager.go diff --git a/br/pkg/checkpoint/manager.go b/br/pkg/checkpoint/manager.go index 3f0e40b2f80e2..f52774f79ea45 100644 --- a/br/pkg/checkpoint/manager.go +++ b/br/pkg/checkpoint/manager.go @@ -23,6 +23,7 @@ import ( backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/tidb/br/pkg/glue" "github.com/pingcap/tidb/pkg/domain" + "github.com/pingcap/tidb/pkg/meta/model" "github.com/pingcap/tidb/pkg/objstore/storeapi" "github.com/pingcap/tidb/pkg/parser/ast" ) @@ -69,6 +70,7 @@ type MetaManager[K KeyType, SV, LV ValueType, M any] interface { type LogMetaManager[K KeyType, SV, LV ValueType, M any] interface { MetaManager[K, SV, LV, M] + SegmentedRestoreStorage LoadCheckpointProgress(context.Context) (*CheckpointProgress, error) SaveCheckpointProgress(context.Context, *CheckpointProgress) error @@ -81,6 +83,13 @@ type LogMetaManager[K KeyType, SV, LV ValueType, M any] interface { TryGetStorage() storeapi.Storage } +type SegmentedRestoreStorage interface { + LoadPITRIngestItems(context.Context, uint64, uint64) (map[int64]map[int64]bool, bool, error) + SavePITRIngestItems(context.Context, uint64, uint64, map[int64]map[int64]bool) error + LoadPITRTiFlashItems(context.Context, uint64, uint64) (map[int64]model.TiFlashReplicaInfo, bool, error) + SavePITRTiFlashItems(context.Context, uint64, uint64, map[int64]model.TiFlashReplicaInfo) error +} + type TableMetaManager[K KeyType, SV, LV ValueType, M any] struct { se glue.Session runnerSe glue.Session diff --git a/br/pkg/checkpoint/pitr_items.go b/br/pkg/checkpoint/pitr_items.go new file mode 100644 index 0000000000000..ef8b88ca1791b --- /dev/null +++ b/br/pkg/checkpoint/pitr_items.go @@ -0,0 +1,219 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package checkpoint + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/pingcap/errors" + "github.com/pingcap/tidb/br/pkg/glue" + "github.com/pingcap/tidb/pkg/domain" + "github.com/pingcap/tidb/pkg/kv" + "github.com/pingcap/tidb/pkg/meta/model" + "github.com/pingcap/tidb/pkg/objstore/storeapi" + "github.com/pingcap/tidb/pkg/parser/ast" + "github.com/pingcap/tidb/pkg/util/sqlexec" +) + +const ( + LogRestorePITRItemsDatabaseName = "__TiDB_BR_Temporary_Log_Restore_PiTR_Items" + + pitrIngestItemsTableName = "pitr_ingest_items" + pitrTiFlashItemsTableName = "pitr_tiflash_items" + + pitrIngestItemsDir = "pitr_ingest_items" + pitrTiFlashItemsDir = "pitr_tiflash_items" + + createPITRItemsTable = ` + CREATE TABLE IF NOT EXISTS %n.%n ( + cluster_id BIGINT NOT NULL, + restored_ts BIGINT NOT NULL, + segment_id BIGINT NOT NULL, + data BLOB(524288) NOT NULL, + update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY(cluster_id, restored_ts, segment_id));` + + insertPITRItemsSQLTemplate = ` + REPLACE INTO %n.%n (cluster_id, restored_ts, segment_id, data) VALUES (%?, %?, %?, %?);` + + selectPITRItemsSQLTemplate = ` + SELECT segment_id, data FROM %n.%n WHERE cluster_id = %? AND restored_ts = %? ORDER BY segment_id;` + + deletePITRItemsSQLTemplate = ` + DELETE FROM %n.%n WHERE cluster_id = %? AND restored_ts = %?;` +) + +type pitrIngestItemsPayload struct { + Items map[int64]map[int64]bool `json:"items"` +} + +type pitrTiFlashItemsPayload struct { + Items map[int64]model.TiFlashReplicaInfo `json:"items"` +} + +func marshalPITRIngestItems(items map[int64]map[int64]bool) ([]byte, error) { + if items == nil { + items = map[int64]map[int64]bool{} + } + return json.Marshal(&pitrIngestItemsPayload{Items: items}) +} + +func unmarshalPITRIngestItems(data []byte) (map[int64]map[int64]bool, error) { + var payload pitrIngestItemsPayload + if err := json.Unmarshal(data, &payload); err != nil { + return nil, errors.Trace(err) + } + if payload.Items == nil { + payload.Items = map[int64]map[int64]bool{} + } + return payload.Items, nil +} + +func marshalPITRTiFlashItems(items map[int64]model.TiFlashReplicaInfo) ([]byte, error) { + if items == nil { + items = map[int64]model.TiFlashReplicaInfo{} + } + return json.Marshal(&pitrTiFlashItemsPayload{Items: items}) +} + +func unmarshalPITRTiFlashItems(data []byte) (map[int64]model.TiFlashReplicaInfo, error) { + var payload pitrTiFlashItemsPayload + if err := json.Unmarshal(data, &payload); err != nil { + return nil, errors.Trace(err) + } + if payload.Items == nil { + payload.Items = map[int64]model.TiFlashReplicaInfo{} + } + return payload.Items, nil +} + +func pitrItemsFilename(dir, name string, clusterID, restoredTS uint64) string { + return fmt.Sprintf("%s/%s.cluster_id:%d.restored_ts:%d", dir, name, clusterID, restoredTS) +} + +func pitrIngestItemsPath(clusterID, restoredTS uint64) string { + return pitrItemsFilename(pitrIngestItemsDir, pitrIngestItemsDir, clusterID, restoredTS) +} + +func pitrTiFlashItemsPath(clusterID, restoredTS uint64) string { + return pitrItemsFilename(pitrTiFlashItemsDir, pitrTiFlashItemsDir, clusterID, restoredTS) +} + +func loadPITRItemsFromStorage( + ctx context.Context, + storage storeapi.Storage, + path string, + itemName string, +) ([]byte, bool, error) { + exists, err := storage.FileExists(ctx, path) + if err != nil { + return nil, false, errors.Annotatef(err, "failed to check %s file %s", itemName, path) + } + if !exists { + return nil, false, nil + } + raw, err := storage.ReadFile(ctx, path) + if err != nil { + return 
nil, false, errors.Annotatef(err, "failed to read %s file %s", itemName, path) + } + return raw, true, nil +} + +func savePITRItemsToStorage( + ctx context.Context, + storage storeapi.Storage, + path string, + itemName string, + data []byte, +) error { + if err := storage.WriteFile(ctx, path, data); err != nil { + return errors.Annotatef(err, "failed to save %s file %s", itemName, path) + } + return nil +} + +func initPITRItemsTable(ctx context.Context, se glue.Session, dbName string, tableNames []string) error { + if err := se.ExecuteInternal(ctx, "CREATE DATABASE IF NOT EXISTS %n;", dbName); err != nil { + return errors.Trace(err) + } + for _, tableName := range tableNames { + if err := se.ExecuteInternal(ctx, createPITRItemsTable, dbName, tableName); err != nil { + return errors.Trace(err) + } + } + return nil +} + +func pitrItemsTableExists(dom *domain.Domain, tableName string) bool { + if dom == nil { + return false + } + return dom.InfoSchema(). + TableExists(ast.NewCIStr(LogRestorePITRItemsDatabaseName), ast.NewCIStr(tableName)) +} + +func loadPITRItemsFromTable( + ctx context.Context, + execCtx sqlexec.RestrictedSQLExecutor, + dbName string, + tableName string, + clusterID uint64, + restoredTS uint64, +) ([]byte, bool, error) { + rows, _, errSQL := execCtx.ExecRestrictedSQL( + kv.WithInternalSourceType(ctx, kv.InternalTxnBR), + nil, + selectPITRItemsSQLTemplate, + dbName, tableName, clusterID, restoredTS, + ) + if errSQL != nil { + return nil, false, errors.Annotatef(errSQL, "failed to get pitr items from table %s.%s", dbName, tableName) + } + if len(rows) == 0 { + return nil, false, nil + } + data := make([]byte, 0, len(rows)*CheckpointIdMapBlockSize) + for i, row := range rows { + segmentID, chunk := row.GetUint64(0), row.GetBytes(1) + if uint64(i) != segmentID { + return nil, false, errors.Errorf( + "pitr items table %s.%s is incomplete at segment %d", dbName, tableName, segmentID) + } + data = append(data, chunk...) + } + return data, true, nil +} + +func savePITRItemsToTable( + ctx context.Context, + se glue.Session, + dbName string, + tableName string, + clusterID uint64, + restoredTS uint64, + data []byte, +) error { + if err := initPITRItemsTable(ctx, se, dbName, []string{tableName}); err != nil { + return errors.Trace(err) + } + if err := se.ExecuteInternal(ctx, deletePITRItemsSQLTemplate, dbName, tableName, clusterID, restoredTS); err != nil { + return errors.Trace(err) + } + return errors.Trace(chunkInsertCheckpointData(data, func(segmentID uint64, chunk []byte) error { + return errors.Trace(se.ExecuteInternal(ctx, insertPITRItemsSQLTemplate, dbName, tableName, clusterID, restoredTS, segmentID, chunk)) + })) +} diff --git a/br/pkg/checkpoint/pitr_items_manager.go b/br/pkg/checkpoint/pitr_items_manager.go new file mode 100644 index 0000000000000..311e08c2733bd --- /dev/null +++ b/br/pkg/checkpoint/pitr_items_manager.go @@ -0,0 +1,172 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package checkpoint + +import ( + "context" + + "github.com/pingcap/errors" + "github.com/pingcap/tidb/pkg/meta/model" +) + +func (manager *TableMetaManager[K, SV, LV, M]) LoadPITRIngestItems( + ctx context.Context, + clusterID uint64, + restoredTS uint64, +) (map[int64]map[int64]bool, bool, error) { + if !pitrItemsTableExists(manager.dom, pitrIngestItemsTableName) { + return nil, false, nil + } + if manager.se == nil { + return nil, false, errors.New("checkpoint session is not initialized") + } + execCtx := manager.se.GetSessionCtx().GetRestrictedSQLExecutor() + data, found, err := loadPITRItemsFromTable(ctx, execCtx, LogRestorePITRItemsDatabaseName, pitrIngestItemsTableName, clusterID, restoredTS) + if err != nil { + return nil, false, errors.Trace(err) + } + if !found { + return nil, false, nil + } + items, err := unmarshalPITRIngestItems(data) + if err != nil { + return nil, false, errors.Trace(err) + } + return items, true, nil +} + +func (manager *TableMetaManager[K, SV, LV, M]) SavePITRIngestItems( + ctx context.Context, + clusterID uint64, + restoredTS uint64, + items map[int64]map[int64]bool, +) error { + if manager.se == nil { + return errors.New("checkpoint session is not initialized") + } + data, err := marshalPITRIngestItems(items) + if err != nil { + return errors.Trace(err) + } + return errors.Trace(savePITRItemsToTable(ctx, manager.se, LogRestorePITRItemsDatabaseName, pitrIngestItemsTableName, clusterID, restoredTS, data)) +} + +func (manager *TableMetaManager[K, SV, LV, M]) LoadPITRTiFlashItems( + ctx context.Context, + clusterID uint64, + restoredTS uint64, +) (map[int64]model.TiFlashReplicaInfo, bool, error) { + if !pitrItemsTableExists(manager.dom, pitrTiFlashItemsTableName) { + return nil, false, nil + } + if manager.se == nil { + return nil, false, errors.New("checkpoint session is not initialized") + } + execCtx := manager.se.GetSessionCtx().GetRestrictedSQLExecutor() + data, found, err := loadPITRItemsFromTable(ctx, execCtx, LogRestorePITRItemsDatabaseName, pitrTiFlashItemsTableName, clusterID, restoredTS) + if err != nil { + return nil, false, errors.Trace(err) + } + if !found { + return nil, false, nil + } + items, err := unmarshalPITRTiFlashItems(data) + if err != nil { + return nil, false, errors.Trace(err) + } + return items, true, nil +} + +func (manager *TableMetaManager[K, SV, LV, M]) SavePITRTiFlashItems( + ctx context.Context, + clusterID uint64, + restoredTS uint64, + items map[int64]model.TiFlashReplicaInfo, +) error { + if manager.se == nil { + return errors.New("checkpoint session is not initialized") + } + data, err := marshalPITRTiFlashItems(items) + if err != nil { + return errors.Trace(err) + } + return errors.Trace(savePITRItemsToTable(ctx, manager.se, LogRestorePITRItemsDatabaseName, pitrTiFlashItemsTableName, clusterID, restoredTS, data)) +} + +func (manager *StorageMetaManager[K, SV, LV, M]) LoadPITRIngestItems( + ctx context.Context, + clusterID uint64, + restoredTS uint64, +) (map[int64]map[int64]bool, bool, error) { + path := pitrIngestItemsPath(clusterID, restoredTS) + data, found, err := loadPITRItemsFromStorage(ctx, manager.storage, path, "ingest items") + if err != nil { + return nil, false, errors.Trace(err) + } + if !found { + return nil, false, nil + } + items, err := unmarshalPITRIngestItems(data) + if err != nil { + return nil, false, errors.Trace(err) + } + return items, true, nil +} + +func (manager *StorageMetaManager[K, SV, LV, M]) SavePITRIngestItems( + ctx context.Context, + clusterID uint64, + restoredTS uint64, + items 
map[int64]map[int64]bool, +) error { + data, err := marshalPITRIngestItems(items) + if err != nil { + return errors.Trace(err) + } + return errors.Trace(savePITRItemsToStorage(ctx, manager.storage, pitrIngestItemsPath(clusterID, restoredTS), "ingest items", data)) +} + +func (manager *StorageMetaManager[K, SV, LV, M]) LoadPITRTiFlashItems( + ctx context.Context, + clusterID uint64, + restoredTS uint64, +) (map[int64]model.TiFlashReplicaInfo, bool, error) { + path := pitrTiFlashItemsPath(clusterID, restoredTS) + data, found, err := loadPITRItemsFromStorage(ctx, manager.storage, path, "tiflash items") + if err != nil { + return nil, false, errors.Trace(err) + } + if !found { + return nil, false, nil + } + items, err := unmarshalPITRTiFlashItems(data) + if err != nil { + return nil, false, errors.Trace(err) + } + return items, true, nil +} + +func (manager *StorageMetaManager[K, SV, LV, M]) SavePITRTiFlashItems( + ctx context.Context, + clusterID uint64, + restoredTS uint64, + items map[int64]model.TiFlashReplicaInfo, +) error { + data, err := marshalPITRTiFlashItems(items) + if err != nil { + return errors.Trace(err) + } + return errors.Trace(savePITRItemsToStorage(ctx, manager.storage, pitrTiFlashItemsPath(clusterID, restoredTS), "tiflash items", data)) +} diff --git a/br/pkg/checkpoint/storage.go b/br/pkg/checkpoint/storage.go index 59f319057ec6d..bb00eb5887826 100644 --- a/br/pkg/checkpoint/storage.go +++ b/br/pkg/checkpoint/storage.go @@ -93,6 +93,7 @@ const ( func IsCheckpointDB(dbname string) bool { // Check if the database name starts with any of the checkpoint database name prefixes return strings.HasPrefix(dbname, LogRestoreCheckpointDatabaseName) || + strings.HasPrefix(dbname, LogRestorePITRItemsDatabaseName) || strings.HasPrefix(dbname, SnapshotRestoreCheckpointDatabaseName) || strings.HasPrefix(dbname, CustomSSTRestoreCheckpointDatabaseName) } diff --git a/br/pkg/restore/ingestrec/ingest_recorder.go b/br/pkg/restore/ingestrec/ingest_recorder.go index 7419d9b340eac..78f22c5adff65 100644 --- a/br/pkg/restore/ingestrec/ingest_recorder.go +++ b/br/pkg/restore/ingestrec/ingest_recorder.go @@ -248,6 +248,15 @@ func (i *IngestRecorder) IterateForeignKeys(f func(*ForeignKeyRecord) error) err return nil } +// CountItems counts the total ingested indexes across all tables. +func CountItems(items map[int64]map[int64]bool) int { + total := 0 + for _, indexMap := range items { + total += len(indexMap) + } + return total +} + // ExportItems returns a snapshot of ingest items keyed by table ID and index ID. 
func (i *IngestRecorder) ExportItems() map[int64]map[int64]bool { items := make(map[int64]map[int64]bool, len(i.items)) diff --git a/br/pkg/restore/log_client/ingest_items.go b/br/pkg/restore/log_client/ingest_items.go index 56b695c12f08e..16982e3b6ec7e 100644 --- a/br/pkg/restore/log_client/ingest_items.go +++ b/br/pkg/restore/log_client/ingest_items.go @@ -16,144 +16,39 @@ package logclient import ( "context" - "encoding/json" - "fmt" "github.com/pingcap/errors" "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/checkpoint" - "github.com/pingcap/tidb/pkg/objstore/storeapi" + "github.com/pingcap/tidb/br/pkg/restore/ingestrec" "go.uber.org/zap" ) -const pitrIngestItemsDir = "pitr_ingest_items" - -type pitrIngestItems struct { - Items map[int64]map[int64]bool `json:"items"` -} - -func PitrIngestItemsFilename(clusterID, restoredTS uint64) string { - return fmt.Sprintf("%s/pitr_ingest_items.cluster_id:%d.restored_ts:%d", pitrIngestItemsDir, clusterID, restoredTS) -} - -func countIngestItems(items map[int64]map[int64]bool) int { - total := 0 - for _, indexMap := range items { - total += len(indexMap) - } - return total -} - -func (rc *LogClient) loadIngestItemsFromStorage( - ctx context.Context, - storage storeapi.Storage, - restoredTS uint64, -) (map[int64]map[int64]bool, bool, error) { - clusterID := rc.GetClusterID(ctx) - fileName := PitrIngestItemsFilename(clusterID, restoredTS) - exists, err := storage.FileExists(ctx, fileName) - if err != nil { - return nil, false, errors.Annotatef(err, "failed to check ingest items file %s", fileName) - } - if !exists { - return nil, false, nil - } - - raw, err := storage.ReadFile(ctx, fileName) - if err != nil { - return nil, false, errors.Annotatef(err, "failed to read ingest items file %s", fileName) - } - - var payload pitrIngestItems - if err := json.Unmarshal(raw, &payload); err != nil { - return nil, false, errors.Annotatef(err, "failed to unmarshal ingest items file %s", fileName) - } - if payload.Items == nil { - payload.Items = map[int64]map[int64]bool{} - } - log.Info("loaded pitr ingest items", - zap.String("file", fileName), - zap.Int("table-count", len(payload.Items)), - zap.Int("index-count", countIngestItems(payload.Items))) - return payload.Items, true, nil -} - -func (rc *LogClient) saveIngestItemsToStorage( - ctx context.Context, - storage storeapi.Storage, - restoredTS uint64, - items map[int64]map[int64]bool, -) error { - clusterID := rc.GetClusterID(ctx) - fileName := PitrIngestItemsFilename(clusterID, restoredTS) - if items == nil { - items = map[int64]map[int64]bool{} - } - payload := pitrIngestItems{Items: items} - raw, err := json.Marshal(&payload) - if err != nil { - return errors.Trace(err) - } - log.Info("saving pitr ingest items", - zap.String("file", fileName), - zap.Int("table-count", len(items)), - zap.Int("index-count", countIngestItems(items))) - if err := storage.WriteFile(ctx, fileName, raw); err != nil { - return errors.Annotatef(err, "failed to save ingest items file %s", fileName) - } - return nil -} - -func (rc *LogClient) loadIngestItems( +// LoadIngestRecorderItems loads persisted ingest recorder items for a segment. 
+func (rc *LogClient) LoadIngestRecorderItems( ctx context.Context, restoredTS uint64, - logCheckpointMetaManager checkpoint.LogMetaManagerT, + logCheckpointMetaManager checkpoint.SegmentedRestoreStorage, ) (map[int64]map[int64]bool, error) { - if checkpointStorage := rc.tryGetCheckpointStorage(logCheckpointMetaManager); checkpointStorage != nil { - items, found, err := rc.loadIngestItemsFromStorage(ctx, checkpointStorage, restoredTS) - if err != nil { - return nil, errors.Trace(err) - } - if found { - return items, nil - } - } - if rc.storage == nil { - return nil, nil + if logCheckpointMetaManager == nil { + return nil, errors.New("checkpoint meta manager is not initialized") } - items, found, err := rc.loadIngestItemsFromStorage(ctx, rc.storage, restoredTS) + clusterID := rc.GetClusterID(ctx) + items, found, err := logCheckpointMetaManager.LoadPITRIngestItems(ctx, clusterID, restoredTS) if err != nil { return nil, errors.Trace(err) } if !found { return nil, nil } - return items, nil -} - -func (rc *LogClient) saveIngestItems( - ctx context.Context, - restoredTS uint64, - items map[int64]map[int64]bool, - logCheckpointMetaManager checkpoint.LogMetaManagerT, -) error { - storage := rc.tryGetCheckpointStorage(logCheckpointMetaManager) - if storage == nil { - storage = rc.storage - } - if storage == nil { - return errors.New("no storage available for persisting ingest items") + if items == nil { + items = map[int64]map[int64]bool{} } - return errors.Trace(rc.saveIngestItemsToStorage(ctx, storage, restoredTS, items)) -} - -// LoadIngestRecorderItems loads persisted ingest recorder items for a segment. -func (rc *LogClient) LoadIngestRecorderItems( - ctx context.Context, - restoredTS uint64, - logCheckpointMetaManager checkpoint.LogMetaManagerT, -) (map[int64]map[int64]bool, error) { - return rc.loadIngestItems(ctx, restoredTS, logCheckpointMetaManager) + log.Info("loaded pitr ingest items", + zap.Uint64("restored-ts", restoredTS), + zap.Int("table-count", len(items)), + zap.Int("index-count", ingestrec.CountItems(items))) + return items, nil } // SaveIngestRecorderItems persists ingest recorder items for the next segment. 
@@ -161,7 +56,15 @@ func (rc *LogClient) SaveIngestRecorderItems( ctx context.Context, restoredTS uint64, items map[int64]map[int64]bool, - logCheckpointMetaManager checkpoint.LogMetaManagerT, + logCheckpointMetaManager checkpoint.SegmentedRestoreStorage, ) error { - return rc.saveIngestItems(ctx, restoredTS, items, logCheckpointMetaManager) + if logCheckpointMetaManager == nil { + return errors.New("checkpoint meta manager is not initialized") + } + clusterID := rc.GetClusterID(ctx) + log.Info("saving pitr ingest items", + zap.Uint64("restored-ts", restoredTS), + zap.Int("table-count", len(items)), + zap.Int("index-count", ingestrec.CountItems(items))) + return errors.Trace(logCheckpointMetaManager.SavePITRIngestItems(ctx, clusterID, restoredTS, items)) } diff --git a/br/pkg/restore/log_client/tiflash_items.go b/br/pkg/restore/log_client/tiflash_items.go index a5c47b1eb883d..90798b9b8ddef 100644 --- a/br/pkg/restore/log_client/tiflash_items.go +++ b/br/pkg/restore/log_client/tiflash_items.go @@ -16,131 +16,38 @@ package logclient import ( "context" - "encoding/json" - "fmt" "github.com/pingcap/errors" "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/checkpoint" "github.com/pingcap/tidb/pkg/meta/model" - "github.com/pingcap/tidb/pkg/objstore/storeapi" "go.uber.org/zap" ) -const pitrTiFlashItemsDir = "pitr_tiflash_items" - -type pitrTiFlashItems struct { - Items map[int64]model.TiFlashReplicaInfo `json:"items"` -} - -func PitrTiFlashItemsFilename(clusterID, restoredTS uint64) string { - return fmt.Sprintf("%s/pitr_tiflash_items.cluster_id:%d.restored_ts:%d", pitrTiFlashItemsDir, clusterID, restoredTS) -} - -func (rc *LogClient) loadTiFlashItemsFromStorage( - ctx context.Context, - storage storeapi.Storage, - restoredTS uint64, -) (map[int64]model.TiFlashReplicaInfo, bool, error) { - clusterID := rc.GetClusterID(ctx) - fileName := PitrTiFlashItemsFilename(clusterID, restoredTS) - exists, err := storage.FileExists(ctx, fileName) - if err != nil { - return nil, false, errors.Annotatef(err, "failed to check tiflash items file %s", fileName) - } - if !exists { - return nil, false, nil - } - - raw, err := storage.ReadFile(ctx, fileName) - if err != nil { - return nil, false, errors.Annotatef(err, "failed to read tiflash items file %s", fileName) - } - - var payload pitrTiFlashItems - if err := json.Unmarshal(raw, &payload); err != nil { - return nil, false, errors.Annotatef(err, "failed to unmarshal tiflash items file %s", fileName) - } - if payload.Items == nil { - payload.Items = map[int64]model.TiFlashReplicaInfo{} - } - log.Info("loaded pitr tiflash items", zap.String("file", fileName), zap.Int("item-count", len(payload.Items))) - return payload.Items, true, nil -} - -func (rc *LogClient) saveTiFlashItemsToStorage( - ctx context.Context, - storage storeapi.Storage, - restoredTS uint64, - items map[int64]model.TiFlashReplicaInfo, -) error { - clusterID := rc.GetClusterID(ctx) - fileName := PitrTiFlashItemsFilename(clusterID, restoredTS) - if items == nil { - items = map[int64]model.TiFlashReplicaInfo{} - } - payload := pitrTiFlashItems{Items: items} - raw, err := json.Marshal(&payload) - if err != nil { - return errors.Trace(err) - } - log.Info("saving pitr tiflash items", zap.String("file", fileName), zap.Int("item-count", len(items))) - if err := storage.WriteFile(ctx, fileName, raw); err != nil { - return errors.Annotatef(err, "failed to save tiflash items file %s", fileName) - } - return nil -} - -func (rc *LogClient) loadTiFlashItems( +// LoadTiFlashRecorderItems loads persisted 
TiFlash recorder items for a segment. +func (rc *LogClient) LoadTiFlashRecorderItems( ctx context.Context, restoredTS uint64, - logCheckpointMetaManager checkpoint.LogMetaManagerT, + logCheckpointMetaManager checkpoint.SegmentedRestoreStorage, ) (map[int64]model.TiFlashReplicaInfo, error) { - if checkpointStorage := rc.tryGetCheckpointStorage(logCheckpointMetaManager); checkpointStorage != nil { - items, found, err := rc.loadTiFlashItemsFromStorage(ctx, checkpointStorage, restoredTS) - if err != nil { - return nil, errors.Trace(err) - } - if found { - return items, nil - } - } - if rc.storage == nil { - return nil, nil + if logCheckpointMetaManager == nil { + return nil, errors.New("checkpoint meta manager is not initialized") } - items, found, err := rc.loadTiFlashItemsFromStorage(ctx, rc.storage, restoredTS) + clusterID := rc.GetClusterID(ctx) + items, found, err := logCheckpointMetaManager.LoadPITRTiFlashItems(ctx, clusterID, restoredTS) if err != nil { return nil, errors.Trace(err) } if !found { return nil, nil } - return items, nil -} - -func (rc *LogClient) saveTiFlashItems( - ctx context.Context, - restoredTS uint64, - items map[int64]model.TiFlashReplicaInfo, - logCheckpointMetaManager checkpoint.LogMetaManagerT, -) error { - storage := rc.tryGetCheckpointStorage(logCheckpointMetaManager) - if storage == nil { - storage = rc.storage - } - if storage == nil { - return errors.New("no storage available for persisting tiflash items") + if items == nil { + items = map[int64]model.TiFlashReplicaInfo{} } - return errors.Trace(rc.saveTiFlashItemsToStorage(ctx, storage, restoredTS, items)) -} - -// LoadTiFlashRecorderItems loads persisted TiFlash recorder items for a segment. -func (rc *LogClient) LoadTiFlashRecorderItems( - ctx context.Context, - restoredTS uint64, - logCheckpointMetaManager checkpoint.LogMetaManagerT, -) (map[int64]model.TiFlashReplicaInfo, error) { - return rc.loadTiFlashItems(ctx, restoredTS, logCheckpointMetaManager) + log.Info("loaded pitr tiflash items", + zap.Uint64("restored-ts", restoredTS), + zap.Int("item-count", len(items))) + return items, nil } // SaveTiFlashRecorderItems persists TiFlash recorder items for the next segment. 
@@ -148,7 +55,14 @@ func (rc *LogClient) SaveTiFlashRecorderItems( ctx context.Context, restoredTS uint64, items map[int64]model.TiFlashReplicaInfo, - logCheckpointMetaManager checkpoint.LogMetaManagerT, + logCheckpointMetaManager checkpoint.SegmentedRestoreStorage, ) error { - return rc.saveTiFlashItems(ctx, restoredTS, items, logCheckpointMetaManager) + if logCheckpointMetaManager == nil { + return errors.New("checkpoint meta manager is not initialized") + } + clusterID := rc.GetClusterID(ctx) + log.Info("saving pitr tiflash items", + zap.Uint64("restored-ts", restoredTS), + zap.Int("item-count", len(items))) + return errors.Trace(logCheckpointMetaManager.SavePITRTiFlashItems(ctx, clusterID, restoredTS, items)) } diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index 138ff9aef0b31..b63f983843b99 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -1542,6 +1542,10 @@ func restoreStream( } client := cfg.logClient + if !cfg.LastRestore && cfg.logCheckpointMetaManager == nil { + return errors.Annotatef(berrors.ErrInvalidArgument, + "segmented log restore requires checkpoint storage (table or external), enable checkpoint or configure --checkpoint-storage") + } migs, err := client.GetLockedMigrations(ctx) if err != nil { return errors.Trace(err) @@ -2301,14 +2305,6 @@ func loadTiFlashRecorderItemsIfNeeded(ctx context.Context, client *logclient.Log return nil } -func countIngestRecorderItems(items map[int64]map[int64]bool) int { - total := 0 - for _, indexMap := range items { - total += len(indexMap) - } - return total -} - func loadIngestRecorderItemsIfNeeded( ctx context.Context, client *logclient.LogClient, @@ -2334,7 +2330,7 @@ func loadIngestRecorderItemsIfNeeded( log.Info("loaded ingest items for previous segment", zap.Uint64("start-ts", cfg.StartTS), zap.Int("table-count", len(items)), - zap.Int("index-count", countIngestRecorderItems(items))) + zap.Int("index-count", ingestrec.CountItems(items))) return nil } From e13e019de1679ae531a782a41822494f3ea8188d Mon Sep 17 00:00:00 2001 From: hillium Date: Tue, 27 Jan 2026 12:26:48 +0800 Subject: [PATCH 05/18] stash agent docs Signed-off-by: hillium --- .../findings/lifecycle_of_recorders.md | 226 ++++++++++++++++++ __agent_doc/main.md | 53 ++++ __agent_doc/segmented_restore_plan.md | 62 +++++ 3 files changed, 341 insertions(+) create mode 100644 __agent_doc/findings/lifecycle_of_recorders.md create mode 100644 __agent_doc/main.md create mode 100644 __agent_doc/segmented_restore_plan.md diff --git a/__agent_doc/findings/lifecycle_of_recorders.md b/__agent_doc/findings/lifecycle_of_recorders.md new file mode 100644 index 0000000000000..d12b40417b805 --- /dev/null +++ b/__agent_doc/findings/lifecycle_of_recorders.md @@ -0,0 +1,226 @@ +# DDL Special Handling During Log Meta Restore + +This document summarizes **only the DDL-related behaviors that are handled specially** during log/meta restore. All items below are confirmed by code paths in the current repository (no inference beyond what the code checks). + +## 0) Recorder lifecycles (delRangeRecorder / ingestRecorder / TiflashRecorder) + +This section describes **when each recorder is created, populated, and consumed**. 
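+
+The subsections below walk through each recorder in turn. Before that, a rough, self-contained sketch of the segment hand-off this patch series adds for the ingest recorder: persist the snapshot under the segment's restored-ts, reload it by the next segment's start-ts, and apply it only on the final segment. All names below (`recorderStore`, `restoreSegment`) are hypothetical stand-ins rather than the real `LogClient` API.
+
+```go
+package main
+
+import "fmt"
+
+// recorderStore models the per-restored-ts persistence added in this series
+// (SaveIngestRecorderItems / LoadIngestRecorderItems), keyed by restored-ts.
+type recorderStore map[uint64]map[int64]map[int64]bool
+
+func (s recorderStore) save(restoredTS uint64, items map[int64]map[int64]bool) {
+	s[restoredTS] = items
+}
+
+func (s recorderStore) load(startTS uint64) map[int64]map[int64]bool {
+	return s[startTS] // nil when the previous segment left nothing behind
+}
+
+// restoreSegment mirrors the branch in restoreStream: a non-final segment
+// persists its merged snapshot under its restored-ts; the final segment
+// applies (here: just reports) what it inherited plus what it recorded itself.
+func restoreSegment(store recorderStore, startTS, restoredTS uint64, last bool,
+	recorded map[int64]map[int64]bool) {
+	merged := map[int64]map[int64]bool{}
+	for _, src := range []map[int64]map[int64]bool{store.load(startTS), recorded} {
+		for tbl, idx := range src {
+			if merged[tbl] == nil {
+				merged[tbl] = map[int64]bool{}
+			}
+			for id, isPrimary := range idx {
+				// keep IsPrimary once any segment marked the index as primary
+				merged[tbl][id] = merged[tbl][id] || isPrimary
+			}
+		}
+	}
+	if last {
+		fmt.Printf("final segment [%d, %d): apply items for %d tables\n", startTS, restoredTS, len(merged))
+		return
+	}
+	store.save(restoredTS, merged)
+	fmt.Printf("segment [%d, %d): persist items for %d tables\n", startTS, restoredTS, len(merged))
+}
+
+func main() {
+	store := recorderStore{}
+	restoreSegment(store, 100, 200, false, map[int64]map[int64]bool{61: {1: false}})
+	restoreSegment(store, 200, 300, true, map[int64]map[int64]bool{61: {2: true}, 75: {1: false}})
+}
+```
+
+The TiFlash recorder follows the same persist/reload pattern in this series, only with `map[int64]model.TiFlashReplicaInfo` as the value type; the delete-range recorder is not carried across segments and is flushed per restore via `InsertGCRows`.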
+ +### delRangeRecorder (GC delete-range) + +- Created: + - `br/pkg/task/stream.go` → `buildSchemaReplace` + - `stream.NewSchemasReplace(..., recordDeleteRange=client.RecordDeleteRange, ...)` + - `br/pkg/stream/rewrite_meta_rawkv.go` → `NewSchemasReplace` + - `delRangeRecorder: newDelRangeExecWrapper(globalTableIdMap, recordDeleteRange)` + +- Populated: + - `br/pkg/stream/rewrite_meta_rawkv.go` → `SchemasReplace.RewriteMetaKvEntry` + - On Default CF + `utils.IsMetaDDLJobHistoryKey(e.Key)`: decode `model.Job` and call `processIngestIndexAndDeleteRangeFromJob(job)` + - `br/pkg/stream/rewrite_meta_rawkv.go` → `processIngestIndexAndDeleteRangeFromJob` + - `if ddl.JobNeedGC(job)` → `ddl.AddDelRangeJobInternal(..., sr.delRangeRecorder, job)` + - `br/pkg/stream/rewrite_meta_rawkv.go` → `brDelRangeExecWrapper.ConsumeDeleteRange` + - calls `recordDeleteRange(*PreDelRangeQuery)` (wired to `LogClient.RecordDeleteRange`). + +- Buffered / thread-safety: + - `br/pkg/restore/log_client/batch_meta_processor.go` → `RestoreMetaKVProcessor.RestoreAndRewriteMetaKVFiles` + - starts loader: `rp.client.RunGCRowsLoader(ctx)` + - `br/pkg/restore/log_client/client.go`: + - `RecordDeleteRange` pushes into `deleteRangeQueryCh` + - `RunGCRowsLoader` drains `deleteRangeQueryCh` into `rc.deleteRangeQuery` + +- Consumed: + - `br/pkg/task/stream.go` → `restoreStream` (after KV restore) + - `client.InsertGCRows(ctx)` + - `br/pkg/restore/log_client/client.go` → `InsertGCRows` + - closes channel, waits loader, and inserts into `gc_delete_range`. + +### ingestRecorder (ingest index repair) + +- Created: + - `br/pkg/stream/rewrite_meta_rawkv.go` → `NewSchemasReplace` + - `ingestRecorder: ingestrec.New()` + +- Populated: + - `br/pkg/stream/rewrite_meta_rawkv.go` → `SchemasReplace.RewriteMetaKvEntry` + - On Default CF + `mDDLJobHistory`: decode `model.Job` → `processIngestIndexAndDeleteRangeFromJob(job)` + - `br/pkg/stream/rewrite_meta_rawkv.go` → `tryRecordIngestIndex` + - For `ActionMultiSchemaChange`: expands subjobs + - Otherwise: `sr.ingestRecorder.TryAddJob(job, ...)` + - `br/pkg/restore/ingestrec/ingest_recorder.go` → `TryAddJob` + - only records ingest reorg jobs for `ActionAddIndex` / `ActionAddPrimaryKey` / `ActionModifyColumn` (and state constraints). + +- Rewritten (table ID mapping after meta restore): + - `br/pkg/task/stream.go` → `restoreStream` + - `ingestRecorder := schemasReplace.GetIngestRecorder()` + - `rangeFilterFromIngestRecorder(ingestRecorder, rewriteRules)` + - `br/pkg/task/stream.go` → `rangeFilterFromIngestRecorder` + - `ingestRecorder.RewriteTableID(...)` based on `rewriteRules`. + +- Consumed: + - `br/pkg/task/stream.go` → `restoreStream` (after KV restore) + - `client.RepairIngestIndex(ctx, ingestRecorder, cfg.logCheckpointMetaManager, g)` + - `br/pkg/restore/log_client/client.go` → `RepairIngestIndex` + - calls `ingestRecorder.UpdateIndexInfo(..., InfoSchema)` then `Iterate(...)` to generate and execute SQL. + - may load/save generated SQLs via checkpoint meta manager (see `generateRepairIngestIndexSQLs`). + +### TiflashRecorder (TiFlash replica stripping + later restore) + +- Created: + - `br/pkg/task/stream.go` → `RunPointInTimeRestore` + - `cfg.tiflashRecorder = tiflashrec.New()` + +- Populated and applied during meta KV replay: + - Hook wiring: + - `br/pkg/task/stream.go` → `buildSchemaReplace` + - sets `schemasReplace.AfterTableRewrittenFn = func(deleted bool, tableInfo *model.TableInfo) { ... 
}` + - Hook invocation: + - `br/pkg/stream/rewrite_meta_rawkv.go` → `rewriteTableInfo` + - calls `sr.AfterTableRewrittenFn(false, &tableInfo)` on normal rewrite + - `br/pkg/stream/rewrite_meta_rawkv.go` → `rewriteEntryForTable` + - on deletion calls `sr.AfterTableRewrittenFn(true, &model.TableInfo{ID: newTableID})` + - Hook behavior: + - `br/pkg/task/stream.go` → `buildSchemaReplace` + - records current `tableInfo.TiFlashReplica` into `cfg.tiflashRecorder` (or `DelTable` when deleted / nil) + - **removes** replica info from restored meta: `tableInfo.TiFlashReplica = nil` + +- Checkpoint persistence: + - Save: + - `br/pkg/restore/log_client/client.go` → `LoadOrCreateCheckpointMetadataForLogRestore` + - `CheckpointMetadataForLogRestore.TiFlashItems = tiflashRecorder.GetItems()` + - Load: + - `br/pkg/task/stream.go` → `RunPointInTimeRestore` + - when skipping full restore due to checkpoint: `cfg.tiflashRecorder.Load(taskInfo.CheckpointInfo.Metadata.TiFlashItems)` + +- Consumed: + - SQL generation: + - `br/pkg/restore/tiflashrec/tiflash_recorder.go` + - `GenerateAlterTableDDLs(InfoSchema)` generates `ALTER TABLE ... SET TIFLASH REPLICA ...` + - Execution: + - `br/pkg/task/stream.go` → `restoreStream` + - `sqls := cfg.tiflashRecorder.GenerateAlterTableDDLs(mgr.GetDomain().InfoSchema())` + - `client.ResetTiflashReplicas(ctx, sqls, g)` + +## 1) DDL job history extraction (mDDLJobHistory) +**What is special:** DDL job history entries are *not restored as meta KV*; instead they are decoded and used for two special cases (ingest index repair and delete-range GC). + +**Path:** +- `br/pkg/stream/rewrite_meta_rawkv.go` + - `SchemasReplace.RewriteMetaKvEntry` + - Checks `utils.IsMetaDDLJobHistoryKey(e.Key)` on **Default CF** + - Decodes `model.Job` and routes to `processIngestIndexAndDeleteRangeFromJob` + +**Behavior:** +- DDL job history is parsed and **not written back** as meta KV (`return nil, ...`). + +## 2) Ingest index DDL jobs (repair by replay) +**What is special:** Ingest-mode index builds are *not included in log backup KV*, so they are recorded and later repaired via SQL. + +**Paths:** +- Capture from DDL job history: + - `br/pkg/stream/rewrite_meta_rawkv.go` + - `processIngestIndexAndDeleteRangeFromJob` + - `tryRecordIngestIndex` + - For `ActionMultiSchemaChange`: expands to sub-jobs + - Otherwise: `ingestrec.IngestRecorder.TryAddJob` +- Recording logic: + - `br/pkg/restore/ingestrec/ingest_recorder.go` + - `TryAddJob` + - **Only records** when all conditions are true: + - `job.ReorgMeta.ReorgTp == model.ReorgTypeIngest` + - `job.Type` is **one of** `ActionAddIndex`, `ActionAddPrimaryKey`, `ActionModifyColumn` + - Job is synced (or sub-job done) +- Repair execution: + - `br/pkg/restore/log_client/client.go` + - `RepairIngestIndex` + - `generateRepairIngestIndexSQLs` + +**Behavior:** +- Only the job types above are recorded for ingest repair. +- The repair uses **latest InfoSchema** to build ADD INDEX/PRIMARY KEY SQL. + +## 3) DDL jobs that require GC delete-range +**What is special:** For DDL jobs where TiDB normally relies on GC to clean ranges, the delete-range is recorded and executed explicitly after restore. 
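To make the condition concrete, here is a tiny routing sketch; every type in it is a local placeholder (standing in for `model.Job`, `ddl.JobNeedGC`, and the delete-range wrapper named in the paths below), so only the control flow is meant to match:

```go
package sketch

// historyJob and delRangeRecorder are local placeholders standing in for
// model.Job and the BR delete-range wrapper; only the control flow matters.
type historyJob struct{}

// jobNeedGC stands in for ddl.JobNeedGC: the ddl package alone decides
// whether a finished job leaves ranges behind for GC.
func jobNeedGC(*historyJob) bool { return true }

type delRangeRecorder interface {
	// Record the delete-range SQL for the job instead of executing it.
	recordDelRange(*historyJob) error
}

// routeHistoryJob mirrors the decision made while replaying DDL job history:
// jobs that would normally be cleaned up by GC get their delete-range
// captured, to be inserted into gc_delete_range after the restore.
func routeHistoryJob(j *historyJob, rec delRangeRecorder) error {
	if !jobNeedGC(j) {
		return nil // nothing left behind for GC, so nothing to record
	}
	return rec.recordDelRange(j)
}
```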
+ +**Paths:** +- Capture from DDL job history: + - `br/pkg/stream/rewrite_meta_rawkv.go` + - `processIngestIndexAndDeleteRangeFromJob` + - `if ddl.JobNeedGC(job)` → `ddl.AddDelRangeJobInternal(..., brDelRangeExecWrapper, job)` +- Delete-range recording: + - `br/pkg/stream/rewrite_meta_rawkv.go` + - `brDelRangeExecWrapper` (captures SQL + params) +- Execution after restore: + - `br/pkg/restore/log_client/client.go` + - `RunGCRowsLoader` / `InsertGCRows` + +**Behavior:** +- **Only jobs where** `ddl.JobNeedGC(job)` is true are handled. +- The code does **not** list job types explicitly; the DDL package decides. + +## 4) Table deletion / recreation tracking (DDL effects) +**What is special:** Deleted tables are tracked to refresh metadata in a dependency-safe order, and a re-created table is removed from the delete list. + +**Paths:** +- `br/pkg/stream/rewrite_meta_rawkv.go` + - `rewriteEntryForTable` + - When write CF indicates deletion: add to `deletedTables` + - When a new table meta is written after deletion: remove from `deletedTables` + - Comment references **RENAME TABLE** and **EXCHANGE PARTITION** sequences +- `br/pkg/restore/log_client/client.go` + - `RefreshMetaForTables` uses `deletedTables` to refresh meta in order + +**Behavior:** +- Delete + re-create sequences (e.g., rename/exchange partition) are handled to avoid stale refresh. + +## 5) DDL/meta filtering to only restore DB + DDL history +**What is special:** During meta KV restore, only `mDB` and `mDDLJobHistory` keys are considered; other meta keys are skipped. + +**Path:** +- `br/pkg/restore/log_client/log_file_manager.go` + - `ReadFilteredEntriesFromFiles` + - `if !utils.IsDBOrDDLJobHistoryKey(txnEntry.Key) { continue }` + +**Behavior:** +- This limits meta restore scope to database info and DDL job history. + +## 6) TiFlash replica (ALTER TABLE ... SET TIFLASH REPLICA) + +**What is special:** During PiTR, TiFlash replica config is **intentionally stripped out of restored table meta** and restored later via SQL. + +This impacts any historical/meta changes whose effect is persisted into `TableInfo.TiFlashReplica`, including the DDL `ALTER TABLE ... SET TIFLASH REPLICA ...`. + +**Paths:** + +- Strip replica info while replaying meta KV (and record it): + - `br/pkg/task/stream.go` → `buildSchemaReplace` + - assigns `SchemasReplace.AfterTableRewrittenFn` + - in callback: + - updates `cfg.tiflashRecorder` via `AddTable/DelTable` + - then `tableInfo.TiFlashReplica = nil` + - `br/pkg/stream/rewrite_meta_rawkv.go` → `rewriteTableInfo` + - calls `AfterTableRewrittenFn(false, &tableInfo)` during table meta rewrite + +- Persist to checkpoint (so retries keep the same intended replica config): + - `br/pkg/restore/log_client/client.go` → `LoadOrCreateCheckpointMetadataForLogRestore` + - saves `CheckpointMetadataForLogRestore.TiFlashItems` + - `br/pkg/task/stream.go` → `RunPointInTimeRestore` + - loads `cfg.tiflashRecorder.Load(...TiFlashItems...)` when resuming from checkpoint + +- Restore replica config after PiTR finishes: + - `br/pkg/task/stream.go` → `restoreStream` + - `cfg.tiflashRecorder.GenerateAlterTableDDLs(InfoSchema)` + - `client.ResetTiflashReplicas(ctx, sqls, g)` + - `br/pkg/restore/log_client/client.go` → `ResetTiflashReplicas` + +--- + +## Summary (no inference) +From the current code paths, DDL-related special handling during log/meta restore is limited to: +1) **Ingest index repair** (specific ingest reorg DDL types only). +2) **Delete-range GC** for DDL jobs where `ddl.JobNeedGC(job)` is true. 
+3) **Table delete/recreate tracking** affecting meta refresh order (rename/exchange patterns).
+4) **Meta filtering** to `mDB` + `mDDLJobHistory` keys only.
+5) **TiFlash replica stripping + later restore** (restore-time `ALTER TABLE ... SET TIFLASH REPLICA ...`).
+
+No other DDL job types are explicitly enumerated or handled beyond these code paths.
diff --git a/__agent_doc/main.md b/__agent_doc/main.md
new file mode 100644
index 0000000000000..1766897150257
--- /dev/null
+++ b/__agent_doc/main.md
@@ -0,0 +1,53 @@
+# Your epic
+
+You need to implement a feature -- segmented restore.
+
+Now, `br restore point` requires the user to finish a point-in-time restore in one shot.
+
+We want `br restore point` to be "segmentable", like:
+
+```
+br restore point --pd 127.0.0.1:2379 -s local:///Volumes/eXternal/Cache/tmp/20260122_003925/incr --restored-ts 463736294698909699 --full-backup-storage local:///Volumes/eXternal/Cache/tmp/20260122_003925/full --last=false
+br restore point --pd 127.0.0.1:2379 -s local:///Volumes/eXternal/Cache/tmp/20260122_003925/incr --restored-ts 463736295708426243 --start-ts 463736294698909699 --last=false
+br restore point --pd 127.0.0.1:2379 -s local:///Volumes/eXternal/Cache/tmp/20260122_003925/incr --restored-ts ... --start-ts 463736295708426243 --last=true
+```
+
+But for now this is impossible. Some problems are already known: TiFlash replicas may be added back too early, indexes may not be recreated properly... and more are unknown unknowns.
+
+This is an epic to make it work. Be patient: you cannot fix everything in one edition. You have git access; commit or roll back your work as needed. `AGENT.md` may tell you to enable failpoints before running tests; don't follow that, since we are running "integration" tests. You may also ignore all `bazel`-related requirements. To repeat: don't enable failpoints, and don't try to build with bazel or `make bazel_prepare`.
+
+You may find that something is completely out of your scope; say, the test environment is broken. In that scenario, **don't** try to hack around it; just stop and ask for help. Again, don't try to hack.
+
+If you have made some progress, record it in `__agent_doc/` for those who come later.
+
+Suggestions:
+- You may not be able to solve the problem in one "sprint". Always record your plan in `__agent_doc` before you start, for your next run.
+
+## To build BR
+
+```
+make build_br
+```
+
+## To run our test cases
+
+```
+bash /Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh
+```
+
+This command can only be run without the sandbox. Request access to it before you start doing anything.
+
+This command runs for minutes -- again, this command runs for minutes. Don't set too short a timeout.
+
+Once this test script passes, our epic reaches its happy ending.
+
+Reading its content and recording its behavior is a good starting point.
+
+## After Happy Ending (1)
+
+All tests pass. It is time to tidy up our codebase. Inspect your recent commits and refactor your modifications following the DRY principle.
+
+## Integrated Test (2026-01-23)
+- The segmented PiTR external test is now integrated as `br/tests/br_pitr_segmented_restore/run.sh`.
+- The workload source lives under `br/tests/seg_pitr_workload`; the test builds it via `go build`.
+- Run via `TEST_NAME=br_pitr_segmented_restore br/tests/run.sh` or include it in `br/tests/run_group_br_tests.sh` (G07).
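The segmented command flow this document describes is what the workload framework ported later in this series (patch 06, `ported test workload framework`) exercises end to end. A minimal sketch of how a test might drive that framework around the segments, assuming the `workload` package under `pkg/testkit/brhelper/workload`; the helper name, tick counts, and the exact point where the `br` invocations happen are illustrative only:

```go
package segrestore_test // hypothetical test package

import (
	"context"
	"database/sql"
	"testing"
	"time"

	"github.com/pingcap/tidb/pkg/testkit/brhelper/workload"
)

// runSegmentedWorkload is a sketch: Prepare before the backups start, Run
// while the log backup captures changes, and Verify only after the final
// (--last=true) restore. db and store construction are left to the harness.
func runSegmentedWorkload(t *testing.T, ctx context.Context, db *sql.DB, store workload.StateStore) {
	runner, err := workload.NewRunner(db, store, workload.AllCases()...)
	if err != nil {
		t.Fatal(err)
	}
	if _, err := runner.Prepare(ctx); err != nil {
		t.Fatal(err)
	}
	// Generate DDL/DML; the per-case state is persisted so that verification
	// can happen in another process after the restores.
	if _, err := runner.Run(ctx, workload.RunConfig{
		TickCount:    300,
		TickInterval: 50 * time.Millisecond,
		PersistEvery: 50,
	}); err != nil {
		t.Fatal(err)
	}

	// ... take the full backup and run the segmented `br restore point`
	// invocations against the target cluster ...

	// In the real test Verify runs against the restored cluster (a runner
	// built on a *sql.DB pointing at it); the call shape is the same.
	if err := runner.Verify(ctx); err != nil {
		t.Fatal(err)
	}
}
```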
diff --git a/__agent_doc/segmented_restore_plan.md b/__agent_doc/segmented_restore_plan.md
new file mode 100644
index 0000000000000..ae9fb0811d403
--- /dev/null
+++ b/__agent_doc/segmented_restore_plan.md
@@ -0,0 +1,62 @@
+# Segmented Restore Plan (WIP)
+
+## Plan (2026-01-23, current run)
+1) Read `/Volumes/eXternal/Developer/seg-pitr-workload` to understand the external test structure and the expected integration points.
+2) Integrate the external test into this repo (tests, scripts, or CI hooks) with minimal duplication.
+3) Build BR and run the integrated test (requires unsandboxed access) to verify behavior.
+4) Record outcomes, gaps, and follow-ups here.
+
+## Plan (2026-01-22, current run)
+1) Run `/Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh` (requires unsandboxed access) to confirm the current failure state.
+2) If it fails, trace the failing path in BR restore (especially segmented point restore) and implement the minimal fix.
+3) Re-run the script to verify, then record outcomes and follow-up risks here.
+
+## Plan (2025-xx-xx, current sprint)
+1) Read `/Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh` to understand the failing step and the expected output.
+2) Run the script (requires unsandboxed access) to reproduce the current failure and capture the exact error text.
+3) Trace where the `AddIndex` checksum / Total_kvs mismatch is generated and ensure the error message is formatted as `Error: AddIndex: Total_kvs mismatch: ...`.
+4) Fix any segmented-restore gaps discovered along the way and update this doc with the results.
+
+## Current Status
+- Ran `/Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh` (first run).
+- Failure in the second segment restore:
+  - Error: `no base id map found from saved id or last restored PiTR`
+  - Stack: `br/pkg/restore/log_client/client.go:1068` via `GetBaseIDMapAndMerge`.
+
+## Hypothesis
+- `tidb_pitr_id_map` has a `restore_id` column; id maps are saved with the restore_id of the first segment.
+- Subsequent segments create a new restore_id, so loading by restore_id + restored_ts fails.
+
+## Plan
+1) Update id-map loading to fall back to the latest `restore_id` for a given `restored_ts` when loading base maps (start-ts path).
+2) Re-run `/Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh` to find the next failure.
+3) Iterate on the remaining segmented-restore gaps (e.g., tiflash replica handling, ingest index repair), recording findings here.
+
+## Progress
+- Implemented the id-map fallback for previous segments in `br/pkg/restore/log_client/id_map.go`.
+- Rebuilt BR (`make build_br`).
+- Re-ran `/Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh`: it now gets past the id-map failure.
+  - It now fails in the newly added TiFlash cases.
+  - Error: exist table(s) have tiflash replica, please remove it before restore
+- Added TiFlash recorder persistence across segments; replicas are only reset on the final segment.
+- `--last` is now treated as false when `--restored-ts` is set to a non-max TS and `--last` wasn't explicitly specified.
+- Rebuilt BR and re-ran `/Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh`: **PASS**.
+## Progress (2026-01-22)
+- Reproduced failure: `AddIndex: Total_kvs mismatch` after segmented restore.
+- Root cause: running ingest index repair in non-final segments writes new MVCC meta versions that block later log restore, leaving extra indexes.
+- Fix:
+  - Persist ingest recorder items across segments.
+  - Skip ingest index repair until `--last=true`, then repair once.
+- Rebuilt BR and re-ran `/Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh`: **PASS**. +## Progress (2026-01-xx) +- DRY cleanup: refactored PiTR ingest/tiflash item persistence to share JSON storage helpers and checkpoint fallback logic. +- Deduplicated ingest item counting by adding `ingestrec.CountItems` and reusing it in log client + stream restore logging. +## Progress (2026-02-xx) +- Unify PiTR ingest/tiflash item persistence with checkpoint storage type (table vs external). +- Added checkpoint-side PiTR item store and moved load/save logic off log backup storage. +- Guard segmented restore: non-final segments now require checkpoint storage. + +## Progress (2026-01-23) +- Integrated the segmented PiTR workload into `br/tests/seg_pitr_workload` and added a new br test `br/tests/br_pitr_segmented_restore/run.sh`. +- Switched workload state storage to a JSON file (no sqlite dependency) and updated CLI flags accordingly. +- Added the new test to `br/tests/run_group_br_tests.sh` (G07). From 7718738ba18e66321e3fb581cc8ad33c6182e704 Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Tue, 27 Jan 2026 06:40:14 +0000 Subject: [PATCH 06/18] ported test workload framework Signed-off-by: Juncen Yu --- pkg/testkit/brhelper/workload/add_index.go | 326 +++++++++++++++ pkg/testkit/brhelper/workload/context.go | 68 +++ .../brhelper/workload/modify_tiflash.go | 248 +++++++++++ pkg/testkit/brhelper/workload/nexus_common.go | 137 ++++++ pkg/testkit/brhelper/workload/nexus_ddl.go | 265 ++++++++++++ .../workload/nexus_ddl_destructive.go | 196 +++++++++ pkg/testkit/brhelper/workload/registry.go | 25 ++ pkg/testkit/brhelper/workload/runner.go | 394 ++++++++++++++++++ pkg/testkit/brhelper/workload/state_store.go | 84 ++++ pkg/testkit/brhelper/workload/summary.go | 75 ++++ pkg/testkit/brhelper/workload/util.go | 178 ++++++++ .../brietest/segmented_restore_test.go | 216 ++++++++++ 12 files changed, 2212 insertions(+) create mode 100644 pkg/testkit/brhelper/workload/add_index.go create mode 100644 pkg/testkit/brhelper/workload/context.go create mode 100644 pkg/testkit/brhelper/workload/modify_tiflash.go create mode 100644 pkg/testkit/brhelper/workload/nexus_common.go create mode 100644 pkg/testkit/brhelper/workload/nexus_ddl.go create mode 100644 pkg/testkit/brhelper/workload/nexus_ddl_destructive.go create mode 100644 pkg/testkit/brhelper/workload/registry.go create mode 100644 pkg/testkit/brhelper/workload/runner.go create mode 100644 pkg/testkit/brhelper/workload/state_store.go create mode 100644 pkg/testkit/brhelper/workload/summary.go create mode 100644 pkg/testkit/brhelper/workload/util.go create mode 100644 tests/realtikvtest/brietest/segmented_restore_test.go diff --git a/pkg/testkit/brhelper/workload/add_index.go b/pkg/testkit/brhelper/workload/add_index.go new file mode 100644 index 0000000000000..6d1347a8930de --- /dev/null +++ b/pkg/testkit/brhelper/workload/add_index.go @@ -0,0 +1,326 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package workload + +import ( + "encoding/json" + "fmt" + "strings" +) + +type AddIndexCase struct { + Suffix string `json:"suffix"` + N int `json:"n"` + NR int `json:"nr"` + + indexesAdded []addIndexSpec + indexesDropped []addIndexSpec +} + +type addIndexSpec struct { + Name string `json:"name"` + Columns []string `json:"columns"` +} + +type addIndexState struct { + Suffix string `json:"suffix"` + DB string `json:"db"` + Table string `json:"table"` + N int `json:"n"` + NR int `json:"nr"` + + Inserted int `json:"inserted"` + Ticked int `json:"ticked"` + + NextIndexID int `json:"next_index_id"` + + Indexes []addIndexSpec `json:"indexes"` + + Checksum TableChecksum `json:"checksum"` + LogDone bool `json:"log_done"` +} + +type addIndexSummary struct { + DB string `json:"db"` + Table string `json:"table"` + N int `json:"n"` + NR int `json:"nr"` + Ticked int `json:"ticked"` + + IndexesAdded []addIndexSpec `json:"indexes_added,omitempty"` + IndexesDropped []addIndexSpec `json:"indexes_dropped,omitempty"` +} + +func (s addIndexSummary) SummaryTable() string { + var b strings.Builder + _, _ = fmt.Fprintf(&b, "db=%s table=%s n=%d nr=%d ticked=%d", s.DB, s.Table, s.N, s.NR, s.Ticked) + if len(s.IndexesAdded) > 0 { + b.WriteString("\nindexes added:") + for _, idx := range s.IndexesAdded { + b.WriteString("\n - ") + b.WriteString(idx.Name) + if len(idx.Columns) > 0 { + b.WriteString("(" + joinWithComma(idx.Columns) + ")") + } + } + } + if len(s.IndexesDropped) > 0 { + b.WriteString("\nindexes dropped:") + for _, idx := range s.IndexesDropped { + b.WriteString("\n - ") + b.WriteString(idx.Name) + if len(idx.Columns) > 0 { + b.WriteString("(" + joinWithComma(idx.Columns) + ")") + } + } + } + return b.String() +} + +func (c *AddIndexCase) Name() string { return "AddIndex" } + +func (c *AddIndexCase) Prepare(ctx Context) (json.RawMessage, error) { + c.indexesAdded = nil + c.indexesDropped = nil + + suffix := c.Suffix + if suffix == "" { + var err error + suffix, err = RandSuffix() + if err != nil { + return nil, err + } + } + n := c.N + if n <= 0 { + n = 100 + } + nr := c.NR + if nr <= 0 { + nr = 150 + } + st := addIndexState{ + Suffix: suffix, + DB: fmt.Sprintf("test_add_index_%s", suffix), + Table: "t1", + N: n, + NR: nr, + NextIndexID: 0, + } + if err := ExecAll(ctx, ctx.DB, []string{ + "CREATE DATABASE IF NOT EXISTS " + QIdent(st.DB), + "CREATE TABLE IF NOT EXISTS " + QTable(st.DB, st.Table) + " (" + + "id BIGINT PRIMARY KEY AUTO_INCREMENT," + + "a BIGINT," + + "b BIGINT," + + "c BIGINT," + + "d BIGINT," + + "e BIGINT" + + ")", + }); err != nil { + return nil, err + } + + ctx.SetSummary(addIndexSummary{ + DB: st.DB, + Table: st.Table, + N: st.N, + NR: st.NR, + }) + return json.Marshal(st) +} + +func (c *AddIndexCase) Tick(ctx TickContext, raw json.RawMessage) error { + var st addIndexState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + if st.N <= 0 { + st.N = 100 + } + if st.NR <= 0 { + st.NR = 150 + } + if st.NextIndexID < len(st.Indexes) { + st.NextIndexID = len(st.Indexes) + } + + // Insert data each tick. + v := int64(st.Inserted) + if _, err := ctx.DB.ExecContext(ctx, "INSERT INTO "+QTable(st.DB, st.Table)+" (a,b,c,d,e) VALUES (?,?,?,?,?)", + v, v*7+1, v*11+2, v*13+3, v*17+4, + ); err != nil { + return err + } + st.Inserted++ + + tickNo := st.Ticked + 1 + + // Every N ticks, add a new index on 1~3 columns. 
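+	// The choice below is deterministic: idx_<k> spans 1+(k%3) of the columns
+	// {a,b,c,d,e}, starting at offset k%5 and wrapping around, so replaying a
+	// tick from persisted state recreates exactly the same index definition.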
+ if st.N > 0 && tickNo%st.N == 0 { + allCols := []string{"a", "b", "c", "d", "e"} + idxID := st.NextIndexID + idxName := fmt.Sprintf("idx_%d", idxID) + + colN := 1 + (idxID % 3) + start := idxID % len(allCols) + cols := make([]string, 0, colN) + for i := 0; i < colN; i++ { + cols = append(cols, allCols[(start+i)%len(allCols)]) + } + + exists, err := IndexExists(ctx, ctx.DB, st.DB, st.Table, idxName) + if err != nil { + return err + } + if !exists { + colSQL := make([]string, 0, len(cols)) + for _, c := range cols { + colSQL = append(colSQL, QIdent(c)) + } + stmt := "CREATE INDEX " + QIdent(idxName) + " ON " + QTable(st.DB, st.Table) + " (" + joinWithComma(colSQL) + ")" + if _, err := ctx.DB.ExecContext(ctx, stmt); err != nil { + return err + } + } + + spec := addIndexSpec{Name: idxName, Columns: cols} + if !hasAddIndexSpec(st.Indexes, idxName) { + st.Indexes = append(st.Indexes, spec) + } + if !hasAddIndexSpec(c.indexesAdded, idxName) { + c.indexesAdded = append(c.indexesAdded, spec) + } + st.NextIndexID++ + } + + // Every NR ticks, randomly drop an index. + if st.NR > 0 && tickNo%st.NR == 0 && len(st.Indexes) > 0 { + idx := pickIndex(st.Ticked, len(st.Indexes)) + dropSpec := st.Indexes[idx] + + exists, err := IndexExists(ctx, ctx.DB, st.DB, st.Table, dropSpec.Name) + if err != nil { + return err + } + if exists { + stmt := "DROP INDEX " + QIdent(dropSpec.Name) + " ON " + QTable(st.DB, st.Table) + if _, err := ctx.DB.ExecContext(ctx, stmt); err != nil { + return err + } + } + c.indexesDropped = append(c.indexesDropped, dropSpec) + st.Indexes = append(st.Indexes[:idx], st.Indexes[idx+1:]...) + } + + st.Ticked++ + st.LogDone = true + + updated, err := json.Marshal(st) + if err != nil { + return err + } + ctx.UpdateState(updated) + return nil +} + +func (c *AddIndexCase) Exit(ctx ExitContext, raw json.RawMessage) error { + var st addIndexState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + + checksum, err := AdminChecksumTable(ctx, ctx.DB, st.DB, st.Table) + if err != nil { + return err + } + st.Checksum = checksum + st.LogDone = true + + ctx.SetSummary(addIndexSummary{ + DB: st.DB, + Table: st.Table, + N: st.N, + NR: st.NR, + Ticked: st.Ticked, + IndexesAdded: c.indexesAdded, + IndexesDropped: c.indexesDropped, + }) + + updated, err := json.Marshal(st) + if err != nil { + return err + } + ctx.UpdateState(updated) + return nil +} + +func (c *AddIndexCase) Verify(ctx Context, raw json.RawMessage) error { + var st addIndexState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + if err := Require(st.LogDone, "AddIndex: log not executed"); err != nil { + return err + } + if err := Require(st.Checksum.TotalKvs != "", "AddIndex: checksum not recorded; run Exit first"); err != nil { + return err + } + + for _, idx := range st.Indexes { + ok, err := IndexExists(ctx, ctx.DB, st.DB, st.Table, idx.Name) + if err != nil { + return err + } + if err := Require(ok, "AddIndex: index %q not found", idx.Name); err != nil { + return err + } + } + + checksum, err := AdminChecksumTable(ctx, ctx.DB, st.DB, st.Table) + if err != nil { + return err + } + if err := Require(checksum.TotalKvs == st.Checksum.TotalKvs, "AddIndex: Total_kvs mismatch: got %q want %q", checksum.TotalKvs, st.Checksum.TotalKvs); err != nil { + return err + } + if st.Checksum.TotalBytes != "" { + return Require(checksum.TotalBytes == st.Checksum.TotalBytes, "AddIndex: Total_bytes mismatch: got %q want %q", checksum.TotalBytes, st.Checksum.TotalBytes) + } + return nil +} + +func hasAddIndexSpec(indexes 
[]addIndexSpec, name string) bool { + for _, idx := range indexes { + if idx.Name == name { + return true + } + } + return false +} + +func joinWithComma(parts []string) string { + switch len(parts) { + case 0: + return "" + case 1: + return parts[0] + } + out := parts[0] + for _, p := range parts[1:] { + out += "," + p + } + return out +} diff --git a/pkg/testkit/brhelper/workload/context.go b/pkg/testkit/brhelper/workload/context.go new file mode 100644 index 0000000000000..67eac04ae1445 --- /dev/null +++ b/pkg/testkit/brhelper/workload/context.go @@ -0,0 +1,68 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package workload + +import ( + "context" + "database/sql" + "encoding/json" +) + +type Context struct { + context.Context + DB *sql.DB + + CaseName string + Summary *Summary +} + +func (c Context) SetSummary(summary any) { + if c.Summary == nil || c.CaseName == "" { + return + } + c.Summary.Set(c.CaseName, summary) +} + +type TickContext struct { + Context + + UpdateStateFn func(json.RawMessage) +} + +func (c TickContext) UpdateState(state json.RawMessage) { + if c.UpdateStateFn != nil { + c.UpdateStateFn(state) + } +} + +type ExitContext struct { + Context + + UpdateStateFn func(json.RawMessage) +} + +func (c ExitContext) UpdateState(state json.RawMessage) { + if c.UpdateStateFn != nil { + c.UpdateStateFn(state) + } +} + +type Case interface { + Name() string + Prepare(Context) (json.RawMessage, error) + Tick(TickContext, json.RawMessage) error + Exit(ExitContext, json.RawMessage) error + Verify(Context, json.RawMessage) error +} diff --git a/pkg/testkit/brhelper/workload/modify_tiflash.go b/pkg/testkit/brhelper/workload/modify_tiflash.go new file mode 100644 index 0000000000000..b0bb76e8b60ad --- /dev/null +++ b/pkg/testkit/brhelper/workload/modify_tiflash.go @@ -0,0 +1,248 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package workload + +import ( + "encoding/json" + "fmt" + "strings" +) + +type ModifyTiFlashCase struct { + Suffix string `json:"suffix"` + N int `json:"n"` + NAP int `json:"nap"` + + replicaHistory []replicaHistoryEntry +} + +type modifyTiFlashState struct { + Suffix string `json:"suffix"` + DB string `json:"db"` + Table string `json:"table"` + N int `json:"n"` + NAP int `json:"nap"` + + Ticked int `json:"ticked"` + Inserted int `json:"inserted"` + + Replica int `json:"replica"` + + Checksum TableChecksum `json:"checksum"` + LogDone bool `json:"log_done"` +} + +type replicaHistoryEntry struct { + Tick int `json:"tick"` + Replica int `json:"replica"` +} + +type modifyTiFlashSummary struct { + DB string `json:"db"` + Table string `json:"table"` + N int `json:"n"` + NAP int `json:"nap"` + Ticked int `json:"ticked"` + ReplicaHistory []replicaHistoryEntry `json:"replica_history,omitempty"` +} + +func (s modifyTiFlashSummary) SummaryTable() string { + var b strings.Builder + _, _ = fmt.Fprintf(&b, "db=%s table=%s n=%d nap=%d ticked=%d", s.DB, s.Table, s.N, s.NAP, s.Ticked) + if len(s.ReplicaHistory) > 0 { + b.WriteString("\nreplica history:") + for _, e := range s.ReplicaHistory { + _, _ = fmt.Fprintf(&b, "\n - [%d] %d", e.Tick, e.Replica) + } + } + return b.String() +} + +func (c *ModifyTiFlashCase) Name() string { return "ModifyTiFlash" } + +func (c *ModifyTiFlashCase) Prepare(ctx Context) (json.RawMessage, error) { + c.replicaHistory = nil + + suffix := c.Suffix + if suffix == "" { + var err error + suffix, err = RandSuffix() + if err != nil { + return nil, err + } + } + n := c.N + if n <= 0 { + n = 100 + } + nap := c.NAP + if nap <= 0 { + nap = 1 + } + st := modifyTiFlashState{ + Suffix: suffix, + DB: fmt.Sprintf("test_modify_tiflash_%s", suffix), + Table: "t1", + N: n, + NAP: nap, + Replica: 0, + } + c.replicaHistory = []replicaHistoryEntry{{Tick: 0, Replica: 0}} + if err := ExecAll(ctx, ctx.DB, []string{ + "CREATE DATABASE IF NOT EXISTS " + QIdent(st.DB), + "CREATE TABLE IF NOT EXISTS " + QTable(st.DB, st.Table) + " (" + + "id BIGINT PRIMARY KEY AUTO_INCREMENT," + + "a BIGINT," + + "b BIGINT," + + "c BIGINT" + + ")", + "ALTER TABLE " + QTable(st.DB, st.Table) + " SET TIFLASH REPLICA 0", + }); err != nil { + return nil, err + } + + ctx.SetSummary(modifyTiFlashSummary{ + DB: st.DB, + Table: st.Table, + N: st.N, + NAP: st.NAP, + ReplicaHistory: c.replicaHistory, + }) + return json.Marshal(st) +} + +func (c *ModifyTiFlashCase) Tick(ctx TickContext, raw json.RawMessage) error { + var st modifyTiFlashState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + if st.N <= 0 { + st.N = 100 + } + if st.NAP <= 0 { + st.NAP = 2 + } + if len(c.replicaHistory) == 0 { + c.replicaHistory = []replicaHistoryEntry{{Tick: st.Ticked, Replica: st.Replica}} + } + + tickNo := st.Ticked + 1 + + if _, err := ctx.DB.ExecContext(ctx, "INSERT INTO "+QTable(st.DB, st.Table)+" (a,b,c) VALUES (?,?,?)", + int64(st.Inserted), int64(st.Inserted*7+1), int64(st.Inserted*11+2), + ); err != nil { + return err + } + st.Inserted++ + + if st.N > 0 && tickNo%st.N == 0 { + max := st.NAP + if max > 0 { + next := tickNo % (max + 1) + if next == st.Replica { + next = (next + 1) % (max + 1) + } + stmt := fmt.Sprintf("ALTER TABLE %s SET TIFLASH REPLICA %d", QTable(st.DB, st.Table), next) + if _, err := ctx.DB.ExecContext(ctx, stmt); err != nil { + return err + } + st.Replica = next + c.replicaHistory = append(c.replicaHistory, replicaHistoryEntry{Tick: tickNo, Replica: next}) + } + } + + st.Ticked++ + st.LogDone = true 
+ + updated, err := json.Marshal(st) + if err != nil { + return err + } + ctx.UpdateState(updated) + return nil +} + +func (c *ModifyTiFlashCase) Exit(ctx ExitContext, raw json.RawMessage) error { + var st modifyTiFlashState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + if len(c.replicaHistory) == 0 { + c.replicaHistory = []replicaHistoryEntry{{Tick: st.Ticked, Replica: st.Replica}} + } + + sum, err := AdminChecksumTable(ctx, ctx.DB, st.DB, st.Table) + if err != nil { + return err + } + replica, err := TiFlashReplicaCount(ctx, ctx.DB, st.DB, st.Table) + if err != nil { + return err + } + st.Checksum = sum + st.Replica = replica + if last := c.replicaHistory[len(c.replicaHistory)-1]; last.Replica != replica { + c.replicaHistory = append(c.replicaHistory, replicaHistoryEntry{Tick: st.Ticked, Replica: replica}) + } + st.LogDone = true + + ctx.SetSummary(modifyTiFlashSummary{ + DB: st.DB, + Table: st.Table, + N: st.N, + NAP: st.NAP, + Ticked: st.Ticked, + ReplicaHistory: c.replicaHistory, + }) + + updated, err := json.Marshal(st) + if err != nil { + return err + } + ctx.UpdateState(updated) + return nil +} + +func (c *ModifyTiFlashCase) Verify(ctx Context, raw json.RawMessage) error { + var st modifyTiFlashState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + if err := Require(st.LogDone, "ModifyTiFlash: log not executed"); err != nil { + return err + } + if err := Require(st.Checksum.TotalKvs != "", "ModifyTiFlash: checksum not recorded; run Exit first"); err != nil { + return err + } + + sum, err := AdminChecksumTable(ctx, ctx.DB, st.DB, st.Table) + if err != nil { + return err + } + if err := Require(sum.TotalKvs == st.Checksum.TotalKvs, "ModifyTiFlash: Total_kvs mismatch: got %q want %q", sum.TotalKvs, st.Checksum.TotalKvs); err != nil { + return err + } + if st.Checksum.TotalBytes != "" { + if err := Require(sum.TotalBytes == st.Checksum.TotalBytes, "ModifyTiFlash: Total_bytes mismatch: got %q want %q", sum.TotalBytes, st.Checksum.TotalBytes); err != nil { + return err + } + } + + replica, err := TiFlashReplicaCount(ctx, ctx.DB, st.DB, st.Table) + if err != nil { + return err + } + return Require(replica == st.Replica, "ModifyTiFlash: tiflash replica mismatch: got %d want %d", replica, st.Replica) +} diff --git a/pkg/testkit/brhelper/workload/nexus_common.go b/pkg/testkit/brhelper/workload/nexus_common.go new file mode 100644 index 0000000000000..c16d6338332b8 --- /dev/null +++ b/pkg/testkit/brhelper/workload/nexus_common.go @@ -0,0 +1,137 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package workload + +import ( + "context" + "database/sql" + "fmt" + "strings" +) + +type nexusTableState struct { + Name string `json:"name"` + NextColID int `json:"next_col_id,omitempty"` + Cols []string `json:"cols,omitempty"` +} + +type nexusDDLEvent struct { + Tick int `json:"tick"` + Stmt string `json:"stmt"` +} + +type nexusState struct { + Suffix string `json:"suffix"` + DB string `json:"db"` + N int `json:"n"` + + Ticked int `json:"ticked"` + NextTableID int `json:"next_table_id"` + Tables []nexusTableState `json:"tables"` + + Checksums map[string]TableChecksum `json:"checksums,omitempty"` + LogDone bool `json:"log_done"` +} + +type nexusSummary struct { + DB string `json:"db"` + N int `json:"n"` + Ticked int `json:"ticked"` + DDLs []nexusDDLEvent `json:"ddls,omitempty"` +} + +func (s nexusSummary) SummaryTable() string { + var b strings.Builder + _, _ = fmt.Fprintf(&b, "db=%s n=%d ticked=%d", s.DB, s.N, s.Ticked) + if len(s.DDLs) > 0 { + b.WriteString("\nddls:") + for _, e := range s.DDLs { + _, _ = fmt.Fprintf(&b, "\n - [%d] %s", e.Tick, e.Stmt) + } + } + return b.String() +} + +func nexusDefaultN(n int) int { + if n <= 0 { + return 50 + } + return n +} + +func nexusHalf(n int) int { + h := n / 2 + if h <= 0 { + return 1 + } + return h +} + +func nexusTableName(id int) string { + return fmt.Sprintf("t_%d", id) +} + +func nexusExecDDL(ctx context.Context, db *sql.DB, ddls *[]nexusDDLEvent, tick int, stmt string) error { + if ddls != nil { + *ddls = append(*ddls, nexusDDLEvent{Tick: tick, Stmt: stmt}) + } + _, err := db.ExecContext(ctx, stmt) + return err +} + +func nexusCreateTable(ctx context.Context, db *sql.DB, ddls *[]nexusDDLEvent, tick int, schema, table string) error { + stmt := "CREATE TABLE IF NOT EXISTS " + QTable(schema, table) + " (" + + "id BIGINT PRIMARY KEY AUTO_INCREMENT," + + "v BIGINT," + + "s VARCHAR(64) NOT NULL" + + ")" + return nexusExecDDL(ctx, db, ddls, tick, stmt) +} + +func nexusInsertRow(ctx context.Context, db *sql.DB, schema, table string, tick int) error { + _, err := db.ExecContext(ctx, "INSERT INTO "+QTable(schema, table)+" (v,s) VALUES (?,?)", + int64(tick), fmt.Sprintf("%s_%d", table, tick), + ) + return err +} + +func pickIndex(seed int, n int) int { + if n <= 0 { + return 0 + } + x := uint64(seed)*1103515245 + 12345 + return int(x % uint64(n)) +} + +func nexusRecordChecksums(ctx context.Context, db *sql.DB, schema string, tables []nexusTableState) (map[string]TableChecksum, error) { + out := make(map[string]TableChecksum, len(tables)) + for _, t := range tables { + sum, err := AdminChecksumTable(ctx, db, schema, t.Name) + if err != nil { + return nil, err + } + out[t.Name] = sum + } + return out, nil +} + +func containsString(ss []string, s string) bool { + for _, v := range ss { + if v == s { + return true + } + } + return false +} diff --git a/pkg/testkit/brhelper/workload/nexus_ddl.go b/pkg/testkit/brhelper/workload/nexus_ddl.go new file mode 100644 index 0000000000000..4c9e9254847f0 --- /dev/null +++ b/pkg/testkit/brhelper/workload/nexus_ddl.go @@ -0,0 +1,265 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package workload + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" +) + +type NexusDDLCase struct { + Suffix string `json:"suffix"` + N int `json:"n"` + + ddls []nexusDDLEvent +} + +func (c *NexusDDLCase) Name() string { return "NexusDDL" } + +func (c *NexusDDLCase) Prepare(ctx Context) (json.RawMessage, error) { + c.ddls = nil + + suffix := c.Suffix + if suffix == "" { + var err error + suffix, err = RandSuffix() + if err != nil { + return nil, err + } + } + n := c.N + if n <= 0 { + n = 50 + } + st := nexusState{ + Suffix: suffix, + DB: fmt.Sprintf("test_nexus_ddl_%s", suffix), + N: n, + Ticked: 0, + NextTableID: 1, + Tables: []nexusTableState{{Name: "t_0"}}, + } + if err := nexusExecDDL(ctx, ctx.DB, &c.ddls, 0, "CREATE DATABASE IF NOT EXISTS "+QIdent(st.DB)); err != nil { + return nil, err + } + if err := nexusCreateTable(ctx, ctx.DB, &c.ddls, 0, st.DB, st.Tables[0].Name); err != nil { + return nil, err + } + + ctx.SetSummary(nexusSummary{ + DB: st.DB, + N: st.N, + Ticked: st.Ticked, + DDLs: c.ddls, + }) + return json.Marshal(st) +} + +func (c *NexusDDLCase) Tick(ctx TickContext, raw json.RawMessage) error { + var st nexusState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + st.N = nexusDefaultN(st.N) + if st.NextTableID <= 0 { + st.NextTableID = len(st.Tables) + } + for i := range st.Tables { + if st.Tables[i].NextColID < len(st.Tables[i].Cols) { + st.Tables[i].NextColID = len(st.Tables[i].Cols) + } + } + + tickNo := st.Ticked + 1 + half := nexusHalf(st.N) + + if st.N > 0 && tickNo%(2*st.N) == 0 && len(st.Tables) > 0 { + oldest := st.Tables[0].Name + stmt := "DROP TABLE IF EXISTS " + QTable(st.DB, oldest) + if err := nexusExecDDL(ctx, ctx.DB, &c.ddls, tickNo, stmt); err != nil { + return err + } + st.Tables = st.Tables[1:] + } + + if st.N > 0 && tickNo%st.N == 0 { + name := nexusTableName(st.NextTableID) + st.NextTableID++ + if err := nexusCreateTable(ctx, ctx.DB, &c.ddls, tickNo, st.DB, name); err != nil { + return err + } + st.Tables = append(st.Tables, nexusTableState{Name: name}) + } + + if tickNo%half == 0 && len(st.Tables) > 0 { + youngest := &st.Tables[len(st.Tables)-1] + if err := nexusAddOneColumn(ctx, ctx.DB, &st, &c.ddls, tickNo, youngest); err != nil { + return err + } + } + + if st.N > 0 && tickNo%st.N == 0 && len(st.Tables) > 0 { + oldest := &st.Tables[0] + if err := nexusDropOneColumn(ctx, ctx.DB, &st, &c.ddls, tickNo, oldest); err != nil { + return err + } + } + + for _, t := range st.Tables { + if err := nexusInsertRow(ctx, ctx.DB, st.DB, t.Name, tickNo); err != nil { + return err + } + } + + st.Ticked++ + st.LogDone = true + + updated, err := json.Marshal(st) + if err != nil { + return err + } + ctx.UpdateState(updated) + return nil +} + +func (c *NexusDDLCase) Exit(ctx ExitContext, raw json.RawMessage) error { + var st nexusState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + + sums, err := nexusRecordChecksums(ctx, ctx.DB, st.DB, st.Tables) + if err != nil { + return err + } + st.Checksums = sums + st.LogDone = true + + ctx.SetSummary(nexusSummary{ + DB: st.DB, + N: st.N, + Ticked: 
st.Ticked, + DDLs: c.ddls, + }) + + updated, err := json.Marshal(st) + if err != nil { + return err + } + ctx.UpdateState(updated) + return nil +} + +func (c *NexusDDLCase) Verify(ctx Context, raw json.RawMessage) error { + var st nexusState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + if err := Require(st.LogDone, "NexusDDL: log not executed"); err != nil { + return err + } + if err := Require(len(st.Checksums) > 0, "NexusDDL: checksum not recorded; run Exit first"); err != nil { + return err + } + + for _, t := range st.Tables { + ok, err := TableExists(ctx, ctx.DB, st.DB, t.Name) + if err != nil { + return err + } + if err := Require(ok, "NexusDDL: table %s.%s not found", st.DB, t.Name); err != nil { + return err + } + + for _, col := range t.Cols { + has, err := ColumnExists(ctx, ctx.DB, st.DB, t.Name, col) + if err != nil { + return err + } + if err := Require(has, "NexusDDL: %s.%s column %q not found", st.DB, t.Name, col); err != nil { + return err + } + } + + want, ok := st.Checksums[t.Name] + if !ok { + return fmt.Errorf("NexusDDL: missing checksum for table %s.%s", st.DB, t.Name) + } + got, err := AdminChecksumTable(ctx, ctx.DB, st.DB, t.Name) + if err != nil { + return err + } + if err := Require(got.TotalKvs == want.TotalKvs, "NexusDDL: Total_kvs mismatch for %s.%s: got %q want %q", st.DB, t.Name, got.TotalKvs, want.TotalKvs); err != nil { + return err + } + if want.TotalBytes != "" { + if err := Require(got.TotalBytes == want.TotalBytes, "NexusDDL: Total_bytes mismatch for %s.%s: got %q want %q", st.DB, t.Name, got.TotalBytes, want.TotalBytes); err != nil { + return err + } + } + } + return nil +} + +func nexusAddOneColumn(ctx context.Context, db *sql.DB, st *nexusState, ddls *[]nexusDDLEvent, tick int, t *nexusTableState) error { + if t == nil { + return nil + } + if t.NextColID < len(t.Cols) { + t.NextColID = len(t.Cols) + } + + col := fmt.Sprintf("c_%d", t.NextColID) + exists, err := ColumnExists(ctx, db, st.DB, t.Name, col) + if err != nil { + return err + } + if exists { + if !containsString(t.Cols, col) { + t.Cols = append(t.Cols, col) + } + t.NextColID++ + return nil + } + + stmt := "ALTER TABLE " + QTable(st.DB, t.Name) + " ADD COLUMN " + QIdent(col) + " BIGINT" + if err := nexusExecDDL(ctx, db, ddls, tick, stmt); err != nil { + return err + } + t.Cols = append(t.Cols, col) + t.NextColID++ + return nil +} + +func nexusDropOneColumn(ctx context.Context, db *sql.DB, st *nexusState, ddls *[]nexusDDLEvent, tick int, t *nexusTableState) error { + if t == nil || len(t.Cols) == 0 { + return nil + } + col := t.Cols[0] + exists, err := ColumnExists(ctx, db, st.DB, t.Name, col) + if err != nil { + return err + } + if exists { + stmt := "ALTER TABLE " + QTable(st.DB, t.Name) + " DROP COLUMN " + QIdent(col) + if err := nexusExecDDL(ctx, db, ddls, tick, stmt); err != nil { + return err + } + } + t.Cols = t.Cols[1:] + return nil +} diff --git a/pkg/testkit/brhelper/workload/nexus_ddl_destructive.go b/pkg/testkit/brhelper/workload/nexus_ddl_destructive.go new file mode 100644 index 0000000000000..a7ed96a4de68d --- /dev/null +++ b/pkg/testkit/brhelper/workload/nexus_ddl_destructive.go @@ -0,0 +1,196 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package workload + +import ( + "encoding/json" + "fmt" +) + +type NexusDDLDestructiveCase struct { + Suffix string `json:"suffix"` + N int `json:"n"` + + ddls []nexusDDLEvent +} + +func (c *NexusDDLDestructiveCase) Name() string { return "NexusDDLDestructive" } + +func (c *NexusDDLDestructiveCase) Prepare(ctx Context) (json.RawMessage, error) { + c.ddls = nil + + suffix := c.Suffix + if suffix == "" { + var err error + suffix, err = RandSuffix() + if err != nil { + return nil, err + } + } + n := c.N + if n <= 0 { + n = 50 + } + st := nexusState{ + Suffix: suffix, + DB: fmt.Sprintf("test_nexus_ddl_destructive_%s", suffix), + N: n, + Ticked: 0, + NextTableID: 1, + Tables: []nexusTableState{{Name: "t_0"}}, + } + if err := nexusExecDDL(ctx, ctx.DB, &c.ddls, 0, "CREATE DATABASE IF NOT EXISTS "+QIdent(st.DB)); err != nil { + return nil, err + } + if err := nexusCreateTable(ctx, ctx.DB, &c.ddls, 0, st.DB, st.Tables[0].Name); err != nil { + return nil, err + } + + ctx.SetSummary(nexusSummary{ + DB: st.DB, + N: st.N, + Ticked: st.Ticked, + DDLs: c.ddls, + }) + return json.Marshal(st) +} + +func (c *NexusDDLDestructiveCase) Tick(ctx TickContext, raw json.RawMessage) error { + var st nexusState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + st.N = nexusDefaultN(st.N) + if st.NextTableID <= 0 { + st.NextTableID = len(st.Tables) + } + + tickNo := st.Ticked + 1 + half := nexusHalf(st.N) + + if st.N > 0 && tickNo%st.N == 0 { + name := nexusTableName(st.NextTableID) + st.NextTableID++ + if err := nexusCreateTable(ctx, ctx.DB, &c.ddls, tickNo, st.DB, name); err != nil { + return err + } + st.Tables = append(st.Tables, nexusTableState{Name: name}) + } + + if tickNo%half == 0 && len(st.Tables) > 0 { + idx := pickIndex(st.Ticked, len(st.Tables)) + oldName := st.Tables[idx].Name + newName := nexusTableName(st.NextTableID) + st.NextTableID++ + stmt := "RENAME TABLE " + QTable(st.DB, oldName) + " TO " + QTable(st.DB, newName) + if err := nexusExecDDL(ctx, ctx.DB, &c.ddls, tickNo, stmt); err != nil { + return err + } + st.Tables[idx].Name = newName + } + + if st.N > 0 && tickNo%(2*st.N) == 0 && len(st.Tables) > 0 { + idx := pickIndex(st.Ticked*2+1, len(st.Tables)) + stmt := "TRUNCATE TABLE " + QTable(st.DB, st.Tables[idx].Name) + if err := nexusExecDDL(ctx, ctx.DB, &c.ddls, tickNo, stmt); err != nil { + return err + } + } + + for _, t := range st.Tables { + if err := nexusInsertRow(ctx, ctx.DB, st.DB, t.Name, tickNo); err != nil { + return err + } + } + + st.Ticked++ + st.LogDone = true + + updated, err := json.Marshal(st) + if err != nil { + return err + } + ctx.UpdateState(updated) + return nil +} + +func (c *NexusDDLDestructiveCase) Exit(ctx ExitContext, raw json.RawMessage) error { + var st nexusState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + + sums, err := nexusRecordChecksums(ctx, ctx.DB, st.DB, st.Tables) + if err != nil { + return err + } + st.Checksums = sums + st.LogDone = true + + ctx.SetSummary(nexusSummary{ + DB: st.DB, + N: st.N, + Ticked: st.Ticked, + DDLs: c.ddls, + }) + + updated, err := json.Marshal(st) + if err != nil { + 
return err + } + ctx.UpdateState(updated) + return nil +} + +func (c *NexusDDLDestructiveCase) Verify(ctx Context, raw json.RawMessage) error { + var st nexusState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + if err := Require(st.LogDone, "NexusDDLDestructive: log not executed"); err != nil { + return err + } + if err := Require(len(st.Checksums) > 0, "NexusDDLDestructive: checksum not recorded; run Exit first"); err != nil { + return err + } + + for _, t := range st.Tables { + ok, err := TableExists(ctx, ctx.DB, st.DB, t.Name) + if err != nil { + return err + } + if err := Require(ok, "NexusDDLDestructive: table %s.%s not found", st.DB, t.Name); err != nil { + return err + } + + want, ok := st.Checksums[t.Name] + if !ok { + return fmt.Errorf("NexusDDLDestructive: missing checksum for table %s.%s", st.DB, t.Name) + } + got, err := AdminChecksumTable(ctx, ctx.DB, st.DB, t.Name) + if err != nil { + return err + } + if err := Require(got.TotalKvs == want.TotalKvs, "NexusDDLDestructive: Total_kvs mismatch for %s.%s: got %q want %q", st.DB, t.Name, got.TotalKvs, want.TotalKvs); err != nil { + return err + } + if want.TotalBytes != "" { + if err := Require(got.TotalBytes == want.TotalBytes, "NexusDDLDestructive: Total_bytes mismatch for %s.%s: got %q want %q", st.DB, t.Name, got.TotalBytes, want.TotalBytes); err != nil { + return err + } + } + } + return nil +} diff --git a/pkg/testkit/brhelper/workload/registry.go b/pkg/testkit/brhelper/workload/registry.go new file mode 100644 index 0000000000000..1a48dd0b7afd9 --- /dev/null +++ b/pkg/testkit/brhelper/workload/registry.go @@ -0,0 +1,25 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package workload + +// AllCases returns the default workload cases. +func AllCases() []Case { + return []Case{ + &AddIndexCase{}, + &NexusDDLCase{}, + &NexusDDLDestructiveCase{}, + &ModifyTiFlashCase{}, + } +} diff --git a/pkg/testkit/brhelper/workload/runner.go b/pkg/testkit/brhelper/workload/runner.go new file mode 100644 index 0000000000000..515a65850b44e --- /dev/null +++ b/pkg/testkit/brhelper/workload/runner.go @@ -0,0 +1,394 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package workload + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + "math/rand/v2" + "strings" + "sync" + "time" +) + +type CaseSpec struct { + Name string + Case Case +} + +type RunConfig struct { + TickCount int + TickInterval time.Duration + PersistEvery uint64 + Seed int64 + Parallel bool +} + +type Runner struct { + db *sql.DB + store StateStore + cases []CaseSpec +} + +func NewRunner(db *sql.DB, store StateStore, cases ...Case) (*Runner, error) { + specs := make([]CaseSpec, len(cases)) + for i, c := range cases { + specs[i] = CaseSpec{Case: c} + } + return NewRunnerWithSpecs(db, store, specs...) +} + +func NewRunnerWithSpecs(db *sql.DB, store StateStore, specs ...CaseSpec) (*Runner, error) { + if db == nil { + return nil, fmt.Errorf("workload: nil db") + } + if store == nil { + return nil, fmt.Errorf("workload: nil state store") + } + + caseSpecs, err := normalizeCaseSpecs(specs) + if err != nil { + return nil, err + } + return &Runner{db: db, store: store, cases: caseSpecs}, nil +} + +func (r *Runner) Cases() []CaseSpec { + out := make([]CaseSpec, len(r.cases)) + copy(out, r.cases) + return out +} + +func (r *Runner) Prepare(ctx context.Context) (*Summary, error) { + if err := r.store.Reset(ctx); err != nil { + return nil, err + } + + summary := NewSummary() + for _, spec := range r.cases { + state, err := spec.Case.Prepare(Context{Context: ctx, DB: r.db, CaseName: spec.Name, Summary: summary}) + if err != nil { + return nil, err + } + if err := r.store.Put(ctx, spec.Name, state); err != nil { + return nil, err + } + } + return summary, nil +} + +func (r *Runner) Run(ctx context.Context, cfg RunConfig) (*Summary, error) { + if cfg.TickCount <= 0 { + return nil, fmt.Errorf("workload: TickCount must be > 0") + } + if cfg.TickInterval < 0 { + return nil, fmt.Errorf("workload: TickInterval must be >= 0") + } + if cfg.PersistEvery == 0 { + cfg.PersistEvery = ^uint64(0) + } + + states, err := r.store.GetAll(ctx) + if err != nil { + return nil, err + } + byName := make(map[string]Case, len(r.cases)) + for _, spec := range r.cases { + byName[spec.Name] = spec.Case + } + for name := range states { + if _, ok := byName[name]; !ok { + return nil, fmt.Errorf("workload: unknown case %q in state store", name) + } + } + + selected := make([]CaseSpec, 0, len(r.cases)) + for _, spec := range r.cases { + if _, ok := states[spec.Name]; ok { + selected = append(selected, spec) + } + } + if len(selected) == 0 { + return nil, fmt.Errorf("workload: no cases in state store; run Prepare first") + } + + summary := NewSummary() + if cfg.Parallel { + if err := r.runParallelTicks(ctx, cfg, selected, states, summary); err != nil { + return nil, err + } + } else { + if err := r.runSequentialTicks(ctx, cfg, selected, states, summary); err != nil { + return nil, err + } + } + + for _, spec := range selected { + state, ok := states[spec.Name] + if !ok { + return nil, fmt.Errorf("workload: case %q not found in state store; run Prepare first", spec.Name) + } + exitCtx := ExitContext{ + Context: Context{Context: ctx, DB: r.db, CaseName: spec.Name, Summary: summary}, + UpdateStateFn: func(updated json.RawMessage) { + states[spec.Name] = updated + }, + } + if err := spec.Case.Exit(exitCtx, state); err != nil { + return nil, err + } + } + + finalStates := make(map[string]json.RawMessage, len(selected)) + for _, spec := range selected { + if state, ok := states[spec.Name]; ok { + finalStates[spec.Name] = state + } + } + if err := r.store.PutMany(ctx, finalStates); err != nil { + return nil, err + } + 
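+	// At this point the post-Exit states (including the recorded checksums)
+	// are persisted, so Verify can be run later -- possibly from a different
+	// process after a restore -- against the same state store.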
return summary, nil +} + +func (r *Runner) Verify(ctx context.Context) error { + states, err := r.store.GetAll(ctx) + if err != nil { + return err + } + byName := make(map[string]Case, len(r.cases)) + for _, spec := range r.cases { + byName[spec.Name] = spec.Case + } + + for name, state := range states { + c, ok := byName[name] + if !ok { + base, _, cut := strings.Cut(name, "#") + if cut { + c, ok = byName[base] + } + } + if !ok { + return fmt.Errorf("workload: unknown case %q in state store", name) + } + if err := c.Verify(Context{Context: ctx, DB: r.db, CaseName: name}, state); err != nil { + return err + } + } + return nil +} + +func (r *Runner) runSequentialTicks( + ctx context.Context, + cfg RunConfig, + selected []CaseSpec, + states map[string]json.RawMessage, + summary *Summary, +) error { + rng := rand.New(rand.NewPCG(uint64(cfg.Seed), uint64(cfg.Seed>>1))) + for tick := 0; tick < cfg.TickCount; tick++ { + rng.Shuffle(len(selected), func(i, j int) { selected[i], selected[j] = selected[j], selected[i] }) + + for _, spec := range selected { + state, ok := states[spec.Name] + if !ok { + return fmt.Errorf("workload: case %q not found in state store; run Prepare first", spec.Name) + } + tickCtx := TickContext{ + Context: Context{Context: ctx, DB: r.db, CaseName: spec.Name, Summary: summary}, + UpdateStateFn: func(updated json.RawMessage) { + states[spec.Name] = updated + }, + } + if err := spec.Case.Tick(tickCtx, state); err != nil { + return err + } + } + + if shouldPersistTick(tick, cfg.PersistEvery) { + flush := make(map[string]json.RawMessage, len(selected)) + for _, spec := range selected { + state, ok := states[spec.Name] + if !ok { + return fmt.Errorf("workload: case %q not found in state store; run Prepare first", spec.Name) + } + flush[spec.Name] = state + } + if err := r.store.PutMany(ctx, flush); err != nil { + return err + } + } + + if tick != cfg.TickCount-1 { + if err := sleep(ctx, cfg.TickInterval); err != nil { + return err + } + } + } + return nil +} + +func (r *Runner) runParallelTicks( + ctx context.Context, + cfg RunConfig, + selected []CaseSpec, + states map[string]json.RawMessage, + summary *Summary, +) error { + var mu sync.Mutex + for tick := 0; tick < cfg.TickCount; tick++ { + if err := r.runParallelTick(ctx, selected, states, summary, &mu); err != nil { + return err + } + + if shouldPersistTick(tick, cfg.PersistEvery) { + flush := make(map[string]json.RawMessage, len(selected)) + mu.Lock() + for _, spec := range selected { + state, ok := states[spec.Name] + if !ok { + mu.Unlock() + return fmt.Errorf("workload: case %q not found in state store; run Prepare first", spec.Name) + } + flush[spec.Name] = state + } + mu.Unlock() + if err := r.store.PutMany(ctx, flush); err != nil { + return err + } + } + + if tick != cfg.TickCount-1 { + if err := sleep(ctx, cfg.TickInterval); err != nil { + return err + } + } + } + return nil +} + +func (r *Runner) runParallelTick( + ctx context.Context, + selected []CaseSpec, + states map[string]json.RawMessage, + summary *Summary, + mu *sync.Mutex, +) error { + if len(selected) == 0 { + return nil + } + + runCtx, cancel := context.WithCancel(ctx) + defer cancel() + + var wg sync.WaitGroup + var once sync.Once + var firstErr error + + for _, spec := range selected { + spec := spec + wg.Add(1) + go func() { + defer wg.Done() + + mu.Lock() + state, ok := states[spec.Name] + mu.Unlock() + if !ok { + once.Do(func() { + firstErr = fmt.Errorf("workload: case %q not found in state store; run Prepare first", spec.Name) + cancel() + }) + 
return + } + + tickCtx := TickContext{ + Context: Context{Context: runCtx, DB: r.db, CaseName: spec.Name, Summary: summary}, + UpdateStateFn: func(updated json.RawMessage) { + mu.Lock() + states[spec.Name] = updated + mu.Unlock() + }, + } + if err := spec.Case.Tick(tickCtx, state); err != nil { + once.Do(func() { + firstErr = err + cancel() + }) + return + } + }() + } + wg.Wait() + return firstErr +} + +func normalizeCaseSpecs(specs []CaseSpec) ([]CaseSpec, error) { + out := make([]CaseSpec, 0, len(specs)) + nameCounts := make(map[string]int, len(specs)) + for _, spec := range specs { + if spec.Case == nil { + return nil, fmt.Errorf("workload: nil case") + } + if spec.Name == "" { + nameCounts[spec.Case.Name()]++ + } + } + + used := make(map[string]struct{}, len(specs)) + caseIndex := make(map[string]int, len(specs)) + for _, spec := range specs { + name := spec.Name + if name == "" { + base := spec.Case.Name() + if nameCounts[base] > 1 { + caseIndex[base]++ + name = fmt.Sprintf("%s#%d", base, caseIndex[base]) + } else { + name = base + } + } + if _, ok := used[name]; ok { + return nil, fmt.Errorf("workload: duplicate case name %q", name) + } + used[name] = struct{}{} + out = append(out, CaseSpec{Name: name, Case: spec.Case}) + } + return out, nil +} + +func sleep(ctx context.Context, d time.Duration) error { + if d <= 0 { + return nil + } + t := time.NewTimer(d) + defer t.Stop() + select { + case <-ctx.Done(): + return ctx.Err() + case <-t.C: + return nil + } +} + +func shouldPersistTick(tick int, persistEvery uint64) bool { + if persistEvery == 0 || persistEvery == ^uint64(0) { + return false + } + return uint64(tick+1)%persistEvery == 0 +} diff --git a/pkg/testkit/brhelper/workload/state_store.go b/pkg/testkit/brhelper/workload/state_store.go new file mode 100644 index 0000000000000..f484ed385e4c8 --- /dev/null +++ b/pkg/testkit/brhelper/workload/state_store.go @@ -0,0 +1,84 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package workload + +import ( + "context" + "encoding/json" + "sync" +) + +type StateStore interface { + Reset(ctx context.Context) error + Put(ctx context.Context, caseName string, state json.RawMessage) error + PutMany(ctx context.Context, states map[string]json.RawMessage) error + GetAll(ctx context.Context) (map[string]json.RawMessage, error) +} + +type MemoryStore struct { + mu sync.RWMutex + states map[string]json.RawMessage +} + +func NewMemoryStore() *MemoryStore { + return &MemoryStore{ + states: make(map[string]json.RawMessage), + } +} + +func (s *MemoryStore) Reset(ctx context.Context) error { + s.mu.Lock() + defer s.mu.Unlock() + s.states = make(map[string]json.RawMessage) + return nil +} + +func (s *MemoryStore) Put(ctx context.Context, caseName string, state json.RawMessage) error { + s.mu.Lock() + s.states[caseName] = cloneRaw(state) + s.mu.Unlock() + return nil +} + +func (s *MemoryStore) PutMany(ctx context.Context, states map[string]json.RawMessage) error { + if len(states) == 0 { + return nil + } + s.mu.Lock() + for caseName, state := range states { + s.states[caseName] = cloneRaw(state) + } + s.mu.Unlock() + return nil +} + +func (s *MemoryStore) GetAll(ctx context.Context) (map[string]json.RawMessage, error) { + s.mu.RLock() + defer s.mu.RUnlock() + out := make(map[string]json.RawMessage, len(s.states)) + for caseName, state := range s.states { + out[caseName] = cloneRaw(state) + } + return out, nil +} + +func cloneRaw(state json.RawMessage) json.RawMessage { + if len(state) == 0 { + return nil + } + out := make([]byte, len(state)) + copy(out, state) + return out +} diff --git a/pkg/testkit/brhelper/workload/summary.go b/pkg/testkit/brhelper/workload/summary.go new file mode 100644 index 0000000000000..ae5265545b0dc --- /dev/null +++ b/pkg/testkit/brhelper/workload/summary.go @@ -0,0 +1,75 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package workload + +import "sync" + +type SummaryEntry struct { + Case string `json:"case"` + Summary any `json:"summary"` +} + +type TableSummary interface { + SummaryTable() string +} + +type Summary struct { + mu sync.Mutex + byCase map[string]int + entries []SummaryEntry +} + +func NewSummary() *Summary { + return &Summary{ + byCase: make(map[string]int), + } +} + +func (s *Summary) Set(caseName string, summary any) { + if s == nil || caseName == "" { + return + } + s.mu.Lock() + defer s.mu.Unlock() + + if idx, ok := s.byCase[caseName]; ok { + s.entries[idx].Summary = summary + return + } + + s.byCase[caseName] = len(s.entries) + s.entries = append(s.entries, SummaryEntry{Case: caseName, Summary: summary}) +} + +func (s *Summary) Entries() []SummaryEntry { + if s == nil { + return nil + } + s.mu.Lock() + defer s.mu.Unlock() + + out := make([]SummaryEntry, len(s.entries)) + copy(out, s.entries) + return out +} + +func (s *Summary) Empty() bool { + if s == nil { + return true + } + s.mu.Lock() + defer s.mu.Unlock() + return len(s.entries) == 0 +} diff --git a/pkg/testkit/brhelper/workload/util.go b/pkg/testkit/brhelper/workload/util.go new file mode 100644 index 0000000000000..cb276674e0e57 --- /dev/null +++ b/pkg/testkit/brhelper/workload/util.go @@ -0,0 +1,178 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package workload + +import ( + "context" + "crypto/rand" + "database/sql" + "encoding/hex" + "encoding/json" + "fmt" + "strings" +) + +func RandSuffix() (string, error) { + var b [8]byte + if _, err := rand.Read(b[:]); err != nil { + return "", err + } + return hex.EncodeToString(b[:]), nil +} + +func QIdent(s string) string { + return "`" + strings.ReplaceAll(s, "`", "``") + "`" +} + +func QTable(schema, table string) string { + return QIdent(schema) + "." + QIdent(table) +} + +func ExecAll(ctx context.Context, db *sql.DB, stmts []string) error { + for _, stmt := range stmts { + if _, err := db.ExecContext(ctx, stmt); err != nil { + return err + } + } + return nil +} + +func SchemaExists(ctx context.Context, db *sql.DB, schema string) (bool, error) { + var n int + err := db.QueryRowContext(ctx, "SELECT COUNT(*) FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = ?", schema).Scan(&n) + return n > 0, err +} + +func TableExists(ctx context.Context, db *sql.DB, schema, table string) (bool, error) { + var n int + err := db.QueryRowContext(ctx, "SELECT COUNT(*) FROM information_schema.TABLES WHERE TABLE_SCHEMA = ? AND TABLE_NAME = ?", schema, table).Scan(&n) + return n > 0, err +} + +func ColumnExists(ctx context.Context, db *sql.DB, schema, table, column string) (bool, error) { + var n int + err := db.QueryRowContext(ctx, "SELECT COUNT(*) FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = ? AND TABLE_NAME = ? 
AND COLUMN_NAME = ?", schema, table, column).Scan(&n) + return n > 0, err +} + +func IndexExists(ctx context.Context, db *sql.DB, schema, table, index string) (bool, error) { + var n int + err := db.QueryRowContext(ctx, "SELECT COUNT(*) FROM information_schema.STATISTICS WHERE TABLE_SCHEMA = ? AND TABLE_NAME = ? AND INDEX_NAME = ?", schema, table, index).Scan(&n) + return n > 0, err +} + +func TiFlashReplicaCount(ctx context.Context, db *sql.DB, schema, table string) (int, error) { + var n sql.NullInt64 + err := db.QueryRowContext(ctx, "SELECT REPLICA_COUNT FROM information_schema.TIFLASH_REPLICA WHERE TABLE_SCHEMA = ? AND TABLE_NAME = ?", schema, table).Scan(&n) + if err == sql.ErrNoRows { + return 0, nil + } + if err != nil { + return 0, err + } + if !n.Valid { + return 0, nil + } + return int(n.Int64), nil +} + +type TableChecksum struct { + TotalKvs string `json:"total_kvs,omitempty"` + TotalBytes string `json:"total_bytes,omitempty"` + ChecksumCRC64Xor string `json:"checksum_crc64_xor,omitempty"` +} + +func (c *TableChecksum) UnmarshalJSON(b []byte) error { + if len(b) == 0 || string(b) == "null" { + return nil + } + if b[0] == '"' { + var s string + if err := json.Unmarshal(b, &s); err != nil { + return err + } + *c = TableChecksum{TotalKvs: s} + return nil + } + type alias TableChecksum + var v alias + if err := json.Unmarshal(b, &v); err != nil { + return err + } + *c = TableChecksum(v) + return nil +} + +func AdminChecksumTable(ctx context.Context, db *sql.DB, schema, table string) (TableChecksum, error) { + rows, err := db.QueryContext(ctx, "ADMIN CHECKSUM TABLE "+QTable(schema, table)) + if err != nil { + return TableChecksum{}, err + } + defer rows.Close() + + cols, err := rows.Columns() + if err != nil { + return TableChecksum{}, err + } + if !rows.Next() { + if err := rows.Err(); err != nil { + return TableChecksum{}, err + } + return TableChecksum{}, fmt.Errorf("checksum: no rows returned") + } + raw := make([]sql.RawBytes, len(cols)) + dest := make([]any, len(cols)) + for i := range raw { + dest[i] = &raw[i] + } + if err := rows.Scan(dest...); err != nil { + return TableChecksum{}, err + } + + var out TableChecksum + for i, c := range cols { + v := strings.TrimSpace(string(raw[i])) + switch { + case strings.EqualFold(c, "Total_kvs"): + out.TotalKvs = v + case strings.EqualFold(c, "Total_bytes"): + out.TotalBytes = v + case strings.EqualFold(c, "Checksum_crc64_xor"): + out.ChecksumCRC64Xor = v + } + } + + var missing []string + if out.TotalKvs == "" { + missing = append(missing, "Total_kvs") + } + if out.TotalBytes == "" { + missing = append(missing, "Total_bytes") + } + if out.ChecksumCRC64Xor == "" { + missing = append(missing, "Checksum_crc64_xor") + } + if len(missing) > 0 { + return TableChecksum{}, fmt.Errorf("checksum: column(s) not found: %v; columns=%v", missing, cols) + } + return out, nil +} + +func Require(cond bool, format string, args ...any) error { + if cond { + return nil + } + return fmt.Errorf(format, args...) +} diff --git a/tests/realtikvtest/brietest/segmented_restore_test.go b/tests/realtikvtest/brietest/segmented_restore_test.go new file mode 100644 index 0000000000000..7d360b6451ed6 --- /dev/null +++ b/tests/realtikvtest/brietest/segmented_restore_test.go @@ -0,0 +1,216 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package brietest + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/pingcap/tidb/br/pkg/metautil" + "github.com/pingcap/tidb/br/pkg/registry" + "github.com/pingcap/tidb/br/pkg/task" + "github.com/pingcap/tidb/pkg/testkit" + "github.com/pingcap/tidb/pkg/testkit/brhelper/workload" + "github.com/stretchr/testify/require" +) + +func TestSegmentedRestoreWorkload(t *testing.T) { + kit := NewLogBackupKit(t) + taskName := "segmented_restore_" + t.Name() + kit.StopTaskIfExists(taskName) + + db := testkit.CreateMockDB(kit.tk) + t.Cleanup(func() { + _ = db.Close() + }) + + store := workload.NewMemoryStore() + cases := []workload.Case{ + &workload.NexusDDLDestructiveCase{}, + &workload.NexusDDLCase{}, + &workload.AddIndexCase{}, + } + if tiflashCount := tiflashStoreCount(t, kit.tk); tiflashCount > 0 { + cases = append(cases, &workload.ModifyTiFlashCase{}) + } + runner, err := workload.NewRunner(db, store, cases...) + require.NoError(t, err) + + ctx := context.Background() + _, err = runner.Prepare(ctx) + require.NoError(t, err) + + kit.RunFullBackup(func(cfg *task.BackupConfig) { + cfg.Storage = kit.LocalURI("full") + }) + backupTS := readBackupEndTS(t, kit.LocalURI("full")) + + kit.RunLogStart(taskName, func(cfg *task.StreamConfig) { + cfg.StartTS = backupTS + }) + t.Cleanup(func() { + kit.StopTaskIfExists(taskName) + }) + + checkpoints := make([]uint64, 0, 5) + runCfg := workload.RunConfig{ + TickCount: 100, + TickInterval: 0, + PersistEvery: ^uint64(0), + Seed: 1, + Parallel: true, + } + + for i := 0; i < 4; i++ { + _, err := runner.Run(ctx, runCfg) + require.NoError(t, err) + kit.forceFlushAndWait(taskName) + checkpoints = append(checkpoints, kit.CheckpointTSOf(taskName)) + } + kit.forceFlushAndWait(taskName) + checkpoints = append(checkpoints, kit.CheckpointTSOf(taskName)) + kit.StopTaskIfExists(taskName) + + cleanupWorkloadSchemas(t, kit.tk) + cleanupRestoreRegistry(t, kit.tk) + + checkpointDir := filepath.Join(kit.base, "checkpoint") + require.NoError(t, os.RemoveAll(checkpointDir)) + + for i, restoreTS := range checkpoints { + idx := i + rcTS := restoreTS + kit.RunStreamRestore(func(rc *task.RestoreConfig) { + rc.RestoreTS = rcTS + rc.IsRestoredTSUserSpecified = true + rc.LastRestore = idx == len(checkpoints)-1 + rc.IsLastRestoreUserSpecified = true + rc.UseCheckpoint = true + rc.CheckpointStorage = kit.LocalURI("checkpoint") + if idx > 0 { + rc.StartTS = checkpoints[idx-1] + rc.FullBackupStorage = "" + } + kit.SetFilter(&rc.Config, "test*.*") + if idx != len(checkpoints)-1 { + rc.ExplicitFilter = false + } + }) + } + + require.NoError(t, runner.Verify(ctx)) +} + +func readBackupEndTS(t *testing.T, storage string) uint64 { + cfg := task.DefaultConfig() + cfg.Storage = storage + _, _, backupMeta, err := task.ReadBackupMeta(context.Background(), metautil.MetaFile, &cfg) + require.NoError(t, err) + return backupMeta.GetEndVersion() +} + +func cleanupWorkloadSchemas(t *testing.T, tk *testkit.TestKit) { + t.Helper() + + droppedAt := make(map[string]time.Time) + var lastLog time.Time + require.Eventuallyf(t, func() bool { + 
rows := tk.MustQuery("SELECT schema_name FROM information_schema.schemata").Rows() + remaining := make([]string, 0, len(rows)) + now := time.Now() + + for _, row := range rows { + name := fmt.Sprint(row[0]) + if isSystemSchema(name) { + continue + } + key := strings.ToLower(name) + remaining = append(remaining, name) + if last, ok := droppedAt[key]; !ok || now.Sub(last) > 5*time.Second { + tk.MustExec("DROP DATABASE IF EXISTS " + workload.QIdent(name)) + droppedAt[key] = now + } + } + + if len(remaining) == 0 { + return true + } + if now.Sub(lastLog) > 10*time.Second { + t.Logf("waiting for schemas to drop: %v", remaining) + lastLog = now + } + return false + }, 2*time.Minute, 500*time.Millisecond, "user schemas still exist") + + tk.MustExec("CREATE DATABASE IF NOT EXISTS test") +} + +func cleanupRestoreRegistry(t *testing.T, tk *testkit.TestKit) { + t.Helper() + + rows := tk.MustQuery(fmt.Sprintf( + "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = '%s' AND table_name = '%s'", + registry.RestoreRegistryDBName, + registry.RestoreRegistryTableName, + )).Rows() + require.Len(t, rows, 1) + count, err := parseCount(rows[0][0]) + require.NoError(t, err) + if count == 0 { + return + } + tk.MustExec(fmt.Sprintf("DELETE FROM %s.%s", registry.RestoreRegistryDBName, registry.RestoreRegistryTableName)) +} + +func isSystemSchema(name string) bool { + switch strings.ToLower(name) { + case "mysql", + "information_schema", + "performance_schema", + "sys", + "metrics_schema": + return true + default: + return false + } +} + +func tiflashStoreCount(t *testing.T, tk *testkit.TestKit) int { + rows := tk.MustQuery("SELECT COUNT(*) FROM information_schema.tikv_store_status WHERE JSON_SEARCH(LABEL, 'one', 'tiflash') IS NOT NULL").Rows() + require.Len(t, rows, 1) + count, err := parseCount(rows[0][0]) + require.NoError(t, err) + return count +} + +func parseCount(raw any) (int, error) { + switch v := raw.(type) { + case string: + var out int + _, err := fmt.Sscanf(v, "%d", &out) + return out, err + case int: + return v, nil + case int64: + return int(v), nil + default: + return 0, fmt.Errorf("unexpected count type %T", raw) + } +} From 812beec98f7e33d05b1386e61c8f8cf4ee5abbc9 Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Tue, 27 Jan 2026 08:57:29 +0000 Subject: [PATCH 07/18] tidy up codes Signed-off-by: Juncen Yu --- br/pkg/restore/log_client/client.go | 60 +++++- pkg/testkit/brhelper/workload/add_index.go | 178 +++++++++--------- pkg/testkit/brhelper/workload/context.go | 2 + .../brhelper/workload/modify_tiflash.go | 4 +- pkg/testkit/brhelper/workload/nexus_common.go | 19 +- pkg/testkit/brhelper/workload/nexus_ddl.go | 11 +- .../workload/nexus_ddl_destructive.go | 10 +- pkg/testkit/brhelper/workload/runner.go | 76 +++----- pkg/testkit/brhelper/workload/util.go | 4 + .../brietest/segmented_restore_test.go | 5 +- 10 files changed, 195 insertions(+), 174 deletions(-) diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index e11b2e7c21920..a19d7931c64b7 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -2155,22 +2155,23 @@ func (rc *LogClient) RefreshMetaForTables(ctx context.Context, schemasReplace *s return errors.Errorf("the deleted table(upstream ID: %d) has no record in replace map", upstreamTableID) } + involvedDB, involvedTable := rc.resolveInvolvingNames(ctx, dbReplace, tableReplace) args := &model.RefreshMetaArgs{ SchemaID: dbReplace.DbID, TableID: tableReplace.TableID, - InvolvedDB: dbReplace.Name, - 
InvolvedTable: tableReplace.Name, + InvolvedDB: involvedDB, + InvolvedTable: involvedTable, } log.Info("refreshing deleted table meta", zap.Int64("schemaID", dbReplace.DbID), - zap.String("dbName", dbReplace.Name), + zap.String("dbName", involvedDB), zap.Any("tableID", tableReplace.TableID), - zap.String("tableName", tableReplace.Name)) + zap.String("tableName", involvedTable)) if err := rc.unsafeSession.RefreshMeta(ctx, args); err != nil { return errors.Annotatef(err, "failed to refresh meta for deleted table with schemaID=%d, tableID=%d, dbName=%s, tableName=%s", - dbReplace.DbID, tableReplace.TableID, dbReplace.Name, tableReplace.Name) + dbReplace.DbID, tableReplace.TableID, involvedDB, involvedTable) } deletedTableCount++ } @@ -2251,20 +2252,21 @@ func (rc *LogClient) RefreshMetaForTables(ctx context.Context, schemasReplace *s } } + involvedDB, involvedTable := rc.resolveInvolvingNames(ctx, dbReplace, tableReplace) args := &model.RefreshMetaArgs{ SchemaID: dbReplace.DbID, TableID: tableReplace.TableID, - InvolvedDB: dbReplace.Name, - InvolvedTable: tableReplace.Name, + InvolvedDB: involvedDB, + InvolvedTable: involvedTable, } log.Info("refreshing regular table meta", zap.Int64("schemaID", dbReplace.DbID), - zap.String("dbName", dbReplace.Name), + zap.String("dbName", involvedDB), zap.Any("tableID", tableReplace.TableID), - zap.String("tableName", tableReplace.Name)) + zap.String("tableName", involvedTable)) if err := rc.unsafeSession.RefreshMeta(ctx, args); err != nil { return errors.Annotatef(err, "failed to refresh meta for table with schemaID=%d, tableID=%d, dbName=%s, tableName=%s", - dbReplace.DbID, tableReplace.TableID, dbReplace.Name, tableReplace.Name) + dbReplace.DbID, tableReplace.TableID, involvedDB, involvedTable) } regularCount++ } @@ -2275,3 +2277,41 @@ func (rc *LogClient) RefreshMetaForTables(ctx context.Context, schemasReplace *s zap.Int("regularTableCount", regularCount)) return nil } + +func (rc *LogClient) resolveInvolvingNames( + ctx context.Context, + dbReplace *stream.DBReplace, + tableReplace *stream.TableReplace, +) (string, string) { + dbName := "" + if dbReplace != nil { + dbName = dbReplace.Name + } + tableName := "" + if tableReplace != nil { + tableName = tableReplace.Name + } + + infoSchema := rc.dom.InfoSchema() + if dbName == "" && dbReplace != nil { + if dbInfo, ok := infoSchema.SchemaByID(dbReplace.DbID); ok { + dbName = dbInfo.Name.O + } + } + if tableName == "" && tableReplace != nil && tableReplace.TableID != 0 { + if tbl, ok := infoSchema.TableByID(ctx, tableReplace.TableID); ok { + tableName = tbl.Meta().Name.O + } + } + + if dbName == "" { + dbName = model.InvolvingAll + } + if tableName == "" { + tableName = model.InvolvingAll + } + if dbName == model.InvolvingAll && tableName != model.InvolvingAll { + tableName = model.InvolvingAll + } + return dbName, tableName +} diff --git a/pkg/testkit/brhelper/workload/add_index.go b/pkg/testkit/brhelper/workload/add_index.go index 6d1347a8930de..1c8987808dbab 100644 --- a/pkg/testkit/brhelper/workload/add_index.go +++ b/pkg/testkit/brhelper/workload/add_index.go @@ -49,7 +49,7 @@ type addIndexState struct { Indexes []addIndexSpec `json:"indexes"` Checksum TableChecksum `json:"checksum"` - LogDone bool `json:"log_done"` + LogDone bool `json:"log_done"` } type addIndexSummary struct { @@ -72,7 +72,7 @@ func (s addIndexSummary) SummaryTable() string { b.WriteString("\n - ") b.WriteString(idx.Name) if len(idx.Columns) > 0 { - b.WriteString("(" + joinWithComma(idx.Columns) + ")") + b.WriteString("(" + 
strings.Join(idx.Columns, ",") + ")") } } } @@ -82,7 +82,7 @@ func (s addIndexSummary) SummaryTable() string { b.WriteString("\n - ") b.WriteString(idx.Name) if len(idx.Columns) > 0 { - b.WriteString("(" + joinWithComma(idx.Columns) + ")") + b.WriteString("(" + strings.Join(idx.Columns, ",") + ")") } } } @@ -147,82 +147,18 @@ func (c *AddIndexCase) Tick(ctx TickContext, raw json.RawMessage) error { if err := json.Unmarshal(raw, &st); err != nil { return err } - if st.N <= 0 { - st.N = 100 - } - if st.NR <= 0 { - st.NR = 150 - } - if st.NextIndexID < len(st.Indexes) { - st.NextIndexID = len(st.Indexes) - } - - // Insert data each tick. - v := int64(st.Inserted) - if _, err := ctx.DB.ExecContext(ctx, "INSERT INTO "+QTable(st.DB, st.Table)+" (a,b,c,d,e) VALUES (?,?,?,?,?)", - v, v*7+1, v*11+2, v*13+3, v*17+4, - ); err != nil { - return err - } - st.Inserted++ + normalizeAddIndexState(&st) tickNo := st.Ticked + 1 - // Every N ticks, add a new index on 1~3 columns. - if st.N > 0 && tickNo%st.N == 0 { - allCols := []string{"a", "b", "c", "d", "e"} - idxID := st.NextIndexID - idxName := fmt.Sprintf("idx_%d", idxID) - - colN := 1 + (idxID % 3) - start := idxID % len(allCols) - cols := make([]string, 0, colN) - for i := 0; i < colN; i++ { - cols = append(cols, allCols[(start+i)%len(allCols)]) - } - - exists, err := IndexExists(ctx, ctx.DB, st.DB, st.Table, idxName) - if err != nil { - return err - } - if !exists { - colSQL := make([]string, 0, len(cols)) - for _, c := range cols { - colSQL = append(colSQL, QIdent(c)) - } - stmt := "CREATE INDEX " + QIdent(idxName) + " ON " + QTable(st.DB, st.Table) + " (" + joinWithComma(colSQL) + ")" - if _, err := ctx.DB.ExecContext(ctx, stmt); err != nil { - return err - } - } - - spec := addIndexSpec{Name: idxName, Columns: cols} - if !hasAddIndexSpec(st.Indexes, idxName) { - st.Indexes = append(st.Indexes, spec) - } - if !hasAddIndexSpec(c.indexesAdded, idxName) { - c.indexesAdded = append(c.indexesAdded, spec) - } - st.NextIndexID++ + if err := addIndexInsertRow(ctx, &st); err != nil { + return err } - - // Every NR ticks, randomly drop an index. - if st.NR > 0 && tickNo%st.NR == 0 && len(st.Indexes) > 0 { - idx := pickIndex(st.Ticked, len(st.Indexes)) - dropSpec := st.Indexes[idx] - - exists, err := IndexExists(ctx, ctx.DB, st.DB, st.Table, dropSpec.Name) - if err != nil { - return err - } - if exists { - stmt := "DROP INDEX " + QIdent(dropSpec.Name) + " ON " + QTable(st.DB, st.Table) - if _, err := ctx.DB.ExecContext(ctx, stmt); err != nil { - return err - } - } - c.indexesDropped = append(c.indexesDropped, dropSpec) - st.Indexes = append(st.Indexes[:idx], st.Indexes[idx+1:]...) 
+ if err := c.maybeAddIndex(ctx, &st, tickNo); err != nil { + return err + } + if err := c.maybeDropIndex(ctx, &st, tickNo); err != nil { + return err } st.Ticked++ @@ -311,16 +247,88 @@ func hasAddIndexSpec(indexes []addIndexSpec, name string) bool { return false } -func joinWithComma(parts []string) string { - switch len(parts) { - case 0: - return "" - case 1: - return parts[0] +func normalizeAddIndexState(st *addIndexState) { + if st.N <= 0 { + st.N = 100 } - out := parts[0] - for _, p := range parts[1:] { - out += "," + p + if st.NR <= 0 { + st.NR = 150 + } + if st.NextIndexID < len(st.Indexes) { + st.NextIndexID = len(st.Indexes) } - return out +} + +func addIndexInsertRow(ctx TickContext, st *addIndexState) error { + v := int64(st.Inserted) + if _, err := ctx.DB.ExecContext(ctx, "INSERT INTO "+QTable(st.DB, st.Table)+" (a,b,c,d,e) VALUES (?,?,?,?,?)", + v, v*7+1, v*11+2, v*13+3, v*17+4, + ); err != nil { + return err + } + st.Inserted++ + return nil +} + +func (c *AddIndexCase) maybeAddIndex(ctx TickContext, st *addIndexState, tickNo int) error { + if !EveryNTick(tickNo, st.N) { + return nil + } + allCols := []string{"a", "b", "c", "d", "e"} + idxID := st.NextIndexID + idxName := fmt.Sprintf("idx_%d", idxID) + + colN := 1 + (idxID % 3) + start := idxID % len(allCols) + cols := make([]string, 0, colN) + for i := 0; i < colN; i++ { + cols = append(cols, allCols[(start+i)%len(allCols)]) + } + + exists, err := IndexExists(ctx, ctx.DB, st.DB, st.Table, idxName) + if err != nil { + return err + } + if !exists { + colSQL := make([]string, 0, len(cols)) + for _, col := range cols { + colSQL = append(colSQL, QIdent(col)) + } + stmt := "CREATE INDEX " + QIdent(idxName) + " ON " + QTable(st.DB, st.Table) + " (" + strings.Join(colSQL, ",") + ")" + if _, err := ctx.DB.ExecContext(ctx, stmt); err != nil { + return err + } + } + + spec := addIndexSpec{Name: idxName, Columns: cols} + if !hasAddIndexSpec(st.Indexes, idxName) { + st.Indexes = append(st.Indexes, spec) + } + if !hasAddIndexSpec(c.indexesAdded, idxName) { + c.indexesAdded = append(c.indexesAdded, spec) + } + st.NextIndexID++ + return nil +} + +func (c *AddIndexCase) maybeDropIndex(ctx TickContext, st *addIndexState, tickNo int) error { + if !EveryNTick(tickNo, st.NR) || len(st.Indexes) == 0 { + return nil + } + idx := ctx.RNG.IntN(len(st.Indexes)) + dropSpec := st.Indexes[idx] + + exists, err := IndexExists(ctx, ctx.DB, st.DB, st.Table, dropSpec.Name) + if err != nil { + return err + } + if exists { + stmt := "DROP INDEX " + QIdent(dropSpec.Name) + " ON " + QTable(st.DB, st.Table) + if _, err := ctx.DB.ExecContext(ctx, stmt); err != nil { + return err + } + } + c.indexesDropped = append(c.indexesDropped, dropSpec) + st.Indexes = append(st.Indexes[:idx], st.Indexes[idx+1:]...) 
+ return nil } diff --git a/pkg/testkit/brhelper/workload/context.go b/pkg/testkit/brhelper/workload/context.go index 67eac04ae1445..617626c08e873 100644 --- a/pkg/testkit/brhelper/workload/context.go +++ b/pkg/testkit/brhelper/workload/context.go @@ -18,6 +18,7 @@ import ( "context" "database/sql" "encoding/json" + "math/rand/v2" ) type Context struct { @@ -37,6 +38,7 @@ func (c Context) SetSummary(summary any) { type TickContext struct { Context + RNG *rand.Rand UpdateStateFn func(json.RawMessage) } diff --git a/pkg/testkit/brhelper/workload/modify_tiflash.go b/pkg/testkit/brhelper/workload/modify_tiflash.go index b0bb76e8b60ad..5f3cd8b7b05cc 100644 --- a/pkg/testkit/brhelper/workload/modify_tiflash.go +++ b/pkg/testkit/brhelper/workload/modify_tiflash.go @@ -41,7 +41,7 @@ type modifyTiFlashState struct { Replica int `json:"replica"` Checksum TableChecksum `json:"checksum"` - LogDone bool `json:"log_done"` + LogDone bool `json:"log_done"` } type replicaHistoryEntry struct { @@ -147,7 +147,7 @@ func (c *ModifyTiFlashCase) Tick(ctx TickContext, raw json.RawMessage) error { } st.Inserted++ - if st.N > 0 && tickNo%st.N == 0 { + if EveryNTick(tickNo, st.N) { max := st.NAP if max > 0 { next := tickNo % (max + 1) diff --git a/pkg/testkit/brhelper/workload/nexus_common.go b/pkg/testkit/brhelper/workload/nexus_common.go index c16d6338332b8..ed29d6c34af41 100644 --- a/pkg/testkit/brhelper/workload/nexus_common.go +++ b/pkg/testkit/brhelper/workload/nexus_common.go @@ -42,7 +42,7 @@ type nexusState struct { Tables []nexusTableState `json:"tables"` Checksums map[string]TableChecksum `json:"checksums,omitempty"` - LogDone bool `json:"log_done"` + LogDone bool `json:"log_done"` } type nexusSummary struct { @@ -107,14 +107,6 @@ func nexusInsertRow(ctx context.Context, db *sql.DB, schema, table string, tick return err } -func pickIndex(seed int, n int) int { - if n <= 0 { - return 0 - } - x := uint64(seed)*1103515245 + 12345 - return int(x % uint64(n)) -} - func nexusRecordChecksums(ctx context.Context, db *sql.DB, schema string, tables []nexusTableState) (map[string]TableChecksum, error) { out := make(map[string]TableChecksum, len(tables)) for _, t := range tables { @@ -126,12 +118,3 @@ func nexusRecordChecksums(ctx context.Context, db *sql.DB, schema string, tables } return out, nil } - -func containsString(ss []string, s string) bool { - for _, v := range ss { - if v == s { - return true - } - } - return false -} diff --git a/pkg/testkit/brhelper/workload/nexus_ddl.go b/pkg/testkit/brhelper/workload/nexus_ddl.go index 4c9e9254847f0..101ec5d7cce0a 100644 --- a/pkg/testkit/brhelper/workload/nexus_ddl.go +++ b/pkg/testkit/brhelper/workload/nexus_ddl.go @@ -19,6 +19,7 @@ import ( "database/sql" "encoding/json" "fmt" + "slices" ) type NexusDDLCase struct { @@ -87,7 +88,7 @@ func (c *NexusDDLCase) Tick(ctx TickContext, raw json.RawMessage) error { tickNo := st.Ticked + 1 half := nexusHalf(st.N) - if st.N > 0 && tickNo%(2*st.N) == 0 && len(st.Tables) > 0 { + if EveryNTick(tickNo, 2*st.N) && len(st.Tables) > 0 { oldest := st.Tables[0].Name stmt := "DROP TABLE IF EXISTS " + QTable(st.DB, oldest) if err := nexusExecDDL(ctx, ctx.DB, &c.ddls, tickNo, stmt); err != nil { @@ -96,7 +97,7 @@ func (c *NexusDDLCase) Tick(ctx TickContext, raw json.RawMessage) error { st.Tables = st.Tables[1:] } - if st.N > 0 && tickNo%st.N == 0 { + if EveryNTick(tickNo, st.N) { name := nexusTableName(st.NextTableID) st.NextTableID++ if err := nexusCreateTable(ctx, ctx.DB, &c.ddls, tickNo, st.DB, name); err != nil { @@ -105,14 +106,14 
@@ func (c *NexusDDLCase) Tick(ctx TickContext, raw json.RawMessage) error { st.Tables = append(st.Tables, nexusTableState{Name: name}) } - if tickNo%half == 0 && len(st.Tables) > 0 { + if EveryNTick(tickNo, half) && len(st.Tables) > 0 { youngest := &st.Tables[len(st.Tables)-1] if err := nexusAddOneColumn(ctx, ctx.DB, &st, &c.ddls, tickNo, youngest); err != nil { return err } } - if st.N > 0 && tickNo%st.N == 0 && len(st.Tables) > 0 { + if EveryNTick(tickNo, st.N) && len(st.Tables) > 0 { oldest := &st.Tables[0] if err := nexusDropOneColumn(ctx, ctx.DB, &st, &c.ddls, tickNo, oldest); err != nil { return err @@ -229,7 +230,7 @@ func nexusAddOneColumn(ctx context.Context, db *sql.DB, st *nexusState, ddls *[] return err } if exists { - if !containsString(t.Cols, col) { + if !slices.Contains(t.Cols, col) { t.Cols = append(t.Cols, col) } t.NextColID++ diff --git a/pkg/testkit/brhelper/workload/nexus_ddl_destructive.go b/pkg/testkit/brhelper/workload/nexus_ddl_destructive.go index a7ed96a4de68d..65c9316874297 100644 --- a/pkg/testkit/brhelper/workload/nexus_ddl_destructive.go +++ b/pkg/testkit/brhelper/workload/nexus_ddl_destructive.go @@ -80,7 +80,7 @@ func (c *NexusDDLDestructiveCase) Tick(ctx TickContext, raw json.RawMessage) err tickNo := st.Ticked + 1 half := nexusHalf(st.N) - if st.N > 0 && tickNo%st.N == 0 { + if EveryNTick(tickNo, st.N) { name := nexusTableName(st.NextTableID) st.NextTableID++ if err := nexusCreateTable(ctx, ctx.DB, &c.ddls, tickNo, st.DB, name); err != nil { @@ -89,8 +89,8 @@ func (c *NexusDDLDestructiveCase) Tick(ctx TickContext, raw json.RawMessage) err st.Tables = append(st.Tables, nexusTableState{Name: name}) } - if tickNo%half == 0 && len(st.Tables) > 0 { - idx := pickIndex(st.Ticked, len(st.Tables)) + if EveryNTick(tickNo, half) && len(st.Tables) > 0 { + idx := ctx.RNG.IntN(len(st.Tables)) oldName := st.Tables[idx].Name newName := nexusTableName(st.NextTableID) st.NextTableID++ @@ -101,8 +101,8 @@ func (c *NexusDDLDestructiveCase) Tick(ctx TickContext, raw json.RawMessage) err st.Tables[idx].Name = newName } - if st.N > 0 && tickNo%(2*st.N) == 0 && len(st.Tables) > 0 { - idx := pickIndex(st.Ticked*2+1, len(st.Tables)) + if EveryNTick(tickNo, 2*st.N) && len(st.Tables) > 0 { + idx := ctx.RNG.IntN(len(st.Tables)) stmt := "TRUNCATE TABLE " + QTable(st.DB, st.Tables[idx].Name) if err := nexusExecDDL(ctx, ctx.DB, &c.ddls, tickNo, stmt); err != nil { return err diff --git a/pkg/testkit/brhelper/workload/runner.go b/pkg/testkit/brhelper/workload/runner.go index 515a65850b44e..7628a50b23106 100644 --- a/pkg/testkit/brhelper/workload/runner.go +++ b/pkg/testkit/brhelper/workload/runner.go @@ -19,6 +19,7 @@ import ( "database/sql" "encoding/json" "fmt" + "hash/fnv" "math/rand/v2" "strings" "sync" @@ -33,7 +34,6 @@ type CaseSpec struct { type RunConfig struct { TickCount int TickInterval time.Duration - PersistEvery uint64 Seed int64 Parallel bool } @@ -98,9 +98,6 @@ func (r *Runner) Run(ctx context.Context, cfg RunConfig) (*Summary, error) { if cfg.TickInterval < 0 { return nil, fmt.Errorf("workload: TickInterval must be >= 0") } - if cfg.PersistEvery == 0 { - cfg.PersistEvery = ^uint64(0) - } states, err := r.store.GetAll(ctx) if err != nil { @@ -127,12 +124,13 @@ func (r *Runner) Run(ctx context.Context, cfg RunConfig) (*Summary, error) { } summary := NewSummary() + rngs := newCaseRNGs(cfg.Seed, selected) if cfg.Parallel { - if err := r.runParallelTicks(ctx, cfg, selected, states, summary); err != nil { + if err := r.runParallelTicks(ctx, cfg, selected, states, summary, 
rngs); err != nil { return nil, err } } else { - if err := r.runSequentialTicks(ctx, cfg, selected, states, summary); err != nil { + if err := r.runSequentialTicks(ctx, cfg, selected, states, summary, rngs); err != nil { return nil, err } } @@ -199,18 +197,21 @@ func (r *Runner) runSequentialTicks( selected []CaseSpec, states map[string]json.RawMessage, summary *Summary, + rngs map[string]*rand.Rand, ) error { - rng := rand.New(rand.NewPCG(uint64(cfg.Seed), uint64(cfg.Seed>>1))) + shuffleRNG := rand.New(rand.NewPCG(uint64(cfg.Seed), uint64(cfg.Seed>>1))) for tick := 0; tick < cfg.TickCount; tick++ { - rng.Shuffle(len(selected), func(i, j int) { selected[i], selected[j] = selected[j], selected[i] }) + shuffleRNG.Shuffle(len(selected), func(i, j int) { selected[i], selected[j] = selected[j], selected[i] }) for _, spec := range selected { state, ok := states[spec.Name] if !ok { return fmt.Errorf("workload: case %q not found in state store; run Prepare first", spec.Name) } + rng := rngs[spec.Name] tickCtx := TickContext{ Context: Context{Context: ctx, DB: r.db, CaseName: spec.Name, Summary: summary}, + RNG: rng, UpdateStateFn: func(updated json.RawMessage) { states[spec.Name] = updated }, @@ -220,20 +221,6 @@ func (r *Runner) runSequentialTicks( } } - if shouldPersistTick(tick, cfg.PersistEvery) { - flush := make(map[string]json.RawMessage, len(selected)) - for _, spec := range selected { - state, ok := states[spec.Name] - if !ok { - return fmt.Errorf("workload: case %q not found in state store; run Prepare first", spec.Name) - } - flush[spec.Name] = state - } - if err := r.store.PutMany(ctx, flush); err != nil { - return err - } - } - if tick != cfg.TickCount-1 { if err := sleep(ctx, cfg.TickInterval); err != nil { return err @@ -249,30 +236,14 @@ func (r *Runner) runParallelTicks( selected []CaseSpec, states map[string]json.RawMessage, summary *Summary, + rngs map[string]*rand.Rand, ) error { var mu sync.Mutex for tick := 0; tick < cfg.TickCount; tick++ { - if err := r.runParallelTick(ctx, selected, states, summary, &mu); err != nil { + if err := r.runParallelTick(ctx, selected, states, summary, rngs, &mu); err != nil { return err } - if shouldPersistTick(tick, cfg.PersistEvery) { - flush := make(map[string]json.RawMessage, len(selected)) - mu.Lock() - for _, spec := range selected { - state, ok := states[spec.Name] - if !ok { - mu.Unlock() - return fmt.Errorf("workload: case %q not found in state store; run Prepare first", spec.Name) - } - flush[spec.Name] = state - } - mu.Unlock() - if err := r.store.PutMany(ctx, flush); err != nil { - return err - } - } - if tick != cfg.TickCount-1 { if err := sleep(ctx, cfg.TickInterval); err != nil { return err @@ -287,6 +258,7 @@ func (r *Runner) runParallelTick( selected []CaseSpec, states map[string]json.RawMessage, summary *Summary, + rngs map[string]*rand.Rand, mu *sync.Mutex, ) error { if len(selected) == 0 { @@ -316,9 +288,11 @@ func (r *Runner) runParallelTick( }) return } + rng := rngs[spec.Name] tickCtx := TickContext{ Context: Context{Context: runCtx, DB: r.db, CaseName: spec.Name, Summary: summary}, + RNG: rng, UpdateStateFn: func(updated json.RawMessage) { mu.Lock() states[spec.Name] = updated @@ -338,6 +312,21 @@ func (r *Runner) runParallelTick( return firstErr } +func newCaseRNGs(seed int64, selected []CaseSpec) map[string]*rand.Rand { + out := make(map[string]*rand.Rand, len(selected)) + for _, spec := range selected { + out[spec.Name] = newCaseRNG(seed, spec.Name) + } + return out +} + +func newCaseRNG(seed int64, name string) 
*rand.Rand { + h := fnv.New64a() + _, _ = h.Write([]byte(name)) + seq := h.Sum64() | 1 + return rand.New(rand.NewPCG(uint64(seed), seq)) +} + func normalizeCaseSpecs(specs []CaseSpec) ([]CaseSpec, error) { out := make([]CaseSpec, 0, len(specs)) nameCounts := make(map[string]int, len(specs)) @@ -385,10 +374,3 @@ func sleep(ctx context.Context, d time.Duration) error { return nil } } - -func shouldPersistTick(tick int, persistEvery uint64) bool { - if persistEvery == 0 || persistEvery == ^uint64(0) { - return false - } - return uint64(tick+1)%persistEvery == 0 -} diff --git a/pkg/testkit/brhelper/workload/util.go b/pkg/testkit/brhelper/workload/util.go index cb276674e0e57..3f88b394dce71 100644 --- a/pkg/testkit/brhelper/workload/util.go +++ b/pkg/testkit/brhelper/workload/util.go @@ -176,3 +176,7 @@ func Require(cond bool, format string, args ...any) error { } return fmt.Errorf(format, args...) } + +func EveryNTick(tick int, n int) bool { + return n > 0 && tick%n == 0 +} diff --git a/tests/realtikvtest/brietest/segmented_restore_test.go b/tests/realtikvtest/brietest/segmented_restore_test.go index 7d360b6451ed6..e21cec9790ca3 100644 --- a/tests/realtikvtest/brietest/segmented_restore_test.go +++ b/tests/realtikvtest/brietest/segmented_restore_test.go @@ -48,7 +48,9 @@ func TestSegmentedRestoreWorkload(t *testing.T) { &workload.AddIndexCase{}, } if tiflashCount := tiflashStoreCount(t, kit.tk); tiflashCount > 0 { - cases = append(cases, &workload.ModifyTiFlashCase{}) + cases = append(cases, &workload.ModifyTiFlashCase{NAP: tiflashCount}) + } else { + t.Log("TiFlash not found in environment, won't run tiflash related cases.") } runner, err := workload.NewRunner(db, store, cases...) require.NoError(t, err) @@ -73,7 +75,6 @@ func TestSegmentedRestoreWorkload(t *testing.T) { runCfg := workload.RunConfig{ TickCount: 100, TickInterval: 0, - PersistEvery: ^uint64(0), Seed: 1, Parallel: true, } From f790632d228e583592a847453b35621e1bccc65f Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Wed, 28 Jan 2026 04:19:08 +0000 Subject: [PATCH 08/18] fix id remap Signed-off-by: Juncen Yu --- br/pkg/errors/errors.go | 1 + br/pkg/restore/log_client/client.go | 2 +- br/pkg/restore/log_client/id_map.go | 101 +++--------------- .../brietest/segmented_restore_test.go | 7 +- 4 files changed, 17 insertions(+), 94 deletions(-) diff --git a/br/pkg/errors/errors.go b/br/pkg/errors/errors.go index a21855f95c589..ca5cc3b06f1ae 100644 --- a/br/pkg/errors/errors.go +++ b/br/pkg/errors/errors.go @@ -88,6 +88,7 @@ var ( ErrPiTRTaskNotFound = errors.Normalize("task not found", errors.RFCCodeText("BR:PiTR:ErrTaskNotFound")) ErrPiTRInvalidTaskInfo = errors.Normalize("task info is invalid", errors.RFCCodeText("BR:PiTR:ErrInvalidTaskInfo")) ErrPiTRMalformedMetadata = errors.Normalize("malformed metadata", errors.RFCCodeText("BR:PiTR:ErrMalformedMetadata")) + ErrPiTRIDMapTableNotFound = errors.Normalize("id map table not found", errors.RFCCodeText("BR:PiTR:IDMapTableNotFound")) ErrStorageUnknown = errors.Normalize("unknown external storage error", errors.RFCCodeText("BR:ExternalStorage:ErrStorageUnknown")) ErrStorageInvalidConfig = errors.Normalize("invalid external storage config", errors.RFCCodeText("BR:ExternalStorage:ErrStorageInvalidConfig")) diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index a19d7931c64b7..923913f694ba1 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -1055,7 +1055,7 @@ func (rc *LogClient) GetBaseIDMapAndMerge( // schemas map 
whose `restore-ts`` is the task's `start-ts`. if len(dbMaps) <= 0 && !hasFullBackupStorageConfig { log.Info("try to load pitr id maps of the previous task", zap.Uint64("start-ts", rc.startTS)) - dbMaps, err = rc.loadSchemasMap(ctx, rc.startTS, logCheckpointMetaManager) + dbMaps, err = rc.loadSchemasMapFromLastTask(ctx, rc.startTS) if err != nil { return errors.Trace(err) } diff --git a/br/pkg/restore/log_client/id_map.go b/br/pkg/restore/log_client/id_map.go index 49d7a13c55308..208d66c5065fd 100644 --- a/br/pkg/restore/log_client/id_map.go +++ b/br/pkg/restore/log_client/id_map.go @@ -23,6 +23,7 @@ import ( backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/checkpoint" + berrors "github.com/pingcap/tidb/br/pkg/errors" "github.com/pingcap/tidb/br/pkg/metautil" "github.com/pingcap/tidb/br/pkg/restore" "github.com/pingcap/tidb/br/pkg/stream" @@ -161,48 +162,24 @@ func (rc *LogClient) loadSchemasMap( if checkpointStorage := rc.tryGetCheckpointStorage(logCheckpointMetaManager); checkpointStorage != nil { log.Info("checkpoint storage is specified, load pitr id map from the checkpoint storage.") dbMaps, err := rc.loadSchemasMapFromStorage(ctx, checkpointStorage, restoredTS) - if err != nil { - return nil, errors.Trace(err) - } - if len(dbMaps) > 0 { - return dbMaps, nil - } + return dbMaps, errors.Trace(err) } if rc.pitrIDMapTableExists() { - dbMaps, err := rc.loadSchemasMapFromTable(ctx, restoredTS) - if err != nil { - return nil, errors.Trace(err) - } - if len(dbMaps) > 0 { - return dbMaps, nil - } - // If we are loading the base map for a previous restore (restoredTS != rc.restoreTS), - // fall back to the latest restore_id for this restoredTS. - if restoredTS != rc.restoreTS && rc.pitrIDMapHasRestoreIDColumn() { - dbMaps, fallbackRestoreID, err := rc.loadSchemasMapFromTableLatestRestoreID(ctx, restoredTS) - if err != nil { - return nil, errors.Trace(err) - } - if len(dbMaps) > 0 { - log.Info("load pitr id map from latest restore_id for previous segment", - zap.Uint64("restored_ts", restoredTS), - zap.Uint64("restore_id", fallbackRestoreID), - zap.Uint64("current_restore_id", rc.restoreID)) - return dbMaps, nil - } - } - if rc.storage != nil { - log.Info("fallback to log backup storage for pitr id map", zap.Uint64("restored_ts", restoredTS)) - dbMaps, err := rc.loadSchemasMapFromStorage(ctx, rc.storage, restoredTS) - return dbMaps, errors.Trace(err) - } - return nil, nil + dbMaps, err := rc.loadSchemasMapFromTable(ctx, restoredTS, true) + return dbMaps, errors.Trace(err) } log.Info("the table mysql.tidb_pitr_id_map does not exist, maybe the cluster version is old.") dbMaps, err := rc.loadSchemasMapFromStorage(ctx, rc.storage, restoredTS) return dbMaps, errors.Trace(err) } +func (rc *LogClient) loadSchemasMapFromLastTask(ctx context.Context, lastRestoredTS uint64) ([]*backuppb.PitrDBMap, error) { + if !rc.pitrIDMapTableExists() { + return nil, errors.Annotatef(berrors.ErrPiTRIDMapTableNotFound, "segmented restore is impossible") + } + return rc.loadSchemasMapFromTable(ctx, lastRestoredTS, false) +} + func (rc *LogClient) loadSchemasMapFromStorage( ctx context.Context, storage storeapi.Storage, @@ -233,19 +210,18 @@ func (rc *LogClient) loadSchemasMapFromStorage( func (rc *LogClient) loadSchemasMapFromTable( ctx context.Context, restoredTS uint64, + onlyThisRestore bool, ) ([]*backuppb.PitrDBMap, error) { - hasRestoreIDColumn := rc.pitrIDMapHasRestoreIDColumn() + useRestoreIDFilter := onlyThisRestore && rc.pitrIDMapHasRestoreIDColumn() var 
getPitrIDMapSQL string var args []any - if hasRestoreIDColumn { + if useRestoreIDFilter { // new version with restore_id column getPitrIDMapSQL = "SELECT segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restore_id = %? and restored_ts = %? and upstream_cluster_id = %? ORDER BY segment_id;" args = []any{rc.restoreID, restoredTS, rc.upstreamClusterID} } else { - // old version without restore_id column - log.Info("mysql.tidb_pitr_id_map table does not have restore_id column, using backward compatible mode") getPitrIDMapSQL = "SELECT segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %? ORDER BY segment_id;" args = []any{restoredTS, rc.upstreamClusterID} } @@ -283,52 +259,3 @@ func (rc *LogClient) loadSchemasMapFromTable( return backupMeta.GetDbMaps(), nil } - -func (rc *LogClient) loadSchemasMapFromTableLatestRestoreID( - ctx context.Context, - restoredTS uint64, -) ([]*backuppb.PitrDBMap, uint64, error) { - getPitrIDMapSQL := "SELECT restore_id, segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %? ORDER BY restore_id DESC, segment_id;" - args := []any{restoredTS, rc.upstreamClusterID} - - execCtx := rc.unsafeSession.GetSessionCtx().GetRestrictedSQLExecutor() - rows, _, errSQL := execCtx.ExecRestrictedSQL( - kv.WithInternalSourceType(ctx, kv.InternalTxnBR), - nil, - getPitrIDMapSQL, - args..., - ) - if errSQL != nil { - return nil, 0, errors.Annotatef(errSQL, "failed to get pitr id map from mysql.tidb_pitr_id_map") - } - if len(rows) == 0 { - log.Info("pitr id map does not exist", zap.Uint64("restored ts", restoredTS)) - return nil, 0, nil - } - - targetRestoreID := rows[0].GetUint64(0) - expectedSegmentID := uint64(0) - metaData := make([]byte, 0, len(rows)*PITRIdMapBlockSize) - for _, row := range rows { - restoreID := row.GetUint64(0) - if restoreID != targetRestoreID { - break - } - elementID := row.GetUint64(1) - if expectedSegmentID != elementID { - return nil, 0, errors.Errorf("the part(segment_id = %d) of pitr id map is lost", expectedSegmentID) - } - d := row.GetBytes(2) - if len(d) == 0 { - return nil, 0, errors.Errorf("get the empty part(segment_id = %d) of pitr id map", expectedSegmentID) - } - metaData = append(metaData, d...) 
- expectedSegmentID += 1 - } - backupMeta := &backuppb.BackupMeta{} - if err := backupMeta.Unmarshal(metaData); err != nil { - return nil, 0, errors.Trace(err) - } - - return backupMeta.GetDbMaps(), targetRestoreID, nil -} diff --git a/tests/realtikvtest/brietest/segmented_restore_test.go b/tests/realtikvtest/brietest/segmented_restore_test.go index e21cec9790ca3..174a699b1f223 100644 --- a/tests/realtikvtest/brietest/segmented_restore_test.go +++ b/tests/realtikvtest/brietest/segmented_restore_test.go @@ -79,7 +79,7 @@ func TestSegmentedRestoreWorkload(t *testing.T) { Parallel: true, } - for i := 0; i < 4; i++ { + for range 4 { _, err := runner.Run(ctx, runCfg) require.NoError(t, err) kit.forceFlushAndWait(taskName) @@ -104,15 +104,10 @@ func TestSegmentedRestoreWorkload(t *testing.T) { rc.LastRestore = idx == len(checkpoints)-1 rc.IsLastRestoreUserSpecified = true rc.UseCheckpoint = true - rc.CheckpointStorage = kit.LocalURI("checkpoint") if idx > 0 { rc.StartTS = checkpoints[idx-1] rc.FullBackupStorage = "" } - kit.SetFilter(&rc.Config, "test*.*") - if idx != len(checkpoints)-1 { - rc.ExplicitFilter = false - } }) } From 018ff1bb6b17fba05698df5cb949d3ef3a462f44 Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Wed, 28 Jan 2026 04:19:30 +0000 Subject: [PATCH 09/18] remove agent docs Signed-off-by: Juncen Yu --- .../findings/lifecycle_of_recorders.md | 226 ------------------ __agent_doc/main.md | 53 ---- __agent_doc/segmented_restore_plan.md | 62 ----- 3 files changed, 341 deletions(-) delete mode 100644 __agent_doc/findings/lifecycle_of_recorders.md delete mode 100644 __agent_doc/main.md delete mode 100644 __agent_doc/segmented_restore_plan.md diff --git a/__agent_doc/findings/lifecycle_of_recorders.md b/__agent_doc/findings/lifecycle_of_recorders.md deleted file mode 100644 index d12b40417b805..0000000000000 --- a/__agent_doc/findings/lifecycle_of_recorders.md +++ /dev/null @@ -1,226 +0,0 @@ -# DDL Special Handling During Log Meta Restore - -This document summarizes **only the DDL-related behaviors that are handled specially** during log/meta restore. All items below are confirmed by code paths in the current repository (no inference beyond what the code checks). - -## 0) Recorder lifecycles (delRangeRecorder / ingestRecorder / TiflashRecorder) - -This section describes **when each recorder is created, populated, and consumed**. - -### delRangeRecorder (GC delete-range) - -- Created: - - `br/pkg/task/stream.go` → `buildSchemaReplace` - - `stream.NewSchemasReplace(..., recordDeleteRange=client.RecordDeleteRange, ...)` - - `br/pkg/stream/rewrite_meta_rawkv.go` → `NewSchemasReplace` - - `delRangeRecorder: newDelRangeExecWrapper(globalTableIdMap, recordDeleteRange)` - -- Populated: - - `br/pkg/stream/rewrite_meta_rawkv.go` → `SchemasReplace.RewriteMetaKvEntry` - - On Default CF + `utils.IsMetaDDLJobHistoryKey(e.Key)`: decode `model.Job` and call `processIngestIndexAndDeleteRangeFromJob(job)` - - `br/pkg/stream/rewrite_meta_rawkv.go` → `processIngestIndexAndDeleteRangeFromJob` - - `if ddl.JobNeedGC(job)` → `ddl.AddDelRangeJobInternal(..., sr.delRangeRecorder, job)` - - `br/pkg/stream/rewrite_meta_rawkv.go` → `brDelRangeExecWrapper.ConsumeDeleteRange` - - calls `recordDeleteRange(*PreDelRangeQuery)` (wired to `LogClient.RecordDeleteRange`). 
- -- Buffered / thread-safety: - - `br/pkg/restore/log_client/batch_meta_processor.go` → `RestoreMetaKVProcessor.RestoreAndRewriteMetaKVFiles` - - starts loader: `rp.client.RunGCRowsLoader(ctx)` - - `br/pkg/restore/log_client/client.go`: - - `RecordDeleteRange` pushes into `deleteRangeQueryCh` - - `RunGCRowsLoader` drains `deleteRangeQueryCh` into `rc.deleteRangeQuery` - -- Consumed: - - `br/pkg/task/stream.go` → `restoreStream` (after KV restore) - - `client.InsertGCRows(ctx)` - - `br/pkg/restore/log_client/client.go` → `InsertGCRows` - - closes channel, waits loader, and inserts into `gc_delete_range`. - -### ingestRecorder (ingest index repair) - -- Created: - - `br/pkg/stream/rewrite_meta_rawkv.go` → `NewSchemasReplace` - - `ingestRecorder: ingestrec.New()` - -- Populated: - - `br/pkg/stream/rewrite_meta_rawkv.go` → `SchemasReplace.RewriteMetaKvEntry` - - On Default CF + `mDDLJobHistory`: decode `model.Job` → `processIngestIndexAndDeleteRangeFromJob(job)` - - `br/pkg/stream/rewrite_meta_rawkv.go` → `tryRecordIngestIndex` - - For `ActionMultiSchemaChange`: expands subjobs - - Otherwise: `sr.ingestRecorder.TryAddJob(job, ...)` - - `br/pkg/restore/ingestrec/ingest_recorder.go` → `TryAddJob` - - only records ingest reorg jobs for `ActionAddIndex` / `ActionAddPrimaryKey` / `ActionModifyColumn` (and state constraints). - -- Rewritten (table ID mapping after meta restore): - - `br/pkg/task/stream.go` → `restoreStream` - - `ingestRecorder := schemasReplace.GetIngestRecorder()` - - `rangeFilterFromIngestRecorder(ingestRecorder, rewriteRules)` - - `br/pkg/task/stream.go` → `rangeFilterFromIngestRecorder` - - `ingestRecorder.RewriteTableID(...)` based on `rewriteRules`. - -- Consumed: - - `br/pkg/task/stream.go` → `restoreStream` (after KV restore) - - `client.RepairIngestIndex(ctx, ingestRecorder, cfg.logCheckpointMetaManager, g)` - - `br/pkg/restore/log_client/client.go` → `RepairIngestIndex` - - calls `ingestRecorder.UpdateIndexInfo(..., InfoSchema)` then `Iterate(...)` to generate and execute SQL. - - may load/save generated SQLs via checkpoint meta manager (see `generateRepairIngestIndexSQLs`). - -### TiflashRecorder (TiFlash replica stripping + later restore) - -- Created: - - `br/pkg/task/stream.go` → `RunPointInTimeRestore` - - `cfg.tiflashRecorder = tiflashrec.New()` - -- Populated and applied during meta KV replay: - - Hook wiring: - - `br/pkg/task/stream.go` → `buildSchemaReplace` - - sets `schemasReplace.AfterTableRewrittenFn = func(deleted bool, tableInfo *model.TableInfo) { ... 
}` - - Hook invocation: - - `br/pkg/stream/rewrite_meta_rawkv.go` → `rewriteTableInfo` - - calls `sr.AfterTableRewrittenFn(false, &tableInfo)` on normal rewrite - - `br/pkg/stream/rewrite_meta_rawkv.go` → `rewriteEntryForTable` - - on deletion calls `sr.AfterTableRewrittenFn(true, &model.TableInfo{ID: newTableID})` - - Hook behavior: - - `br/pkg/task/stream.go` → `buildSchemaReplace` - - records current `tableInfo.TiFlashReplica` into `cfg.tiflashRecorder` (or `DelTable` when deleted / nil) - - **removes** replica info from restored meta: `tableInfo.TiFlashReplica = nil` - -- Checkpoint persistence: - - Save: - - `br/pkg/restore/log_client/client.go` → `LoadOrCreateCheckpointMetadataForLogRestore` - - `CheckpointMetadataForLogRestore.TiFlashItems = tiflashRecorder.GetItems()` - - Load: - - `br/pkg/task/stream.go` → `RunPointInTimeRestore` - - when skipping full restore due to checkpoint: `cfg.tiflashRecorder.Load(taskInfo.CheckpointInfo.Metadata.TiFlashItems)` - -- Consumed: - - SQL generation: - - `br/pkg/restore/tiflashrec/tiflash_recorder.go` - - `GenerateAlterTableDDLs(InfoSchema)` generates `ALTER TABLE ... SET TIFLASH REPLICA ...` - - Execution: - - `br/pkg/task/stream.go` → `restoreStream` - - `sqls := cfg.tiflashRecorder.GenerateAlterTableDDLs(mgr.GetDomain().InfoSchema())` - - `client.ResetTiflashReplicas(ctx, sqls, g)` - -## 1) DDL job history extraction (mDDLJobHistory) -**What is special:** DDL job history entries are *not restored as meta KV*; instead they are decoded and used for two special cases (ingest index repair and delete-range GC). - -**Path:** -- `br/pkg/stream/rewrite_meta_rawkv.go` - - `SchemasReplace.RewriteMetaKvEntry` - - Checks `utils.IsMetaDDLJobHistoryKey(e.Key)` on **Default CF** - - Decodes `model.Job` and routes to `processIngestIndexAndDeleteRangeFromJob` - -**Behavior:** -- DDL job history is parsed and **not written back** as meta KV (`return nil, ...`). - -## 2) Ingest index DDL jobs (repair by replay) -**What is special:** Ingest-mode index builds are *not included in log backup KV*, so they are recorded and later repaired via SQL. - -**Paths:** -- Capture from DDL job history: - - `br/pkg/stream/rewrite_meta_rawkv.go` - - `processIngestIndexAndDeleteRangeFromJob` - - `tryRecordIngestIndex` - - For `ActionMultiSchemaChange`: expands to sub-jobs - - Otherwise: `ingestrec.IngestRecorder.TryAddJob` -- Recording logic: - - `br/pkg/restore/ingestrec/ingest_recorder.go` - - `TryAddJob` - - **Only records** when all conditions are true: - - `job.ReorgMeta.ReorgTp == model.ReorgTypeIngest` - - `job.Type` is **one of** `ActionAddIndex`, `ActionAddPrimaryKey`, `ActionModifyColumn` - - Job is synced (or sub-job done) -- Repair execution: - - `br/pkg/restore/log_client/client.go` - - `RepairIngestIndex` - - `generateRepairIngestIndexSQLs` - -**Behavior:** -- Only the job types above are recorded for ingest repair. -- The repair uses **latest InfoSchema** to build ADD INDEX/PRIMARY KEY SQL. - -## 3) DDL jobs that require GC delete-range -**What is special:** For DDL jobs where TiDB normally relies on GC to clean ranges, the delete-range is recorded and executed explicitly after restore. 
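
For orientation, the routing implied by sections 1–3 can be sketched as plain Go. Here `ddlJob`, `jobNeedsGC`, and `ingestRepairable` are simplified stand-ins for `model.Job`, `ddl.JobNeedGC`, and the `TryAddJob` conditions, and the job-type strings are illustrative only; the real predicate lives in the DDL package.

```go
package main

import "fmt"

// ddlJob keeps only the fields a router would need from a decoded
// mDDLJobHistory entry.
type ddlJob struct {
	Type        string // e.g. "add index", "drop table", "create table"
	IngestReorg bool   // whether the index was built through the ingest path
	Synced      bool
}

// jobNeedsGC stands in for the ddl.JobNeedGC predicate; the job types listed
// here are examples, not the authoritative set.
func jobNeedsGC(j ddlJob) bool {
	switch j.Type {
	case "drop table", "drop index", "truncate table", "drop schema":
		return true
	}
	return false
}

// ingestRepairable mirrors the recording conditions described in section 2:
// only synced ingest-reorg index builds are recorded for later SQL replay.
func ingestRepairable(j ddlJob) bool {
	if !j.IngestReorg || !j.Synced {
		return false
	}
	switch j.Type {
	case "add index", "add primary key", "modify column":
		return true
	}
	return false
}

// routeJob shows the two special cases: the job is never written back as meta
// KV, it only feeds the ingest recorder and/or the delete-range recorder.
func routeJob(j ddlJob, recordIngest, recordDelRange func(ddlJob)) {
	if ingestRepairable(j) {
		recordIngest(j)
	}
	if jobNeedsGC(j) {
		recordDelRange(j)
	}
}

func main() {
	jobs := []ddlJob{
		{Type: "add index", IngestReorg: true, Synced: true},
		{Type: "drop table"},
		{Type: "create table"},
	}
	for _, j := range jobs {
		routeJob(j,
			func(j ddlJob) { fmt.Println("ingest repair later:", j.Type) },
			func(j ddlJob) { fmt.Println("record delete range:", j.Type) },
		)
	}
}
```

Only the routing is sketched; as noted in this section, the actual set of GC-requiring job types is decided by the DDL package, not enumerated in BR.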
- -**Paths:** -- Capture from DDL job history: - - `br/pkg/stream/rewrite_meta_rawkv.go` - - `processIngestIndexAndDeleteRangeFromJob` - - `if ddl.JobNeedGC(job)` → `ddl.AddDelRangeJobInternal(..., brDelRangeExecWrapper, job)` -- Delete-range recording: - - `br/pkg/stream/rewrite_meta_rawkv.go` - - `brDelRangeExecWrapper` (captures SQL + params) -- Execution after restore: - - `br/pkg/restore/log_client/client.go` - - `RunGCRowsLoader` / `InsertGCRows` - -**Behavior:** -- **Only jobs where** `ddl.JobNeedGC(job)` is true are handled. -- The code does **not** list job types explicitly; the DDL package decides. - -## 4) Table deletion / recreation tracking (DDL effects) -**What is special:** Deleted tables are tracked to refresh metadata in a dependency-safe order, and a re-created table is removed from the delete list. - -**Paths:** -- `br/pkg/stream/rewrite_meta_rawkv.go` - - `rewriteEntryForTable` - - When write CF indicates deletion: add to `deletedTables` - - When a new table meta is written after deletion: remove from `deletedTables` - - Comment references **RENAME TABLE** and **EXCHANGE PARTITION** sequences -- `br/pkg/restore/log_client/client.go` - - `RefreshMetaForTables` uses `deletedTables` to refresh meta in order - -**Behavior:** -- Delete + re-create sequences (e.g., rename/exchange partition) are handled to avoid stale refresh. - -## 5) DDL/meta filtering to only restore DB + DDL history -**What is special:** During meta KV restore, only `mDB` and `mDDLJobHistory` keys are considered; other meta keys are skipped. - -**Path:** -- `br/pkg/restore/log_client/log_file_manager.go` - - `ReadFilteredEntriesFromFiles` - - `if !utils.IsDBOrDDLJobHistoryKey(txnEntry.Key) { continue }` - -**Behavior:** -- This limits meta restore scope to database info and DDL job history. - -## 6) TiFlash replica (ALTER TABLE ... SET TIFLASH REPLICA) - -**What is special:** During PiTR, TiFlash replica config is **intentionally stripped out of restored table meta** and restored later via SQL. - -This impacts any historical/meta changes whose effect is persisted into `TableInfo.TiFlashReplica`, including the DDL `ALTER TABLE ... SET TIFLASH REPLICA ...`. - -**Paths:** - -- Strip replica info while replaying meta KV (and record it): - - `br/pkg/task/stream.go` → `buildSchemaReplace` - - assigns `SchemasReplace.AfterTableRewrittenFn` - - in callback: - - updates `cfg.tiflashRecorder` via `AddTable/DelTable` - - then `tableInfo.TiFlashReplica = nil` - - `br/pkg/stream/rewrite_meta_rawkv.go` → `rewriteTableInfo` - - calls `AfterTableRewrittenFn(false, &tableInfo)` during table meta rewrite - -- Persist to checkpoint (so retries keep the same intended replica config): - - `br/pkg/restore/log_client/client.go` → `LoadOrCreateCheckpointMetadataForLogRestore` - - saves `CheckpointMetadataForLogRestore.TiFlashItems` - - `br/pkg/task/stream.go` → `RunPointInTimeRestore` - - loads `cfg.tiflashRecorder.Load(...TiFlashItems...)` when resuming from checkpoint - -- Restore replica config after PiTR finishes: - - `br/pkg/task/stream.go` → `restoreStream` - - `cfg.tiflashRecorder.GenerateAlterTableDDLs(InfoSchema)` - - `client.ResetTiflashReplicas(ctx, sqls, g)` - - `br/pkg/restore/log_client/client.go` → `ResetTiflashReplicas` - ---- - -## Summary (no inference) -From the current code paths, DDL-related special handling during log/meta restore is limited to: -1) **Ingest index repair** (specific ingest reorg DDL types only). -2) **Delete-range GC** for DDL jobs where `ddl.JobNeedGC(job)` is true. 
-3) **Table delete/recreate tracking** affecting meta refresh order (rename/exchange patterns). -4) **Meta filtering** to `mDB` + `mDDLJobHistory` keys only. -5) **TiFlash replica stripping + later restore** (restore-time `ALTER TABLE ... SET TIFLASH REPLICA ...`). - -No other DDL job types are explicitly enumerated or handled beyond these code paths. diff --git a/__agent_doc/main.md b/__agent_doc/main.md deleted file mode 100644 index 1766897150257..0000000000000 --- a/__agent_doc/main.md +++ /dev/null @@ -1,53 +0,0 @@ -# Your epic - -You need to implement a feature -- segmented restore. - -Now, `br restore point` requires the user to finish restore to a point-in-time within oneshot. - -We want `br restore point` can be "segmented", like: - -``` -br restore point --pd 127.0.0.1:2379 -s local:///Volumes/eXternal/Cache/tmp/20260122_003925/incr --restored-ts 463736294698909699 --full-backup-storage local:///Volumes/eXternal/Cache/tmp/20260122_003925/full --last=false -/br restore point --pd 127.0.0.1:2379 -s local:///Volumes/eXternal/Cache/tmp/20260122_003925/incr --restored-ts 463736295708426243 --start-ts 463736294698909699 --last=false -/br restore point --pd 127.0.0.1:2379 -s local:///Volumes/eXternal/Cache/tmp/20260122_003925/incr --restored-ts ... --start-ts 463736295708426243 --last=true -``` - -But for now, it is impossible. There are something we have known: tiflash replicas may be added back too early, indices may not be properly recreated... But more are unknown unknown. - -This is an epic to make it work. Be patient. You cannot fix all within one edition. You have git access, commit or rollback your work when need. `AGENT.md` may told you to enable failpoint before running test, don't follow that as we are running "integration" tests, also you may ignore all `bazel` related requirements, repeat, don't enable failpoints, don't try to build with bazel or `make bazel_prepare`. - -You may find that something is fully out of your scope. Say, the test environment was broken. In that scenario, **don't** try to hack, just stop and ask for help. Again, don't try to hack. - -If you have made some progress, record them in `__agent_doc/`, for those who come later. - -Suggestions: -- You may not be able to solve the problem in one "sprint". Always record your plan to `__agent_doc` before start. For your next run. - -## To budld BR - -``` -make build_br -``` - -## To run our test cases - -``` -bash /Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh -``` - -This command can only be run without sandbox. Request access to it before you start to do anything. - -This command runs for minutes, again, this command runs for minutes. Don't set a too short time out. - -Once this test script passes, our epic reaches its happy ending. - -Reading its content and record its behavior can be a good start point. - -## After Happy Ending (1) - -All tests are passed. It is time to tidy up our codebase. Inspect recent commits you did, and refactor your modifications with DRY principle. - -## Integrated Test (2026-01-23) -- The segmented PiTR external test is now integrated as `br/tests/br_pitr_segmented_restore/run.sh`. -- The workload source lives under `br/tests/seg_pitr_workload`; the test builds it via `go build`. -- Run via `TEST_NAME=br_pitr_segmented_restore br/tests/run.sh` or include it in `br/tests/run_group_br_tests.sh` (G07). 
diff --git a/__agent_doc/segmented_restore_plan.md b/__agent_doc/segmented_restore_plan.md deleted file mode 100644 index ae9fb0811d403..0000000000000 --- a/__agent_doc/segmented_restore_plan.md +++ /dev/null @@ -1,62 +0,0 @@ -# Segmented Restore Plan (WIP) - -## Plan (2026-01-23, current run) -1) Read `/Volumes/eXternal/Developer/seg-pitr-workload` to understand external test structure and expected integration points. -2) Integrate the external test into this repo (tests, scripts, or CI hooks) with minimal duplication. -3) Build BR and run the integrated test (requires unsandboxed access) to verify behavior. -4) Record outcomes, gaps, and follow-ups here. - -## Plan (2026-01-22, current run) -1) Run `/Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh` (requires unsandboxed access) to confirm current failure state. -2) If it fails, trace the failing path in BR restore (especially segmented point restore) and implement the minimal fix. -3) Re-run the script to verify, then record outcomes and follow-up risks here. - -## Plan (2025-xx-xx, current sprint) -1) Read `/Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh` to understand the failing step and expected output. -2) Run the script (requires unsandboxed access) to reproduce the current failure and capture the exact error text. -3) Trace where the `AddIndex` checksum / Total_kvs mismatch is generated and ensure the error message is formatted as `Error: AddIndex: Total_kvs mismatch: ...`. -4) Fix any segmented-restore gaps discovered along the way and update this doc with results. - -## Current Status -- Ran `/Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh` (first run). -- Failure in second segment restore: - - Error: `no base id map found from saved id or last restored PiTR` - - Stack: `br/pkg/restore/log_client/client.go:1068` via `GetBaseIDMapAndMerge`. - -## Hypothesis -- `tidb_pitr_id_map` has `restore_id` column; id maps are saved with restore_id from the first segment. -- Subsequent segments create a new restore_id, so loading by restore_id + restored_ts fails. - -## Plan -1) Update id-map loading to fall back to the latest `restore_id` for a given `restored_ts` when loading base maps (start-ts path). -2) Re-run `/Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh` to find next failure. -3) Iterate on remaining segmented-restore gaps (e.g., tiflash replica handling, ingest index repair), recording findings here. - -## Progress -- Implemented id-map fallback for previous segments in `br/pkg/restore/log_client/id_map.go`. -- Rebuilt BR (`make build_br`). -- Re-ran `/Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh` successfully. - - Now it fails. Tiflash cases are added. - - Error: exist table(s) have tiflash replica, please remove it before restore -- Added TiFlash recorder persistence across segments and only reset replicas on the final segment. -- Auto-treat `--last` as false when `--restored-ts` is set to a non-max TS and `--last` wasn't explicitly specified. -- Rebuilt BR and re-ran `/Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh`: **PASS**. -## Progress (2026-01-22) -- Reproduced failure: `AddIndex: Total_kvs mismatch` after segmented restore. -- Root cause: running ingest index repair in non-final segments writes new MVCC meta versions that block later log restore, leaving extra indexes. -- Fix: - - Persist ingest recorder items across segments. - - Skip ingest index repair until `--last=true`, then repair once. 
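
For illustration, a minimal sketch of the "persist now, repair on the last segment" idea in the fix above. The JSON file, `ingestItem`, and `finishSegment` are hypothetical stand-ins; BR's real persistence goes through the checkpoint meta manager and the ingest recorder, not a local file.

```go
package main

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
)

// ingestItem is a simplified stand-in for one recorded ingest-built index
// that must be re-created by SQL after the final segment.
type ingestItem struct {
	Schema  string   `json:"schema"`
	Table   string   `json:"table"`
	Index   string   `json:"index"`
	Columns []string `json:"columns"`
}

// persistItems appends this segment's items so a later segment can pick them up.
func persistItems(dir string, items []ingestItem) error {
	old, _ := loadItems(dir) // best effort; a missing file is fine on the first segment
	buf, err := json.Marshal(append(old, items...))
	if err != nil {
		return err
	}
	return os.WriteFile(filepath.Join(dir, "ingest_items.json"), buf, 0o600)
}

func loadItems(dir string) ([]ingestItem, error) {
	buf, err := os.ReadFile(filepath.Join(dir, "ingest_items.json"))
	if err != nil {
		return nil, err
	}
	var items []ingestItem
	return items, json.Unmarshal(buf, &items)
}

// finishSegment defers the actual repair: non-final segments only persist,
// the final segment replays everything as ADD INDEX statements.
func finishSegment(dir string, items []ingestItem, last bool) error {
	if !last {
		return persistItems(dir, items)
	}
	all, err := loadItems(dir)
	if err != nil && !os.IsNotExist(err) {
		return err
	}
	for _, it := range append(all, items...) {
		fmt.Printf("ALTER TABLE `%s`.`%s` ADD INDEX `%s` (...)\n", it.Schema, it.Table, it.Index)
	}
	return nil
}

func main() {
	dir := os.TempDir()
	_ = finishSegment(dir, []ingestItem{{Schema: "test", Table: "t", Index: "idx_a", Columns: []string{"a"}}}, false)
	_ = finishSegment(dir, nil, true) // final segment: replay all recorded indexes
}
```

Only the gating on the final segment is the point here; in the actual change the same deferral applies to the TiFlash replica reset as well.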
-- Rebuilt BR and re-ran `/Volumes/eXternal/Developer/seg-pitr-workload/scripts/run.sh`: **PASS**. -## Progress (2026-01-xx) -- DRY cleanup: refactored PiTR ingest/tiflash item persistence to share JSON storage helpers and checkpoint fallback logic. -- Deduplicated ingest item counting by adding `ingestrec.CountItems` and reusing it in log client + stream restore logging. -## Progress (2026-02-xx) -- Unify PiTR ingest/tiflash item persistence with checkpoint storage type (table vs external). -- Added checkpoint-side PiTR item store and moved load/save logic off log backup storage. -- Guard segmented restore: non-final segments now require checkpoint storage. - -## Progress (2026-01-23) -- Integrated the segmented PiTR workload into `br/tests/seg_pitr_workload` and added a new br test `br/tests/br_pitr_segmented_restore/run.sh`. -- Switched workload state storage to a JSON file (no sqlite dependency) and updated CLI flags accordingly. -- Added the new test to `br/tests/run_group_br_tests.sh` (G07). From bf95ad7d0fe28f35edc6f5d4d6045843f8eeda8f Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Wed, 28 Jan 2026 06:01:31 +0000 Subject: [PATCH 10/18] tidy up test cases Signed-off-by: Juncen Yu --- .../brietest/workloadcases/nexus_ddl.go | 250 ++++++++++++++++++ 1 file changed, 250 insertions(+) create mode 100644 tests/realtikvtest/brietest/workloadcases/nexus_ddl.go diff --git a/tests/realtikvtest/brietest/workloadcases/nexus_ddl.go b/tests/realtikvtest/brietest/workloadcases/nexus_ddl.go new file mode 100644 index 0000000000000..456ae457ab352 --- /dev/null +++ b/tests/realtikvtest/brietest/workloadcases/nexus_ddl.go @@ -0,0 +1,250 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package workloadcases + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + "slices" + + "github.com/pingcap/tidb/pkg/testkit/brhelper/workload" +) + +type NexusDDLCase struct { + Suffix string `json:"suffix"` + N int `json:"n"` +} + +func (c *NexusDDLCase) Name() string { return "NexusDDL" } + +func (c *NexusDDLCase) Prepare(ctx workload.Context) (json.RawMessage, error) { + suffix := c.Suffix + if suffix == "" { + var err error + suffix, err = workload.RandSuffix() + if err != nil { + return nil, err + } + } + n := c.N + if n <= 0 { + n = 50 + } + st := nexusState{ + Suffix: suffix, + DB: fmt.Sprintf("test_nexus_ddl_%s", suffix), + N: n, + Ticked: 0, + NextTableID: 1, + Tables: []nexusTableState{{Name: "t_0"}}, + } + if err := nexusExecDDL(ctx, ctx.DB, 0, "CREATE DATABASE IF NOT EXISTS "+workload.QIdent(st.DB)); err != nil { + return nil, err + } + if err := nexusCreateTable(ctx, ctx.DB, 0, st.DB, st.Tables[0].Name); err != nil { + return nil, err + } + return json.Marshal(st) +} + +func (c *NexusDDLCase) Tick(ctx workload.TickContext, raw json.RawMessage) error { + var st nexusState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + st.N = nexusDefaultN(st.N) + if st.NextTableID <= 0 { + st.NextTableID = len(st.Tables) + } + for i := range st.Tables { + if st.Tables[i].NextColID < len(st.Tables[i].Cols) { + st.Tables[i].NextColID = len(st.Tables[i].Cols) + } + } + + tickNo := st.Ticked + 1 + half := nexusHalf(st.N) + + if workload.EveryNTick(tickNo, 2*st.N) && len(st.Tables) > 0 { + oldest := st.Tables[0].Name + stmt := "DROP TABLE IF EXISTS " + workload.QTable(st.DB, oldest) + if err := nexusExecDDL(ctx, ctx.DB, tickNo, stmt); err != nil { + return err + } + st.Tables = st.Tables[1:] + } + + if workload.EveryNTick(tickNo, st.N) { + name := nexusTableName(st.NextTableID) + st.NextTableID++ + if err := nexusCreateTable(ctx, ctx.DB, tickNo, st.DB, name); err != nil { + return err + } + st.Tables = append(st.Tables, nexusTableState{Name: name}) + } + + if workload.EveryNTick(tickNo, half) && len(st.Tables) > 0 { + youngest := &st.Tables[len(st.Tables)-1] + if err := nexusAddOneColumn(ctx, ctx.DB, &st, tickNo, youngest); err != nil { + return err + } + } + + if workload.EveryNTick(tickNo, st.N) && len(st.Tables) > 0 { + oldest := &st.Tables[0] + if err := nexusDropOneColumn(ctx, ctx.DB, &st, tickNo, oldest); err != nil { + return err + } + } + + for _, t := range st.Tables { + if err := nexusInsertRow(ctx, ctx.DB, st.DB, t.Name, tickNo); err != nil { + return err + } + } + + st.Ticked++ + st.LogDone = true + + updated, err := json.Marshal(st) + if err != nil { + return err + } + ctx.UpdateState(updated) + return nil +} + +func (c *NexusDDLCase) Exit(ctx workload.ExitContext, raw json.RawMessage) error { + var st nexusState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + + sums, err := nexusRecordChecksums(ctx, ctx.DB, st.DB, st.Tables) + if err != nil { + return err + } + st.Checksums = sums + st.LogDone = true + + updated, err := json.Marshal(st) + if err != nil { + return err + } + ctx.UpdateState(updated) + return nil +} + +func (c *NexusDDLCase) Verify(ctx workload.Context, raw json.RawMessage) error { + var st nexusState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + if err := workload.Require(st.LogDone, "NexusDDL: log not executed"); err != nil { + return err + } + if err := workload.Require(len(st.Checksums) > 0, "NexusDDL: checksum not recorded; run Exit first"); err != nil { + return err + } + + for 
_, t := range st.Tables { + ok, err := workload.TableExists(ctx, ctx.DB, st.DB, t.Name) + if err != nil { + return err + } + if err := workload.Require(ok, "NexusDDL: table %s.%s not found", st.DB, t.Name); err != nil { + return err + } + + for _, col := range t.Cols { + has, err := workload.ColumnExists(ctx, ctx.DB, st.DB, t.Name, col) + if err != nil { + return err + } + if err := workload.Require(has, "NexusDDL: %s.%s column %q not found", st.DB, t.Name, col); err != nil { + return err + } + } + + want, ok := st.Checksums[t.Name] + if !ok { + return fmt.Errorf("NexusDDL: missing checksum for table %s.%s", st.DB, t.Name) + } + got, err := workload.AdminChecksumTable(ctx, ctx.DB, st.DB, t.Name) + if err != nil { + return err + } + if err := workload.Require(got.TotalKvs == want.TotalKvs, "NexusDDL: Total_kvs mismatch for %s.%s: got %q want %q", st.DB, t.Name, got.TotalKvs, want.TotalKvs); err != nil { + return err + } + if want.TotalBytes != "" { + if err := workload.Require(got.TotalBytes == want.TotalBytes, "NexusDDL: Total_bytes mismatch for %s.%s: got %q want %q", st.DB, t.Name, got.TotalBytes, want.TotalBytes); err != nil { + return err + } + } + } + return nil +} + +func nexusAddOneColumn(ctx context.Context, db *sql.DB, st *nexusState, tick int, t *nexusTableState) error { + if t == nil { + return nil + } + if t.NextColID < len(t.Cols) { + t.NextColID = len(t.Cols) + } + + col := fmt.Sprintf("c_%d", t.NextColID) + exists, err := workload.ColumnExists(ctx, db, st.DB, t.Name, col) + if err != nil { + return err + } + if exists { + if !slices.Contains(t.Cols, col) { + t.Cols = append(t.Cols, col) + } + t.NextColID++ + return nil + } + + stmt := "ALTER TABLE " + workload.QTable(st.DB, t.Name) + " ADD COLUMN " + workload.QIdent(col) + " BIGINT" + if err := nexusExecDDL(ctx, db, tick, stmt); err != nil { + return err + } + t.Cols = append(t.Cols, col) + t.NextColID++ + return nil +} + +func nexusDropOneColumn(ctx context.Context, db *sql.DB, st *nexusState, tick int, t *nexusTableState) error { + if t == nil || len(t.Cols) == 0 { + return nil + } + col := t.Cols[0] + exists, err := workload.ColumnExists(ctx, db, st.DB, t.Name, col) + if err != nil { + return err + } + if exists { + stmt := "ALTER TABLE " + workload.QTable(st.DB, t.Name) + " DROP COLUMN " + workload.QIdent(col) + if err := nexusExecDDL(ctx, db, tick, stmt); err != nil { + return err + } + } + t.Cols = t.Cols[1:] + return nil +} From a8328d8fc9fdcd668c8ea85679534943b71ba810 Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Wed, 28 Jan 2026 06:03:50 +0000 Subject: [PATCH 11/18] tidy up codes Signed-off-by: Juncen Yu --- pkg/testkit/brhelper/workload/context.go | 10 - .../brhelper/workload/modify_tiflash.go | 248 ---------------- pkg/testkit/brhelper/workload/nexus_ddl.go | 266 ------------------ pkg/testkit/brhelper/workload/registry.go | 25 -- pkg/testkit/brhelper/workload/runner.go | 55 ++-- pkg/testkit/brhelper/workload/summary.go | 75 ----- tests/realtikvtest/brietest/BUILD.bazel | 2 + .../brietest/segmented_restore_test.go | 13 +- .../brietest/workloadcases/BUILD.bazel | 17 ++ .../brietest/workloadcases}/add_index.go | 125 +++----- .../brietest/workloadcases/modify_tiflash.go | 192 +++++++++++++ .../brietest/workloadcases}/nexus_common.go | 52 +--- .../workloadcases}/nexus_ddl_destructive.go | 66 ++--- 13 files changed, 313 insertions(+), 833 deletions(-) delete mode 100644 pkg/testkit/brhelper/workload/modify_tiflash.go delete mode 100644 pkg/testkit/brhelper/workload/nexus_ddl.go delete mode 100644 
pkg/testkit/brhelper/workload/registry.go delete mode 100644 pkg/testkit/brhelper/workload/summary.go create mode 100644 tests/realtikvtest/brietest/workloadcases/BUILD.bazel rename {pkg/testkit/brhelper/workload => tests/realtikvtest/brietest/workloadcases}/add_index.go (52%) create mode 100644 tests/realtikvtest/brietest/workloadcases/modify_tiflash.go rename {pkg/testkit/brhelper/workload => tests/realtikvtest/brietest/workloadcases}/nexus_common.go (55%) rename {pkg/testkit/brhelper/workload => tests/realtikvtest/brietest/workloadcases}/nexus_ddl_destructive.go (57%) diff --git a/pkg/testkit/brhelper/workload/context.go b/pkg/testkit/brhelper/workload/context.go index 617626c08e873..fd51f91691f28 100644 --- a/pkg/testkit/brhelper/workload/context.go +++ b/pkg/testkit/brhelper/workload/context.go @@ -24,16 +24,6 @@ import ( type Context struct { context.Context DB *sql.DB - - CaseName string - Summary *Summary -} - -func (c Context) SetSummary(summary any) { - if c.Summary == nil || c.CaseName == "" { - return - } - c.Summary.Set(c.CaseName, summary) } type TickContext struct { diff --git a/pkg/testkit/brhelper/workload/modify_tiflash.go b/pkg/testkit/brhelper/workload/modify_tiflash.go deleted file mode 100644 index 5f3cd8b7b05cc..0000000000000 --- a/pkg/testkit/brhelper/workload/modify_tiflash.go +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright 2025 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package workload - -import ( - "encoding/json" - "fmt" - "strings" -) - -type ModifyTiFlashCase struct { - Suffix string `json:"suffix"` - N int `json:"n"` - NAP int `json:"nap"` - - replicaHistory []replicaHistoryEntry -} - -type modifyTiFlashState struct { - Suffix string `json:"suffix"` - DB string `json:"db"` - Table string `json:"table"` - N int `json:"n"` - NAP int `json:"nap"` - - Ticked int `json:"ticked"` - Inserted int `json:"inserted"` - - Replica int `json:"replica"` - - Checksum TableChecksum `json:"checksum"` - LogDone bool `json:"log_done"` -} - -type replicaHistoryEntry struct { - Tick int `json:"tick"` - Replica int `json:"replica"` -} - -type modifyTiFlashSummary struct { - DB string `json:"db"` - Table string `json:"table"` - N int `json:"n"` - NAP int `json:"nap"` - Ticked int `json:"ticked"` - ReplicaHistory []replicaHistoryEntry `json:"replica_history,omitempty"` -} - -func (s modifyTiFlashSummary) SummaryTable() string { - var b strings.Builder - _, _ = fmt.Fprintf(&b, "db=%s table=%s n=%d nap=%d ticked=%d", s.DB, s.Table, s.N, s.NAP, s.Ticked) - if len(s.ReplicaHistory) > 0 { - b.WriteString("\nreplica history:") - for _, e := range s.ReplicaHistory { - _, _ = fmt.Fprintf(&b, "\n - [%d] %d", e.Tick, e.Replica) - } - } - return b.String() -} - -func (c *ModifyTiFlashCase) Name() string { return "ModifyTiFlash" } - -func (c *ModifyTiFlashCase) Prepare(ctx Context) (json.RawMessage, error) { - c.replicaHistory = nil - - suffix := c.Suffix - if suffix == "" { - var err error - suffix, err = RandSuffix() - if err != nil { - return nil, err - } - } - n := c.N - if n <= 0 { - n = 100 - } - nap := c.NAP - if nap <= 0 { - nap = 1 - } - st := modifyTiFlashState{ - Suffix: suffix, - DB: fmt.Sprintf("test_modify_tiflash_%s", suffix), - Table: "t1", - N: n, - NAP: nap, - Replica: 0, - } - c.replicaHistory = []replicaHistoryEntry{{Tick: 0, Replica: 0}} - if err := ExecAll(ctx, ctx.DB, []string{ - "CREATE DATABASE IF NOT EXISTS " + QIdent(st.DB), - "CREATE TABLE IF NOT EXISTS " + QTable(st.DB, st.Table) + " (" + - "id BIGINT PRIMARY KEY AUTO_INCREMENT," + - "a BIGINT," + - "b BIGINT," + - "c BIGINT" + - ")", - "ALTER TABLE " + QTable(st.DB, st.Table) + " SET TIFLASH REPLICA 0", - }); err != nil { - return nil, err - } - - ctx.SetSummary(modifyTiFlashSummary{ - DB: st.DB, - Table: st.Table, - N: st.N, - NAP: st.NAP, - ReplicaHistory: c.replicaHistory, - }) - return json.Marshal(st) -} - -func (c *ModifyTiFlashCase) Tick(ctx TickContext, raw json.RawMessage) error { - var st modifyTiFlashState - if err := json.Unmarshal(raw, &st); err != nil { - return err - } - if st.N <= 0 { - st.N = 100 - } - if st.NAP <= 0 { - st.NAP = 2 - } - if len(c.replicaHistory) == 0 { - c.replicaHistory = []replicaHistoryEntry{{Tick: st.Ticked, Replica: st.Replica}} - } - - tickNo := st.Ticked + 1 - - if _, err := ctx.DB.ExecContext(ctx, "INSERT INTO "+QTable(st.DB, st.Table)+" (a,b,c) VALUES (?,?,?)", - int64(st.Inserted), int64(st.Inserted*7+1), int64(st.Inserted*11+2), - ); err != nil { - return err - } - st.Inserted++ - - if EveryNTick(tickNo, st.N) { - max := st.NAP - if max > 0 { - next := tickNo % (max + 1) - if next == st.Replica { - next = (next + 1) % (max + 1) - } - stmt := fmt.Sprintf("ALTER TABLE %s SET TIFLASH REPLICA %d", QTable(st.DB, st.Table), next) - if _, err := ctx.DB.ExecContext(ctx, stmt); err != nil { - return err - } - st.Replica = next - c.replicaHistory = append(c.replicaHistory, replicaHistoryEntry{Tick: tickNo, Replica: next}) - } - } - - st.Ticked++ - st.LogDone = true - - 
updated, err := json.Marshal(st) - if err != nil { - return err - } - ctx.UpdateState(updated) - return nil -} - -func (c *ModifyTiFlashCase) Exit(ctx ExitContext, raw json.RawMessage) error { - var st modifyTiFlashState - if err := json.Unmarshal(raw, &st); err != nil { - return err - } - if len(c.replicaHistory) == 0 { - c.replicaHistory = []replicaHistoryEntry{{Tick: st.Ticked, Replica: st.Replica}} - } - - sum, err := AdminChecksumTable(ctx, ctx.DB, st.DB, st.Table) - if err != nil { - return err - } - replica, err := TiFlashReplicaCount(ctx, ctx.DB, st.DB, st.Table) - if err != nil { - return err - } - st.Checksum = sum - st.Replica = replica - if last := c.replicaHistory[len(c.replicaHistory)-1]; last.Replica != replica { - c.replicaHistory = append(c.replicaHistory, replicaHistoryEntry{Tick: st.Ticked, Replica: replica}) - } - st.LogDone = true - - ctx.SetSummary(modifyTiFlashSummary{ - DB: st.DB, - Table: st.Table, - N: st.N, - NAP: st.NAP, - Ticked: st.Ticked, - ReplicaHistory: c.replicaHistory, - }) - - updated, err := json.Marshal(st) - if err != nil { - return err - } - ctx.UpdateState(updated) - return nil -} - -func (c *ModifyTiFlashCase) Verify(ctx Context, raw json.RawMessage) error { - var st modifyTiFlashState - if err := json.Unmarshal(raw, &st); err != nil { - return err - } - if err := Require(st.LogDone, "ModifyTiFlash: log not executed"); err != nil { - return err - } - if err := Require(st.Checksum.TotalKvs != "", "ModifyTiFlash: checksum not recorded; run Exit first"); err != nil { - return err - } - - sum, err := AdminChecksumTable(ctx, ctx.DB, st.DB, st.Table) - if err != nil { - return err - } - if err := Require(sum.TotalKvs == st.Checksum.TotalKvs, "ModifyTiFlash: Total_kvs mismatch: got %q want %q", sum.TotalKvs, st.Checksum.TotalKvs); err != nil { - return err - } - if st.Checksum.TotalBytes != "" { - if err := Require(sum.TotalBytes == st.Checksum.TotalBytes, "ModifyTiFlash: Total_bytes mismatch: got %q want %q", sum.TotalBytes, st.Checksum.TotalBytes); err != nil { - return err - } - } - - replica, err := TiFlashReplicaCount(ctx, ctx.DB, st.DB, st.Table) - if err != nil { - return err - } - return Require(replica == st.Replica, "ModifyTiFlash: tiflash replica mismatch: got %d want %d", replica, st.Replica) -} diff --git a/pkg/testkit/brhelper/workload/nexus_ddl.go b/pkg/testkit/brhelper/workload/nexus_ddl.go deleted file mode 100644 index 101ec5d7cce0a..0000000000000 --- a/pkg/testkit/brhelper/workload/nexus_ddl.go +++ /dev/null @@ -1,266 +0,0 @@ -// Copyright 2025 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package workload - -import ( - "context" - "database/sql" - "encoding/json" - "fmt" - "slices" -) - -type NexusDDLCase struct { - Suffix string `json:"suffix"` - N int `json:"n"` - - ddls []nexusDDLEvent -} - -func (c *NexusDDLCase) Name() string { return "NexusDDL" } - -func (c *NexusDDLCase) Prepare(ctx Context) (json.RawMessage, error) { - c.ddls = nil - - suffix := c.Suffix - if suffix == "" { - var err error - suffix, err = RandSuffix() - if err != nil { - return nil, err - } - } - n := c.N - if n <= 0 { - n = 50 - } - st := nexusState{ - Suffix: suffix, - DB: fmt.Sprintf("test_nexus_ddl_%s", suffix), - N: n, - Ticked: 0, - NextTableID: 1, - Tables: []nexusTableState{{Name: "t_0"}}, - } - if err := nexusExecDDL(ctx, ctx.DB, &c.ddls, 0, "CREATE DATABASE IF NOT EXISTS "+QIdent(st.DB)); err != nil { - return nil, err - } - if err := nexusCreateTable(ctx, ctx.DB, &c.ddls, 0, st.DB, st.Tables[0].Name); err != nil { - return nil, err - } - - ctx.SetSummary(nexusSummary{ - DB: st.DB, - N: st.N, - Ticked: st.Ticked, - DDLs: c.ddls, - }) - return json.Marshal(st) -} - -func (c *NexusDDLCase) Tick(ctx TickContext, raw json.RawMessage) error { - var st nexusState - if err := json.Unmarshal(raw, &st); err != nil { - return err - } - st.N = nexusDefaultN(st.N) - if st.NextTableID <= 0 { - st.NextTableID = len(st.Tables) - } - for i := range st.Tables { - if st.Tables[i].NextColID < len(st.Tables[i].Cols) { - st.Tables[i].NextColID = len(st.Tables[i].Cols) - } - } - - tickNo := st.Ticked + 1 - half := nexusHalf(st.N) - - if EveryNTick(tickNo, 2*st.N) && len(st.Tables) > 0 { - oldest := st.Tables[0].Name - stmt := "DROP TABLE IF EXISTS " + QTable(st.DB, oldest) - if err := nexusExecDDL(ctx, ctx.DB, &c.ddls, tickNo, stmt); err != nil { - return err - } - st.Tables = st.Tables[1:] - } - - if EveryNTick(tickNo, st.N) { - name := nexusTableName(st.NextTableID) - st.NextTableID++ - if err := nexusCreateTable(ctx, ctx.DB, &c.ddls, tickNo, st.DB, name); err != nil { - return err - } - st.Tables = append(st.Tables, nexusTableState{Name: name}) - } - - if EveryNTick(tickNo, half) && len(st.Tables) > 0 { - youngest := &st.Tables[len(st.Tables)-1] - if err := nexusAddOneColumn(ctx, ctx.DB, &st, &c.ddls, tickNo, youngest); err != nil { - return err - } - } - - if EveryNTick(tickNo, st.N) && len(st.Tables) > 0 { - oldest := &st.Tables[0] - if err := nexusDropOneColumn(ctx, ctx.DB, &st, &c.ddls, tickNo, oldest); err != nil { - return err - } - } - - for _, t := range st.Tables { - if err := nexusInsertRow(ctx, ctx.DB, st.DB, t.Name, tickNo); err != nil { - return err - } - } - - st.Ticked++ - st.LogDone = true - - updated, err := json.Marshal(st) - if err != nil { - return err - } - ctx.UpdateState(updated) - return nil -} - -func (c *NexusDDLCase) Exit(ctx ExitContext, raw json.RawMessage) error { - var st nexusState - if err := json.Unmarshal(raw, &st); err != nil { - return err - } - - sums, err := nexusRecordChecksums(ctx, ctx.DB, st.DB, st.Tables) - if err != nil { - return err - } - st.Checksums = sums - st.LogDone = true - - ctx.SetSummary(nexusSummary{ - DB: st.DB, - N: st.N, - Ticked: st.Ticked, - DDLs: c.ddls, - }) - - updated, err := json.Marshal(st) - if err != nil { - return err - } - ctx.UpdateState(updated) - return nil -} - -func (c *NexusDDLCase) Verify(ctx Context, raw json.RawMessage) error { - var st nexusState - if err := json.Unmarshal(raw, &st); err != nil { - return err - } - if err := Require(st.LogDone, "NexusDDL: log not executed"); err != nil { - return err - } - if err := 
Require(len(st.Checksums) > 0, "NexusDDL: checksum not recorded; run Exit first"); err != nil { - return err - } - - for _, t := range st.Tables { - ok, err := TableExists(ctx, ctx.DB, st.DB, t.Name) - if err != nil { - return err - } - if err := Require(ok, "NexusDDL: table %s.%s not found", st.DB, t.Name); err != nil { - return err - } - - for _, col := range t.Cols { - has, err := ColumnExists(ctx, ctx.DB, st.DB, t.Name, col) - if err != nil { - return err - } - if err := Require(has, "NexusDDL: %s.%s column %q not found", st.DB, t.Name, col); err != nil { - return err - } - } - - want, ok := st.Checksums[t.Name] - if !ok { - return fmt.Errorf("NexusDDL: missing checksum for table %s.%s", st.DB, t.Name) - } - got, err := AdminChecksumTable(ctx, ctx.DB, st.DB, t.Name) - if err != nil { - return err - } - if err := Require(got.TotalKvs == want.TotalKvs, "NexusDDL: Total_kvs mismatch for %s.%s: got %q want %q", st.DB, t.Name, got.TotalKvs, want.TotalKvs); err != nil { - return err - } - if want.TotalBytes != "" { - if err := Require(got.TotalBytes == want.TotalBytes, "NexusDDL: Total_bytes mismatch for %s.%s: got %q want %q", st.DB, t.Name, got.TotalBytes, want.TotalBytes); err != nil { - return err - } - } - } - return nil -} - -func nexusAddOneColumn(ctx context.Context, db *sql.DB, st *nexusState, ddls *[]nexusDDLEvent, tick int, t *nexusTableState) error { - if t == nil { - return nil - } - if t.NextColID < len(t.Cols) { - t.NextColID = len(t.Cols) - } - - col := fmt.Sprintf("c_%d", t.NextColID) - exists, err := ColumnExists(ctx, db, st.DB, t.Name, col) - if err != nil { - return err - } - if exists { - if !slices.Contains(t.Cols, col) { - t.Cols = append(t.Cols, col) - } - t.NextColID++ - return nil - } - - stmt := "ALTER TABLE " + QTable(st.DB, t.Name) + " ADD COLUMN " + QIdent(col) + " BIGINT" - if err := nexusExecDDL(ctx, db, ddls, tick, stmt); err != nil { - return err - } - t.Cols = append(t.Cols, col) - t.NextColID++ - return nil -} - -func nexusDropOneColumn(ctx context.Context, db *sql.DB, st *nexusState, ddls *[]nexusDDLEvent, tick int, t *nexusTableState) error { - if t == nil || len(t.Cols) == 0 { - return nil - } - col := t.Cols[0] - exists, err := ColumnExists(ctx, db, st.DB, t.Name, col) - if err != nil { - return err - } - if exists { - stmt := "ALTER TABLE " + QTable(st.DB, t.Name) + " DROP COLUMN " + QIdent(col) - if err := nexusExecDDL(ctx, db, ddls, tick, stmt); err != nil { - return err - } - } - t.Cols = t.Cols[1:] - return nil -} diff --git a/pkg/testkit/brhelper/workload/registry.go b/pkg/testkit/brhelper/workload/registry.go deleted file mode 100644 index 1a48dd0b7afd9..0000000000000 --- a/pkg/testkit/brhelper/workload/registry.go +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2025 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package workload - -// AllCases returns the default workload cases. 
-func AllCases() []Case { - return []Case{ - &AddIndexCase{}, - &NexusDDLCase{}, - &NexusDDLDestructiveCase{}, - &ModifyTiFlashCase{}, - } -} diff --git a/pkg/testkit/brhelper/workload/runner.go b/pkg/testkit/brhelper/workload/runner.go index 7628a50b23106..6e8e5b05bf5b2 100644 --- a/pkg/testkit/brhelper/workload/runner.go +++ b/pkg/testkit/brhelper/workload/runner.go @@ -73,35 +73,34 @@ func (r *Runner) Cases() []CaseSpec { return out } -func (r *Runner) Prepare(ctx context.Context) (*Summary, error) { +func (r *Runner) Prepare(ctx context.Context) error { if err := r.store.Reset(ctx); err != nil { - return nil, err + return err } - summary := NewSummary() for _, spec := range r.cases { - state, err := spec.Case.Prepare(Context{Context: ctx, DB: r.db, CaseName: spec.Name, Summary: summary}) + state, err := spec.Case.Prepare(Context{Context: ctx, DB: r.db}) if err != nil { - return nil, err + return err } if err := r.store.Put(ctx, spec.Name, state); err != nil { - return nil, err + return err } } - return summary, nil + return nil } -func (r *Runner) Run(ctx context.Context, cfg RunConfig) (*Summary, error) { +func (r *Runner) Run(ctx context.Context, cfg RunConfig) error { if cfg.TickCount <= 0 { - return nil, fmt.Errorf("workload: TickCount must be > 0") + return fmt.Errorf("workload: TickCount must be > 0") } if cfg.TickInterval < 0 { - return nil, fmt.Errorf("workload: TickInterval must be >= 0") + return fmt.Errorf("workload: TickInterval must be >= 0") } states, err := r.store.GetAll(ctx) if err != nil { - return nil, err + return err } byName := make(map[string]Case, len(r.cases)) for _, spec := range r.cases { @@ -109,7 +108,7 @@ func (r *Runner) Run(ctx context.Context, cfg RunConfig) (*Summary, error) { } for name := range states { if _, ok := byName[name]; !ok { - return nil, fmt.Errorf("workload: unknown case %q in state store", name) + return fmt.Errorf("workload: unknown case %q in state store", name) } } @@ -120,34 +119,33 @@ func (r *Runner) Run(ctx context.Context, cfg RunConfig) (*Summary, error) { } } if len(selected) == 0 { - return nil, fmt.Errorf("workload: no cases in state store; run Prepare first") + return fmt.Errorf("workload: no cases in state store; run Prepare first") } - summary := NewSummary() rngs := newCaseRNGs(cfg.Seed, selected) if cfg.Parallel { - if err := r.runParallelTicks(ctx, cfg, selected, states, summary, rngs); err != nil { - return nil, err + if err := r.runParallelTicks(ctx, cfg, selected, states, rngs); err != nil { + return err } } else { - if err := r.runSequentialTicks(ctx, cfg, selected, states, summary, rngs); err != nil { - return nil, err + if err := r.runSequentialTicks(ctx, cfg, selected, states, rngs); err != nil { + return err } } for _, spec := range selected { state, ok := states[spec.Name] if !ok { - return nil, fmt.Errorf("workload: case %q not found in state store; run Prepare first", spec.Name) + return fmt.Errorf("workload: case %q not found in state store; run Prepare first", spec.Name) } exitCtx := ExitContext{ - Context: Context{Context: ctx, DB: r.db, CaseName: spec.Name, Summary: summary}, + Context: Context{Context: ctx, DB: r.db}, UpdateStateFn: func(updated json.RawMessage) { states[spec.Name] = updated }, } if err := spec.Case.Exit(exitCtx, state); err != nil { - return nil, err + return err } } @@ -158,9 +156,9 @@ func (r *Runner) Run(ctx context.Context, cfg RunConfig) (*Summary, error) { } } if err := r.store.PutMany(ctx, finalStates); err != nil { - return nil, err + return err } - return summary, nil + return 
nil } func (r *Runner) Verify(ctx context.Context) error { @@ -184,7 +182,7 @@ func (r *Runner) Verify(ctx context.Context) error { if !ok { return fmt.Errorf("workload: unknown case %q in state store", name) } - if err := c.Verify(Context{Context: ctx, DB: r.db, CaseName: name}, state); err != nil { + if err := c.Verify(Context{Context: ctx, DB: r.db}, state); err != nil { return err } } @@ -196,7 +194,6 @@ func (r *Runner) runSequentialTicks( cfg RunConfig, selected []CaseSpec, states map[string]json.RawMessage, - summary *Summary, rngs map[string]*rand.Rand, ) error { shuffleRNG := rand.New(rand.NewPCG(uint64(cfg.Seed), uint64(cfg.Seed>>1))) @@ -210,7 +207,7 @@ func (r *Runner) runSequentialTicks( } rng := rngs[spec.Name] tickCtx := TickContext{ - Context: Context{Context: ctx, DB: r.db, CaseName: spec.Name, Summary: summary}, + Context: Context{Context: ctx, DB: r.db}, RNG: rng, UpdateStateFn: func(updated json.RawMessage) { states[spec.Name] = updated @@ -235,12 +232,11 @@ func (r *Runner) runParallelTicks( cfg RunConfig, selected []CaseSpec, states map[string]json.RawMessage, - summary *Summary, rngs map[string]*rand.Rand, ) error { var mu sync.Mutex for tick := 0; tick < cfg.TickCount; tick++ { - if err := r.runParallelTick(ctx, selected, states, summary, rngs, &mu); err != nil { + if err := r.runParallelTick(ctx, selected, states, rngs, &mu); err != nil { return err } @@ -257,7 +253,6 @@ func (r *Runner) runParallelTick( ctx context.Context, selected []CaseSpec, states map[string]json.RawMessage, - summary *Summary, rngs map[string]*rand.Rand, mu *sync.Mutex, ) error { @@ -291,7 +286,7 @@ func (r *Runner) runParallelTick( rng := rngs[spec.Name] tickCtx := TickContext{ - Context: Context{Context: runCtx, DB: r.db, CaseName: spec.Name, Summary: summary}, + Context: Context{Context: runCtx, DB: r.db}, RNG: rng, UpdateStateFn: func(updated json.RawMessage) { mu.Lock() diff --git a/pkg/testkit/brhelper/workload/summary.go b/pkg/testkit/brhelper/workload/summary.go deleted file mode 100644 index ae5265545b0dc..0000000000000 --- a/pkg/testkit/brhelper/workload/summary.go +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2025 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package workload - -import "sync" - -type SummaryEntry struct { - Case string `json:"case"` - Summary any `json:"summary"` -} - -type TableSummary interface { - SummaryTable() string -} - -type Summary struct { - mu sync.Mutex - byCase map[string]int - entries []SummaryEntry -} - -func NewSummary() *Summary { - return &Summary{ - byCase: make(map[string]int), - } -} - -func (s *Summary) Set(caseName string, summary any) { - if s == nil || caseName == "" { - return - } - s.mu.Lock() - defer s.mu.Unlock() - - if idx, ok := s.byCase[caseName]; ok { - s.entries[idx].Summary = summary - return - } - - s.byCase[caseName] = len(s.entries) - s.entries = append(s.entries, SummaryEntry{Case: caseName, Summary: summary}) -} - -func (s *Summary) Entries() []SummaryEntry { - if s == nil { - return nil - } - s.mu.Lock() - defer s.mu.Unlock() - - out := make([]SummaryEntry, len(s.entries)) - copy(out, s.entries) - return out -} - -func (s *Summary) Empty() bool { - if s == nil { - return true - } - s.mu.Lock() - defer s.mu.Unlock() - return len(s.entries) == 0 -} diff --git a/tests/realtikvtest/brietest/BUILD.bazel b/tests/realtikvtest/brietest/BUILD.bazel index 37184d9791185..68fb3c0dcc050 100644 --- a/tests/realtikvtest/brietest/BUILD.bazel +++ b/tests/realtikvtest/brietest/BUILD.bazel @@ -12,6 +12,7 @@ go_test( "operator_test.go", "pitr_test.go", "registry_test.go", + "segmented_restore_test.go", "scheduler_test.go", ], flaky = True, @@ -45,6 +46,7 @@ go_test( "//pkg/util/printer", "//pkg/util/table-filter", "//tests/realtikvtest", + "//tests/realtikvtest/brietest/workloadcases", "@com_github_google_uuid//:uuid", "@com_github_pingcap_failpoint//:failpoint", "@com_github_pingcap_kvproto//pkg/brpb", diff --git a/tests/realtikvtest/brietest/segmented_restore_test.go b/tests/realtikvtest/brietest/segmented_restore_test.go index 174a699b1f223..ad4fe480a7f72 100644 --- a/tests/realtikvtest/brietest/segmented_restore_test.go +++ b/tests/realtikvtest/brietest/segmented_restore_test.go @@ -28,6 +28,7 @@ import ( "github.com/pingcap/tidb/br/pkg/task" "github.com/pingcap/tidb/pkg/testkit" "github.com/pingcap/tidb/pkg/testkit/brhelper/workload" + "github.com/pingcap/tidb/tests/realtikvtest/brietest/workloadcases" "github.com/stretchr/testify/require" ) @@ -43,12 +44,12 @@ func TestSegmentedRestoreWorkload(t *testing.T) { store := workload.NewMemoryStore() cases := []workload.Case{ - &workload.NexusDDLDestructiveCase{}, - &workload.NexusDDLCase{}, - &workload.AddIndexCase{}, + &workloadcases.NexusDDLDestructiveCase{}, + &workloadcases.NexusDDLCase{}, + &workloadcases.AddIndexCase{}, } if tiflashCount := tiflashStoreCount(t, kit.tk); tiflashCount > 0 { - cases = append(cases, &workload.ModifyTiFlashCase{NAP: tiflashCount}) + cases = append(cases, &workloadcases.ModifyTiFlashCase{NAP: tiflashCount}) } else { t.Log("TiFlash not found in environment, won't run tiflash related cases.") } @@ -56,7 +57,7 @@ func TestSegmentedRestoreWorkload(t *testing.T) { require.NoError(t, err) ctx := context.Background() - _, err = runner.Prepare(ctx) + err = runner.Prepare(ctx) require.NoError(t, err) kit.RunFullBackup(func(cfg *task.BackupConfig) { @@ -80,7 +81,7 @@ func TestSegmentedRestoreWorkload(t *testing.T) { } for range 4 { - _, err := runner.Run(ctx, runCfg) + err := runner.Run(ctx, runCfg) require.NoError(t, err) kit.forceFlushAndWait(taskName) checkpoints = append(checkpoints, kit.CheckpointTSOf(taskName)) diff --git a/tests/realtikvtest/brietest/workloadcases/BUILD.bazel 
b/tests/realtikvtest/brietest/workloadcases/BUILD.bazel new file mode 100644 index 0000000000000..fc5dd10ca3cd7 --- /dev/null +++ b/tests/realtikvtest/brietest/workloadcases/BUILD.bazel @@ -0,0 +1,17 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "workloadcases", + srcs = [ + "add_index.go", + "modify_tiflash.go", + "nexus_common.go", + "nexus_ddl.go", + "nexus_ddl_destructive.go", + ], + importpath = "github.com/pingcap/tidb/tests/realtikvtest/brietest/workloadcases", + visibility = ["//visibility:public"], + deps = [ + "//pkg/testkit/brhelper/workload", + ], +) diff --git a/pkg/testkit/brhelper/workload/add_index.go b/tests/realtikvtest/brietest/workloadcases/add_index.go similarity index 52% rename from pkg/testkit/brhelper/workload/add_index.go rename to tests/realtikvtest/brietest/workloadcases/add_index.go index 1c8987808dbab..961a6561d9d2b 100644 --- a/pkg/testkit/brhelper/workload/add_index.go +++ b/tests/realtikvtest/brietest/workloadcases/add_index.go @@ -12,21 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -package workload +package workloadcases import ( "encoding/json" "fmt" "strings" + + "github.com/pingcap/tidb/pkg/testkit/brhelper/workload" ) type AddIndexCase struct { Suffix string `json:"suffix"` N int `json:"n"` NR int `json:"nr"` - - indexesAdded []addIndexSpec - indexesDropped []addIndexSpec } type addIndexSpec struct { @@ -48,57 +47,17 @@ type addIndexState struct { Indexes []addIndexSpec `json:"indexes"` - Checksum TableChecksum `json:"checksum"` - LogDone bool `json:"log_done"` -} - -type addIndexSummary struct { - DB string `json:"db"` - Table string `json:"table"` - N int `json:"n"` - NR int `json:"nr"` - Ticked int `json:"ticked"` - - IndexesAdded []addIndexSpec `json:"indexes_added,omitempty"` - IndexesDropped []addIndexSpec `json:"indexes_dropped,omitempty"` -} - -func (s addIndexSummary) SummaryTable() string { - var b strings.Builder - _, _ = fmt.Fprintf(&b, "db=%s table=%s n=%d nr=%d ticked=%d", s.DB, s.Table, s.N, s.NR, s.Ticked) - if len(s.IndexesAdded) > 0 { - b.WriteString("\nindexes added:") - for _, idx := range s.IndexesAdded { - b.WriteString("\n - ") - b.WriteString(idx.Name) - if len(idx.Columns) > 0 { - b.WriteString("(" + strings.Join(idx.Columns, ",") + ")") - } - } - } - if len(s.IndexesDropped) > 0 { - b.WriteString("\nindexes dropped:") - for _, idx := range s.IndexesDropped { - b.WriteString("\n - ") - b.WriteString(idx.Name) - if len(idx.Columns) > 0 { - b.WriteString("(" + strings.Join(idx.Columns, ",") + ")") - } - } - } - return b.String() + Checksum workload.TableChecksum `json:"checksum"` + LogDone bool `json:"log_done"` } func (c *AddIndexCase) Name() string { return "AddIndex" } -func (c *AddIndexCase) Prepare(ctx Context) (json.RawMessage, error) { - c.indexesAdded = nil - c.indexesDropped = nil - +func (c *AddIndexCase) Prepare(ctx workload.Context) (json.RawMessage, error) { suffix := c.Suffix if suffix == "" { var err error - suffix, err = RandSuffix() + suffix, err = workload.RandSuffix() if err != nil { return nil, err } @@ -119,9 +78,9 @@ func (c *AddIndexCase) Prepare(ctx Context) (json.RawMessage, error) { NR: nr, NextIndexID: 0, } - if err := ExecAll(ctx, ctx.DB, []string{ - "CREATE DATABASE IF NOT EXISTS " + QIdent(st.DB), - "CREATE TABLE IF NOT EXISTS " + QTable(st.DB, st.Table) + " (" + + if err := workload.ExecAll(ctx, ctx.DB, []string{ + "CREATE DATABASE IF NOT EXISTS " + workload.QIdent(st.DB), + "CREATE TABLE IF NOT 
EXISTS " + workload.QTable(st.DB, st.Table) + " (" + "id BIGINT PRIMARY KEY AUTO_INCREMENT," + "a BIGINT," + "b BIGINT," + @@ -133,16 +92,10 @@ func (c *AddIndexCase) Prepare(ctx Context) (json.RawMessage, error) { return nil, err } - ctx.SetSummary(addIndexSummary{ - DB: st.DB, - Table: st.Table, - N: st.N, - NR: st.NR, - }) return json.Marshal(st) } -func (c *AddIndexCase) Tick(ctx TickContext, raw json.RawMessage) error { +func (c *AddIndexCase) Tick(ctx workload.TickContext, raw json.RawMessage) error { var st addIndexState if err := json.Unmarshal(raw, &st); err != nil { return err @@ -172,29 +125,19 @@ func (c *AddIndexCase) Tick(ctx TickContext, raw json.RawMessage) error { return nil } -func (c *AddIndexCase) Exit(ctx ExitContext, raw json.RawMessage) error { +func (c *AddIndexCase) Exit(ctx workload.ExitContext, raw json.RawMessage) error { var st addIndexState if err := json.Unmarshal(raw, &st); err != nil { return err } - checksum, err := AdminChecksumTable(ctx, ctx.DB, st.DB, st.Table) + checksum, err := workload.AdminChecksumTable(ctx, ctx.DB, st.DB, st.Table) if err != nil { return err } st.Checksum = checksum st.LogDone = true - ctx.SetSummary(addIndexSummary{ - DB: st.DB, - Table: st.Table, - N: st.N, - NR: st.NR, - Ticked: st.Ticked, - IndexesAdded: c.indexesAdded, - IndexesDropped: c.indexesDropped, - }) - updated, err := json.Marshal(st) if err != nil { return err @@ -203,37 +146,37 @@ func (c *AddIndexCase) Exit(ctx ExitContext, raw json.RawMessage) error { return nil } -func (c *AddIndexCase) Verify(ctx Context, raw json.RawMessage) error { +func (c *AddIndexCase) Verify(ctx workload.Context, raw json.RawMessage) error { var st addIndexState if err := json.Unmarshal(raw, &st); err != nil { return err } - if err := Require(st.LogDone, "AddIndex: log not executed"); err != nil { + if err := workload.Require(st.LogDone, "AddIndex: log not executed"); err != nil { return err } - if err := Require(st.Checksum.TotalKvs != "", "AddIndex: checksum not recorded; run Exit first"); err != nil { + if err := workload.Require(st.Checksum.TotalKvs != "", "AddIndex: checksum not recorded; run Exit first"); err != nil { return err } for _, idx := range st.Indexes { - ok, err := IndexExists(ctx, ctx.DB, st.DB, st.Table, idx.Name) + ok, err := workload.IndexExists(ctx, ctx.DB, st.DB, st.Table, idx.Name) if err != nil { return err } - if err := Require(ok, "AddIndex: index %q not found", idx.Name); err != nil { + if err := workload.Require(ok, "AddIndex: index %q not found", idx.Name); err != nil { return err } } - checksum, err := AdminChecksumTable(ctx, ctx.DB, st.DB, st.Table) + checksum, err := workload.AdminChecksumTable(ctx, ctx.DB, st.DB, st.Table) if err != nil { return err } - if err := Require(checksum.TotalKvs == st.Checksum.TotalKvs, "AddIndex: Total_kvs mismatch: got %q want %q", checksum.TotalKvs, st.Checksum.TotalKvs); err != nil { + if err := workload.Require(checksum.TotalKvs == st.Checksum.TotalKvs, "AddIndex: Total_kvs mismatch: got %q want %q", checksum.TotalKvs, st.Checksum.TotalKvs); err != nil { return err } if st.Checksum.TotalBytes != "" { - return Require(checksum.TotalBytes == st.Checksum.TotalBytes, "AddIndex: Total_bytes mismatch: got %q want %q", checksum.TotalBytes, st.Checksum.TotalBytes) + return workload.Require(checksum.TotalBytes == st.Checksum.TotalBytes, "AddIndex: Total_bytes mismatch: got %q want %q", checksum.TotalBytes, st.Checksum.TotalBytes) } return nil } @@ -259,9 +202,9 @@ func normalizeAddIndexState(st *addIndexState) { } } -func 
addIndexInsertRow(ctx TickContext, st *addIndexState) error { +func addIndexInsertRow(ctx workload.TickContext, st *addIndexState) error { v := int64(st.Inserted) - if _, err := ctx.DB.ExecContext(ctx, "INSERT INTO "+QTable(st.DB, st.Table)+" (a,b,c,d,e) VALUES (?,?,?,?,?)", + if _, err := ctx.DB.ExecContext(ctx, "INSERT INTO "+workload.QTable(st.DB, st.Table)+" (a,b,c,d,e) VALUES (?,?,?,?,?)", v, v*7+1, v*11+2, v*13+3, v*17+4, ); err != nil { return err @@ -270,8 +213,8 @@ func addIndexInsertRow(ctx TickContext, st *addIndexState) error { return nil } -func (c *AddIndexCase) maybeAddIndex(ctx TickContext, st *addIndexState, tickNo int) error { - if !EveryNTick(tickNo, st.N) { +func (c *AddIndexCase) maybeAddIndex(ctx workload.TickContext, st *addIndexState, tickNo int) error { + if !workload.EveryNTick(tickNo, st.N) { return nil } allCols := []string{"a", "b", "c", "d", "e"} @@ -285,16 +228,16 @@ func (c *AddIndexCase) maybeAddIndex(ctx TickContext, st *addIndexState, tickNo cols = append(cols, allCols[(start+i)%len(allCols)]) } - exists, err := IndexExists(ctx, ctx.DB, st.DB, st.Table, idxName) + exists, err := workload.IndexExists(ctx, ctx.DB, st.DB, st.Table, idxName) if err != nil { return err } if !exists { colSQL := make([]string, 0, len(cols)) for _, col := range cols { - colSQL = append(colSQL, QIdent(col)) + colSQL = append(colSQL, workload.QIdent(col)) } - stmt := "CREATE INDEX " + QIdent(idxName) + " ON " + QTable(st.DB, st.Table) + " (" + strings.Join(colSQL, ",") + ")" + stmt := "CREATE INDEX " + workload.QIdent(idxName) + " ON " + workload.QTable(st.DB, st.Table) + " (" + strings.Join(colSQL, ",") + ")" if _, err := ctx.DB.ExecContext(ctx, stmt); err != nil { return err } @@ -304,31 +247,27 @@ func (c *AddIndexCase) maybeAddIndex(ctx TickContext, st *addIndexState, tickNo if !hasAddIndexSpec(st.Indexes, idxName) { st.Indexes = append(st.Indexes, spec) } - if !hasAddIndexSpec(c.indexesAdded, idxName) { - c.indexesAdded = append(c.indexesAdded, spec) - } st.NextIndexID++ return nil } -func (c *AddIndexCase) maybeDropIndex(ctx TickContext, st *addIndexState, tickNo int) error { - if !EveryNTick(tickNo, st.NR) || len(st.Indexes) == 0 { +func (c *AddIndexCase) maybeDropIndex(ctx workload.TickContext, st *addIndexState, tickNo int) error { + if !workload.EveryNTick(tickNo, st.NR) || len(st.Indexes) == 0 { return nil } idx := ctx.RNG.IntN(len(st.Indexes)) dropSpec := st.Indexes[idx] - exists, err := IndexExists(ctx, ctx.DB, st.DB, st.Table, dropSpec.Name) + exists, err := workload.IndexExists(ctx, ctx.DB, st.DB, st.Table, dropSpec.Name) if err != nil { return err } if exists { - stmt := "DROP INDEX " + QIdent(dropSpec.Name) + " ON " + QTable(st.DB, st.Table) + stmt := "DROP INDEX " + workload.QIdent(dropSpec.Name) + " ON " + workload.QTable(st.DB, st.Table) if _, err := ctx.DB.ExecContext(ctx, stmt); err != nil { return err } } - c.indexesDropped = append(c.indexesDropped, dropSpec) st.Indexes = append(st.Indexes[:idx], st.Indexes[idx+1:]...) return nil } diff --git a/tests/realtikvtest/brietest/workloadcases/modify_tiflash.go b/tests/realtikvtest/brietest/workloadcases/modify_tiflash.go new file mode 100644 index 0000000000000..7cd0649b08448 --- /dev/null +++ b/tests/realtikvtest/brietest/workloadcases/modify_tiflash.go @@ -0,0 +1,192 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package workloadcases + +import ( + "encoding/json" + "fmt" + + "github.com/pingcap/tidb/pkg/testkit/brhelper/workload" +) + +type ModifyTiFlashCase struct { + Suffix string `json:"suffix"` + N int `json:"n"` + NAP int `json:"nap"` +} + +type modifyTiFlashState struct { + Suffix string `json:"suffix"` + DB string `json:"db"` + Table string `json:"table"` + N int `json:"n"` + NAP int `json:"nap"` + + Ticked int `json:"ticked"` + Inserted int `json:"inserted"` + + Replica int `json:"replica"` + + Checksum workload.TableChecksum `json:"checksum"` + LogDone bool `json:"log_done"` +} + +func (c *ModifyTiFlashCase) Name() string { return "ModifyTiFlash" } + +func (c *ModifyTiFlashCase) Prepare(ctx workload.Context) (json.RawMessage, error) { + suffix := c.Suffix + if suffix == "" { + var err error + suffix, err = workload.RandSuffix() + if err != nil { + return nil, err + } + } + n := c.N + if n <= 0 { + n = 100 + } + nap := c.NAP + if nap <= 0 { + nap = 1 + } + st := modifyTiFlashState{ + Suffix: suffix, + DB: fmt.Sprintf("test_modify_tiflash_%s", suffix), + Table: "t1", + N: n, + NAP: nap, + Replica: 0, + } + if err := workload.ExecAll(ctx, ctx.DB, []string{ + "CREATE DATABASE IF NOT EXISTS " + workload.QIdent(st.DB), + "CREATE TABLE IF NOT EXISTS " + workload.QTable(st.DB, st.Table) + " (" + + "id BIGINT PRIMARY KEY AUTO_INCREMENT," + + "a BIGINT," + + "b BIGINT," + + "c BIGINT" + + ")", + "ALTER TABLE " + workload.QTable(st.DB, st.Table) + " SET TIFLASH REPLICA 0", + }); err != nil { + return nil, err + } + + return json.Marshal(st) +} + +func (c *ModifyTiFlashCase) Tick(ctx workload.TickContext, raw json.RawMessage) error { + var st modifyTiFlashState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + if st.N <= 0 { + st.N = 100 + } + if st.NAP <= 0 { + st.NAP = 2 + } + + tickNo := st.Ticked + 1 + + if _, err := ctx.DB.ExecContext(ctx, "INSERT INTO "+workload.QTable(st.DB, st.Table)+" (a,b,c) VALUES (?,?,?)", + int64(st.Inserted), int64(st.Inserted*7+1), int64(st.Inserted*11+2), + ); err != nil { + return err + } + st.Inserted++ + + if workload.EveryNTick(tickNo, st.N) { + max := st.NAP + if max > 0 { + next := tickNo % (max + 1) + if next == st.Replica { + next = (next + 1) % (max + 1) + } + stmt := fmt.Sprintf("ALTER TABLE %s SET TIFLASH REPLICA %d", workload.QTable(st.DB, st.Table), next) + if _, err := ctx.DB.ExecContext(ctx, stmt); err != nil { + return err + } + st.Replica = next + } + } + + st.Ticked++ + st.LogDone = true + + updated, err := json.Marshal(st) + if err != nil { + return err + } + ctx.UpdateState(updated) + return nil +} + +func (c *ModifyTiFlashCase) Exit(ctx workload.ExitContext, raw json.RawMessage) error { + var st modifyTiFlashState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + + sum, err := workload.AdminChecksumTable(ctx, ctx.DB, st.DB, st.Table) + if err != nil { + return err + } + replica, err := workload.TiFlashReplicaCount(ctx, ctx.DB, st.DB, st.Table) + if err != nil { + return err + } + st.Checksum = sum + st.Replica = replica + st.LogDone = true + + updated, err := json.Marshal(st) + if err != 
nil { + return err + } + ctx.UpdateState(updated) + return nil +} + +func (c *ModifyTiFlashCase) Verify(ctx workload.Context, raw json.RawMessage) error { + var st modifyTiFlashState + if err := json.Unmarshal(raw, &st); err != nil { + return err + } + if err := workload.Require(st.LogDone, "ModifyTiFlash: log not executed"); err != nil { + return err + } + if err := workload.Require(st.Checksum.TotalKvs != "", "ModifyTiFlash: checksum not recorded; run Exit first"); err != nil { + return err + } + + sum, err := workload.AdminChecksumTable(ctx, ctx.DB, st.DB, st.Table) + if err != nil { + return err + } + if err := workload.Require(sum.TotalKvs == st.Checksum.TotalKvs, "ModifyTiFlash: Total_kvs mismatch: got %q want %q", sum.TotalKvs, st.Checksum.TotalKvs); err != nil { + return err + } + if st.Checksum.TotalBytes != "" { + if err := workload.Require(sum.TotalBytes == st.Checksum.TotalBytes, "ModifyTiFlash: Total_bytes mismatch: got %q want %q", sum.TotalBytes, st.Checksum.TotalBytes); err != nil { + return err + } + } + + replica, err := workload.TiFlashReplicaCount(ctx, ctx.DB, st.DB, st.Table) + if err != nil { + return err + } + return workload.Require(replica == st.Replica, "ModifyTiFlash: tiflash replica mismatch: got %d want %d", replica, st.Replica) +} diff --git a/pkg/testkit/brhelper/workload/nexus_common.go b/tests/realtikvtest/brietest/workloadcases/nexus_common.go similarity index 55% rename from pkg/testkit/brhelper/workload/nexus_common.go rename to tests/realtikvtest/brietest/workloadcases/nexus_common.go index ed29d6c34af41..f9a41a04edfbf 100644 --- a/pkg/testkit/brhelper/workload/nexus_common.go +++ b/tests/realtikvtest/brietest/workloadcases/nexus_common.go @@ -12,13 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-package workload +package workloadcases import ( "context" "database/sql" "fmt" - "strings" + + "github.com/pingcap/tidb/pkg/testkit/brhelper/workload" ) type nexusTableState struct { @@ -27,11 +28,6 @@ type nexusTableState struct { Cols []string `json:"cols,omitempty"` } -type nexusDDLEvent struct { - Tick int `json:"tick"` - Stmt string `json:"stmt"` -} - type nexusState struct { Suffix string `json:"suffix"` DB string `json:"db"` @@ -41,27 +37,8 @@ type nexusState struct { NextTableID int `json:"next_table_id"` Tables []nexusTableState `json:"tables"` - Checksums map[string]TableChecksum `json:"checksums,omitempty"` - LogDone bool `json:"log_done"` -} - -type nexusSummary struct { - DB string `json:"db"` - N int `json:"n"` - Ticked int `json:"ticked"` - DDLs []nexusDDLEvent `json:"ddls,omitempty"` -} - -func (s nexusSummary) SummaryTable() string { - var b strings.Builder - _, _ = fmt.Fprintf(&b, "db=%s n=%d ticked=%d", s.DB, s.N, s.Ticked) - if len(s.DDLs) > 0 { - b.WriteString("\nddls:") - for _, e := range s.DDLs { - _, _ = fmt.Fprintf(&b, "\n - [%d] %s", e.Tick, e.Stmt) - } - } - return b.String() + Checksums map[string]workload.TableChecksum `json:"checksums,omitempty"` + LogDone bool `json:"log_done"` } func nexusDefaultN(n int) int { @@ -83,34 +60,31 @@ func nexusTableName(id int) string { return fmt.Sprintf("t_%d", id) } -func nexusExecDDL(ctx context.Context, db *sql.DB, ddls *[]nexusDDLEvent, tick int, stmt string) error { - if ddls != nil { - *ddls = append(*ddls, nexusDDLEvent{Tick: tick, Stmt: stmt}) - } +func nexusExecDDL(ctx context.Context, db *sql.DB, tick int, stmt string) error { _, err := db.ExecContext(ctx, stmt) return err } -func nexusCreateTable(ctx context.Context, db *sql.DB, ddls *[]nexusDDLEvent, tick int, schema, table string) error { - stmt := "CREATE TABLE IF NOT EXISTS " + QTable(schema, table) + " (" + +func nexusCreateTable(ctx context.Context, db *sql.DB, tick int, schema, table string) error { + stmt := "CREATE TABLE IF NOT EXISTS " + workload.QTable(schema, table) + " (" + "id BIGINT PRIMARY KEY AUTO_INCREMENT," + "v BIGINT," + "s VARCHAR(64) NOT NULL" + ")" - return nexusExecDDL(ctx, db, ddls, tick, stmt) + return nexusExecDDL(ctx, db, tick, stmt) } func nexusInsertRow(ctx context.Context, db *sql.DB, schema, table string, tick int) error { - _, err := db.ExecContext(ctx, "INSERT INTO "+QTable(schema, table)+" (v,s) VALUES (?,?)", + _, err := db.ExecContext(ctx, "INSERT INTO "+workload.QTable(schema, table)+" (v,s) VALUES (?,?)", int64(tick), fmt.Sprintf("%s_%d", table, tick), ) return err } -func nexusRecordChecksums(ctx context.Context, db *sql.DB, schema string, tables []nexusTableState) (map[string]TableChecksum, error) { - out := make(map[string]TableChecksum, len(tables)) +func nexusRecordChecksums(ctx context.Context, db *sql.DB, schema string, tables []nexusTableState) (map[string]workload.TableChecksum, error) { + out := make(map[string]workload.TableChecksum, len(tables)) for _, t := range tables { - sum, err := AdminChecksumTable(ctx, db, schema, t.Name) + sum, err := workload.AdminChecksumTable(ctx, db, schema, t.Name) if err != nil { return nil, err } diff --git a/pkg/testkit/brhelper/workload/nexus_ddl_destructive.go b/tests/realtikvtest/brietest/workloadcases/nexus_ddl_destructive.go similarity index 57% rename from pkg/testkit/brhelper/workload/nexus_ddl_destructive.go rename to tests/realtikvtest/brietest/workloadcases/nexus_ddl_destructive.go index 65c9316874297..55e27d81bc1fa 100644 --- 
a/pkg/testkit/brhelper/workload/nexus_ddl_destructive.go +++ b/tests/realtikvtest/brietest/workloadcases/nexus_ddl_destructive.go @@ -12,29 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. -package workload +package workloadcases import ( "encoding/json" "fmt" + + "github.com/pingcap/tidb/pkg/testkit/brhelper/workload" ) type NexusDDLDestructiveCase struct { Suffix string `json:"suffix"` N int `json:"n"` - - ddls []nexusDDLEvent } func (c *NexusDDLDestructiveCase) Name() string { return "NexusDDLDestructive" } -func (c *NexusDDLDestructiveCase) Prepare(ctx Context) (json.RawMessage, error) { - c.ddls = nil - +func (c *NexusDDLDestructiveCase) Prepare(ctx workload.Context) (json.RawMessage, error) { suffix := c.Suffix if suffix == "" { var err error - suffix, err = RandSuffix() + suffix, err = workload.RandSuffix() if err != nil { return nil, err } @@ -51,23 +49,16 @@ func (c *NexusDDLDestructiveCase) Prepare(ctx Context) (json.RawMessage, error) NextTableID: 1, Tables: []nexusTableState{{Name: "t_0"}}, } - if err := nexusExecDDL(ctx, ctx.DB, &c.ddls, 0, "CREATE DATABASE IF NOT EXISTS "+QIdent(st.DB)); err != nil { + if err := nexusExecDDL(ctx, ctx.DB, 0, "CREATE DATABASE IF NOT EXISTS "+workload.QIdent(st.DB)); err != nil { return nil, err } - if err := nexusCreateTable(ctx, ctx.DB, &c.ddls, 0, st.DB, st.Tables[0].Name); err != nil { + if err := nexusCreateTable(ctx, ctx.DB, 0, st.DB, st.Tables[0].Name); err != nil { return nil, err } - - ctx.SetSummary(nexusSummary{ - DB: st.DB, - N: st.N, - Ticked: st.Ticked, - DDLs: c.ddls, - }) return json.Marshal(st) } -func (c *NexusDDLDestructiveCase) Tick(ctx TickContext, raw json.RawMessage) error { +func (c *NexusDDLDestructiveCase) Tick(ctx workload.TickContext, raw json.RawMessage) error { var st nexusState if err := json.Unmarshal(raw, &st); err != nil { return err @@ -80,31 +71,31 @@ func (c *NexusDDLDestructiveCase) Tick(ctx TickContext, raw json.RawMessage) err tickNo := st.Ticked + 1 half := nexusHalf(st.N) - if EveryNTick(tickNo, st.N) { + if workload.EveryNTick(tickNo, st.N) { name := nexusTableName(st.NextTableID) st.NextTableID++ - if err := nexusCreateTable(ctx, ctx.DB, &c.ddls, tickNo, st.DB, name); err != nil { + if err := nexusCreateTable(ctx, ctx.DB, tickNo, st.DB, name); err != nil { return err } st.Tables = append(st.Tables, nexusTableState{Name: name}) } - if EveryNTick(tickNo, half) && len(st.Tables) > 0 { + if workload.EveryNTick(tickNo, half) && len(st.Tables) > 0 { idx := ctx.RNG.IntN(len(st.Tables)) oldName := st.Tables[idx].Name newName := nexusTableName(st.NextTableID) st.NextTableID++ - stmt := "RENAME TABLE " + QTable(st.DB, oldName) + " TO " + QTable(st.DB, newName) - if err := nexusExecDDL(ctx, ctx.DB, &c.ddls, tickNo, stmt); err != nil { + stmt := "RENAME TABLE " + workload.QTable(st.DB, oldName) + " TO " + workload.QTable(st.DB, newName) + if err := nexusExecDDL(ctx, ctx.DB, tickNo, stmt); err != nil { return err } st.Tables[idx].Name = newName } - if EveryNTick(tickNo, 2*st.N) && len(st.Tables) > 0 { + if workload.EveryNTick(tickNo, 2*st.N) && len(st.Tables) > 0 { idx := ctx.RNG.IntN(len(st.Tables)) - stmt := "TRUNCATE TABLE " + QTable(st.DB, st.Tables[idx].Name) - if err := nexusExecDDL(ctx, ctx.DB, &c.ddls, tickNo, stmt); err != nil { + stmt := "TRUNCATE TABLE " + workload.QTable(st.DB, st.Tables[idx].Name) + if err := nexusExecDDL(ctx, ctx.DB, tickNo, stmt); err != nil { return err } } @@ -126,7 +117,7 @@ func (c *NexusDDLDestructiveCase) 
Tick(ctx TickContext, raw json.RawMessage) err return nil } -func (c *NexusDDLDestructiveCase) Exit(ctx ExitContext, raw json.RawMessage) error { +func (c *NexusDDLDestructiveCase) Exit(ctx workload.ExitContext, raw json.RawMessage) error { var st nexusState if err := json.Unmarshal(raw, &st); err != nil { return err @@ -139,13 +130,6 @@ func (c *NexusDDLDestructiveCase) Exit(ctx ExitContext, raw json.RawMessage) err st.Checksums = sums st.LogDone = true - ctx.SetSummary(nexusSummary{ - DB: st.DB, - N: st.N, - Ticked: st.Ticked, - DDLs: c.ddls, - }) - updated, err := json.Marshal(st) if err != nil { return err @@ -154,24 +138,24 @@ func (c *NexusDDLDestructiveCase) Exit(ctx ExitContext, raw json.RawMessage) err return nil } -func (c *NexusDDLDestructiveCase) Verify(ctx Context, raw json.RawMessage) error { +func (c *NexusDDLDestructiveCase) Verify(ctx workload.Context, raw json.RawMessage) error { var st nexusState if err := json.Unmarshal(raw, &st); err != nil { return err } - if err := Require(st.LogDone, "NexusDDLDestructive: log not executed"); err != nil { + if err := workload.Require(st.LogDone, "NexusDDLDestructive: log not executed"); err != nil { return err } - if err := Require(len(st.Checksums) > 0, "NexusDDLDestructive: checksum not recorded; run Exit first"); err != nil { + if err := workload.Require(len(st.Checksums) > 0, "NexusDDLDestructive: checksum not recorded; run Exit first"); err != nil { return err } for _, t := range st.Tables { - ok, err := TableExists(ctx, ctx.DB, st.DB, t.Name) + ok, err := workload.TableExists(ctx, ctx.DB, st.DB, t.Name) if err != nil { return err } - if err := Require(ok, "NexusDDLDestructive: table %s.%s not found", st.DB, t.Name); err != nil { + if err := workload.Require(ok, "NexusDDLDestructive: table %s.%s not found", st.DB, t.Name); err != nil { return err } @@ -179,15 +163,15 @@ func (c *NexusDDLDestructiveCase) Verify(ctx Context, raw json.RawMessage) error if !ok { return fmt.Errorf("NexusDDLDestructive: missing checksum for table %s.%s", st.DB, t.Name) } - got, err := AdminChecksumTable(ctx, ctx.DB, st.DB, t.Name) + got, err := workload.AdminChecksumTable(ctx, ctx.DB, st.DB, t.Name) if err != nil { return err } - if err := Require(got.TotalKvs == want.TotalKvs, "NexusDDLDestructive: Total_kvs mismatch for %s.%s: got %q want %q", st.DB, t.Name, got.TotalKvs, want.TotalKvs); err != nil { + if err := workload.Require(got.TotalKvs == want.TotalKvs, "NexusDDLDestructive: Total_kvs mismatch for %s.%s: got %q want %q", st.DB, t.Name, got.TotalKvs, want.TotalKvs); err != nil { return err } if want.TotalBytes != "" { - if err := Require(got.TotalBytes == want.TotalBytes, "NexusDDLDestructive: Total_bytes mismatch for %s.%s: got %q want %q", st.DB, t.Name, got.TotalBytes, want.TotalBytes); err != nil { + if err := workload.Require(got.TotalBytes == want.TotalBytes, "NexusDDLDestructive: Total_bytes mismatch for %s.%s: got %q want %q", st.DB, t.Name, got.TotalBytes, want.TotalBytes); err != nil { return err } } From eac4837b9c1f74002be6508ed039dd00dc347b14 Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Fri, 30 Jan 2026 05:53:59 +0000 Subject: [PATCH 12/18] tidy up codes Signed-off-by: Juncen Yu --- pkg/testkit/brhelper/workload/runner.go | 67 ++++++++++++------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/pkg/testkit/brhelper/workload/runner.go b/pkg/testkit/brhelper/workload/runner.go index 6e8e5b05bf5b2..2e8ae8465f1ec 100644 --- a/pkg/testkit/brhelper/workload/runner.go +++ b/pkg/testkit/brhelper/workload/runner.go @@ 
-268,40 +268,39 @@ func (r *Runner) runParallelTick( var firstErr error for _, spec := range selected { - spec := spec - wg.Add(1) - go func() { - defer wg.Done() - - mu.Lock() - state, ok := states[spec.Name] - mu.Unlock() - if !ok { - once.Do(func() { - firstErr = fmt.Errorf("workload: case %q not found in state store; run Prepare first", spec.Name) - cancel() - }) - return - } - rng := rngs[spec.Name] - - tickCtx := TickContext{ - Context: Context{Context: runCtx, DB: r.db}, - RNG: rng, - UpdateStateFn: func(updated json.RawMessage) { - mu.Lock() - states[spec.Name] = updated - mu.Unlock() - }, - } - if err := spec.Case.Tick(tickCtx, state); err != nil { - once.Do(func() { - firstErr = err - cancel() - }) - return - } - }() + wg.Go( + func() { + defer wg.Done() + + mu.Lock() + state, ok := states[spec.Name] + mu.Unlock() + if !ok { + once.Do(func() { + firstErr = fmt.Errorf("workload: case %q not found in state store; run Prepare first", spec.Name) + cancel() + }) + return + } + rng := rngs[spec.Name] + + tickCtx := TickContext{ + Context: Context{Context: runCtx, DB: r.db}, + RNG: rng, + UpdateStateFn: func(updated json.RawMessage) { + mu.Lock() + states[spec.Name] = updated + mu.Unlock() + }, + } + if err := spec.Case.Tick(tickCtx, state); err != nil { + once.Do(func() { + firstErr = err + cancel() + }) + return + } + }) } wg.Wait() return firstErr From 02c38b590a9359411e1b61a058caa00215d88812 Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Mon, 2 Feb 2026 11:30:03 +0000 Subject: [PATCH 13/18] tidy up codes Signed-off-by: Juncen Yu --- br/pkg/restore/log_client/client.go | 16 +- br/pkg/restore/log_client/id_map.go | 151 +++----- br/pkg/restore/log_client/ingest_items.go | 60 ++- .../restore/log_client/pitr_id_map_payload.go | 357 ++++++++++++++++++ br/pkg/restore/log_client/tiflash_items.go | 58 ++- br/pkg/task/operator/checksum_table.go | 12 +- br/pkg/task/stream.go | 245 +++++++++--- br/tests/br_restore_checkpoint/run.sh | 13 +- go.mod | 1 + pkg/testkit/brhelper/workload/runner.go | 2 - 10 files changed, 733 insertions(+), 182 deletions(-) create mode 100644 br/pkg/restore/log_client/pitr_id_map_payload.go diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index 923913f694ba1..293da264273dc 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -243,6 +243,10 @@ func (rc *LogClient) LastOne() bool { return rc.lastRestore } +func (rc *LogClient) ValidateNoTiFlashReplica() error { + return rc.validateNoTiFlashReplica() +} + type restoreStatistics struct { // restoreSSTKVSize is the total size (Original KV length) of KV pairs restored from SST files. 
restoreSSTKVSize uint64 @@ -1990,12 +1994,22 @@ func (rc *LogClient) SaveIdMapWithFailPoints( ctx context.Context, manager *stream.TableMappingManager, logCheckpointMetaManager checkpoint.LogMetaManagerT, +) error { + payload := newPitrIdMapPayload(manager.ToProto()) + return rc.SavePitrIdMapPayloadWithFailPoints(ctx, rc.restoreTS, payload, logCheckpointMetaManager) +} + +func (rc *LogClient) SavePitrIdMapPayloadWithFailPoints( + ctx context.Context, + restoredTS uint64, + payload *backuppb.PitrIdMapPayload, + logCheckpointMetaManager checkpoint.LogMetaManagerT, ) error { failpoint.Inject("failed-before-id-maps-saved", func(_ failpoint.Value) { failpoint.Return(errors.New("failpoint: failed before id maps saved")) }) - if err := rc.saveIDMap(ctx, manager, logCheckpointMetaManager); err != nil { + if err := rc.savePitrIdMapPayload(ctx, restoredTS, payload, logCheckpointMetaManager); err != nil { return errors.Trace(err) } diff --git a/br/pkg/restore/log_client/id_map.go b/br/pkg/restore/log_client/id_map.go index 208d66c5065fd..0d49129527177 100644 --- a/br/pkg/restore/log_client/id_map.go +++ b/br/pkg/restore/log_client/id_map.go @@ -18,7 +18,6 @@ import ( "context" "fmt" - "github.com/gogo/protobuf/proto" "github.com/pingcap/errors" backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/log" @@ -27,7 +26,6 @@ import ( "github.com/pingcap/tidb/br/pkg/metautil" "github.com/pingcap/tidb/br/pkg/restore" "github.com/pingcap/tidb/br/pkg/stream" - "github.com/pingcap/tidb/pkg/kv" "github.com/pingcap/tidb/pkg/objstore/storeapi" "github.com/pingcap/tidb/pkg/parser/ast" "go.uber.org/zap" @@ -63,32 +61,8 @@ func (rc *LogClient) saveIDMap( manager *stream.TableMappingManager, logCheckpointMetaManager checkpoint.LogMetaManagerT, ) error { - dbmaps := manager.ToProto() - if checkpointStorage := rc.tryGetCheckpointStorage(logCheckpointMetaManager); checkpointStorage != nil { - log.Info("checkpoint storage is specified, load pitr id map from the checkpoint storage.") - if err := rc.saveIDMap2Storage(ctx, checkpointStorage, dbmaps); err != nil { - return errors.Trace(err) - } - } else if rc.pitrIDMapTableExists() { - if err := rc.saveIDMap2Table(ctx, dbmaps); err != nil { - return errors.Trace(err) - } - } else { - log.Info("the table mysql.tidb_pitr_id_map does not exist, maybe the cluster version is old.") - if err := rc.saveIDMap2Storage(ctx, rc.storage, dbmaps); err != nil { - return errors.Trace(err) - } - } - - if rc.useCheckpoint { - log.Info("save checkpoint task info with InLogRestoreAndIdMapPersist status") - if err := logCheckpointMetaManager.SaveCheckpointProgress(ctx, &checkpoint.CheckpointProgress{ - Progress: checkpoint.InLogRestoreAndIdMapPersisted, - }); err != nil { - return errors.Trace(err) - } - } - return nil + payload := newPitrIdMapPayload(manager.ToProto()) + return rc.savePitrIdMapPayload(ctx, rc.restoreTS, payload, logCheckpointMetaManager) } func (rc *LogClient) saveIDMap2Storage( @@ -107,48 +81,49 @@ func (rc *LogClient) saveIDMap2Storage( } func (rc *LogClient) saveIDMap2Table(ctx context.Context, dbMaps []*backuppb.PitrDBMap) error { - backupmeta := &backuppb.BackupMeta{DbMaps: dbMaps} - data, err := proto.Marshal(backupmeta) - if err != nil { + payload := newPitrIdMapPayload(dbMaps) + if existing, found, err := rc.loadPitrIdMapPayloadFromTable(ctx, rc.restoreTS, rc.restoreID); err != nil { return errors.Trace(err) + } else if found { + payload.IngestItems = existing.IngestItems + payload.TiflashItems = existing.TiflashItems } + return 
errors.Trace(rc.savePitrIdMapPayloadToTable(ctx, rc.restoreTS, payload)) +} - hasRestoreIDColumn := rc.pitrIDMapHasRestoreIDColumn() - - if hasRestoreIDColumn { - // new version with restore_id column - // clean the dirty id map at first - err = rc.unsafeSession.ExecuteInternal(ctx, "DELETE FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %? and restore_id = %?;", - rc.restoreTS, rc.upstreamClusterID, rc.restoreID) - if err != nil { +func (rc *LogClient) savePitrIdMapPayload( + ctx context.Context, + restoredTS uint64, + payload *backuppb.PitrIdMapPayload, + logCheckpointMetaManager checkpoint.LogMetaManagerT, +) error { + if payload == nil { + return errors.New("pitr id map payload is nil") + } + tableExists := rc.pitrIDMapTableExists() + if tableExists { + if err := rc.savePitrIdMapPayloadToTable(ctx, restoredTS, payload); err != nil { return errors.Trace(err) } - replacePitrIDMapSQL := "REPLACE INTO mysql.tidb_pitr_id_map (restore_id, restored_ts, upstream_cluster_id, segment_id, id_map) VALUES (%?, %?, %?, %?, %?);" - for startIdx, segmentId := 0, 0; startIdx < len(data); segmentId += 1 { - endIdx := min(startIdx+PITRIdMapBlockSize, len(data)) - err := rc.unsafeSession.ExecuteInternal(ctx, replacePitrIDMapSQL, rc.restoreID, rc.restoreTS, rc.upstreamClusterID, segmentId, data[startIdx:endIdx]) - if err != nil { - return errors.Trace(err) - } - startIdx = endIdx + } + if checkpointStorage := rc.tryGetCheckpointStorage(logCheckpointMetaManager); checkpointStorage != nil { + log.Info("checkpoint storage is specified, save pitr id map to the checkpoint storage.") + if err := rc.saveIDMap2Storage(ctx, checkpointStorage, payload.GetDbMaps()); err != nil { + return errors.Trace(err) } - } else { - // old version without restore_id column - use default value 0 for restore_id - log.Info("mysql.tidb_pitr_id_map table does not have restore_id column, using backward compatible mode") - // clean the dirty id map at first (without restore_id filter) - err = rc.unsafeSession.ExecuteInternal(ctx, "DELETE FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? 
and upstream_cluster_id = %?;", - rc.restoreTS, rc.upstreamClusterID) - if err != nil { + } else if !tableExists { + log.Info("the table mysql.tidb_pitr_id_map does not exist, maybe the cluster version is old.") + if err := rc.saveIDMap2Storage(ctx, rc.storage, payload.GetDbMaps()); err != nil { return errors.Trace(err) } - replacePitrIDMapSQL := "REPLACE INTO mysql.tidb_pitr_id_map (restored_ts, upstream_cluster_id, segment_id, id_map) VALUES (%?, %?, %?, %?);" - for startIdx, segmentId := 0, 0; startIdx < len(data); segmentId += 1 { - endIdx := min(startIdx+PITRIdMapBlockSize, len(data)) - err := rc.unsafeSession.ExecuteInternal(ctx, replacePitrIDMapSQL, rc.restoreTS, rc.upstreamClusterID, segmentId, data[startIdx:endIdx]) - if err != nil { - return errors.Trace(err) - } - startIdx = endIdx + } + + if rc.useCheckpoint { + log.Info("save checkpoint task info with InLogRestoreAndIdMapPersist status") + if err := logCheckpointMetaManager.SaveCheckpointProgress(ctx, &checkpoint.CheckpointProgress{ + Progress: checkpoint.InLogRestoreAndIdMapPersisted, + }); err != nil { + return errors.Trace(err) } } return nil @@ -165,7 +140,7 @@ func (rc *LogClient) loadSchemasMap( return dbMaps, errors.Trace(err) } if rc.pitrIDMapTableExists() { - dbMaps, err := rc.loadSchemasMapFromTable(ctx, restoredTS, true) + dbMaps, err := rc.loadSchemasMapFromTable(ctx, restoredTS) return dbMaps, errors.Trace(err) } log.Info("the table mysql.tidb_pitr_id_map does not exist, maybe the cluster version is old.") @@ -177,7 +152,7 @@ func (rc *LogClient) loadSchemasMapFromLastTask(ctx context.Context, lastRestore if !rc.pitrIDMapTableExists() { return nil, errors.Annotatef(berrors.ErrPiTRIDMapTableNotFound, "segmented restore is impossible") } - return rc.loadSchemasMapFromTable(ctx, lastRestoredTS, false) + return rc.loadSchemasMapFromTable(ctx, lastRestoredTS) } func (rc *LogClient) loadSchemasMapFromStorage( @@ -210,52 +185,14 @@ func (rc *LogClient) loadSchemasMapFromStorage( func (rc *LogClient) loadSchemasMapFromTable( ctx context.Context, restoredTS uint64, - onlyThisRestore bool, ) ([]*backuppb.PitrDBMap, error) { - useRestoreIDFilter := onlyThisRestore && rc.pitrIDMapHasRestoreIDColumn() - - var getPitrIDMapSQL string - var args []any - - if useRestoreIDFilter { - // new version with restore_id column - getPitrIDMapSQL = "SELECT segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restore_id = %? and restored_ts = %? and upstream_cluster_id = %? ORDER BY segment_id;" - args = []any{rc.restoreID, restoredTS, rc.upstreamClusterID} - } else { - getPitrIDMapSQL = "SELECT segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %? 
ORDER BY segment_id;" - args = []any{restoredTS, rc.upstreamClusterID} - } - - execCtx := rc.unsafeSession.GetSessionCtx().GetRestrictedSQLExecutor() - rows, _, errSQL := execCtx.ExecRestrictedSQL( - kv.WithInternalSourceType(ctx, kv.InternalTxnBR), - nil, - getPitrIDMapSQL, - args..., - ) - if errSQL != nil { - return nil, errors.Annotatef(errSQL, "failed to get pitr id map from mysql.tidb_pitr_id_map") + payload, found, err := rc.loadPitrIdMapPayloadForSegment(ctx, restoredTS) + if err != nil { + return nil, errors.Trace(err) } - if len(rows) == 0 { + if !found { log.Info("pitr id map does not exist", zap.Uint64("restored ts", restoredTS)) return nil, nil } - metaData := make([]byte, 0, len(rows)*PITRIdMapBlockSize) - for i, row := range rows { - elementID := row.GetUint64(0) - if uint64(i) != elementID { - return nil, errors.Errorf("the part(segment_id = %d) of pitr id map is lost", i) - } - d := row.GetBytes(1) - if len(d) == 0 { - return nil, errors.Errorf("get the empty part(segment_id = %d) of pitr id map", i) - } - metaData = append(metaData, d...) - } - backupMeta := &backuppb.BackupMeta{} - if err := backupMeta.Unmarshal(metaData); err != nil { - return nil, errors.Trace(err) - } - - return backupMeta.GetDbMaps(), nil + return payload.GetDbMaps(), nil } diff --git a/br/pkg/restore/log_client/ingest_items.go b/br/pkg/restore/log_client/ingest_items.go index 16982e3b6ec7e..1cf7e6507416e 100644 --- a/br/pkg/restore/log_client/ingest_items.go +++ b/br/pkg/restore/log_client/ingest_items.go @@ -30,9 +30,38 @@ func (rc *LogClient) LoadIngestRecorderItems( restoredTS uint64, logCheckpointMetaManager checkpoint.SegmentedRestoreStorage, ) (map[int64]map[int64]bool, error) { + tableExists := rc.pitrIDMapTableExists() + if tableExists { + payload, found, err := rc.loadPitrIdMapPayloadForSegment(ctx, restoredTS) + if err != nil { + return nil, errors.Trace(err) + } + if found { + items, err := PitrIngestItemsFromPayload(payload) + if err != nil { + return nil, errors.Trace(err) + } + if items == nil { + items = map[int64]map[int64]bool{} + } + log.Info("loaded pitr ingest items", + zap.Uint64("restored-ts", restoredTS), + zap.Int("table-count", len(items)), + zap.Int("index-count", ingestrec.CountItems(items))) + return items, nil + } + } + if logCheckpointMetaManager == nil { + if tableExists { + return nil, nil + } return nil, errors.New("checkpoint meta manager is not initialized") } + if tableExists { + log.Info("pitr ingest items not found in mysql.tidb_pitr_id_map, fallback to checkpoint storage", + zap.Uint64("restored-ts", restoredTS)) + } clusterID := rc.GetClusterID(ctx) items, found, err := logCheckpointMetaManager.LoadPITRIngestItems(ctx, clusterID, restoredTS) if err != nil { @@ -58,11 +87,40 @@ func (rc *LogClient) SaveIngestRecorderItems( items map[int64]map[int64]bool, logCheckpointMetaManager checkpoint.SegmentedRestoreStorage, ) error { + if rc.pitrIDMapTableExists() { + payload, found, err := rc.loadPitrIdMapPayloadForSegment(ctx, restoredTS) + if err != nil { + return errors.Trace(err) + } + if !found { + if logManager, ok := logCheckpointMetaManager.(checkpoint.LogMetaManagerT); ok { + dbMaps, err := rc.loadSchemasMap(ctx, restoredTS, logManager) + if err != nil { + return errors.Trace(err) + } + if len(dbMaps) > 0 { + payload = newPitrIdMapPayload(dbMaps) + } + } + } + if payload == nil { + log.Warn("pitr id map payload not found when saving ingest items", + zap.Uint64("restored-ts", restoredTS)) + return errors.New("pitr id map payload not found for ingest items") + } 
+ payload.IngestItems = PitrIngestItemsToProto(items) + log.Info("saving pitr ingest items", + zap.Uint64("restored-ts", restoredTS), + zap.Int("table-count", len(items)), + zap.Int("index-count", ingestrec.CountItems(items))) + return errors.Trace(rc.savePitrIdMapPayloadToTable(ctx, restoredTS, payload)) + } + if logCheckpointMetaManager == nil { return errors.New("checkpoint meta manager is not initialized") } clusterID := rc.GetClusterID(ctx) - log.Info("saving pitr ingest items", + log.Info("saving pitr ingest items to checkpoint storage", zap.Uint64("restored-ts", restoredTS), zap.Int("table-count", len(items)), zap.Int("index-count", ingestrec.CountItems(items))) diff --git a/br/pkg/restore/log_client/pitr_id_map_payload.go b/br/pkg/restore/log_client/pitr_id_map_payload.go new file mode 100644 index 0000000000000..2e7465fc31273 --- /dev/null +++ b/br/pkg/restore/log_client/pitr_id_map_payload.go @@ -0,0 +1,357 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package logclient + +import ( + "context" + + "github.com/gogo/protobuf/proto" + "github.com/pingcap/errors" + backuppb "github.com/pingcap/kvproto/pkg/brpb" + "github.com/pingcap/log" + "github.com/pingcap/tidb/pkg/kv" + "github.com/pingcap/tidb/pkg/meta/model" +) + +const pitrIdMapPayloadVersion int32 = 1 + +// DecodePitrIdMapPayload parses the payload and returns db maps for callers outside logclient. 
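+// The payload also carries the ingest and TiFlash recorder items that segmented PiTR
+// restores persist between segments; only the db maps are returned here.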
+func DecodePitrIdMapPayload(metaData []byte) ([]*backuppb.PitrDBMap, error) { + payload, err := decodePitrIdMapPayload(metaData) + if err != nil { + return nil, errors.Trace(err) + } + return payload.GetDbMaps(), nil +} + +func PitrIngestItemsFromProto(items []*backuppb.PitrIngestItem) map[int64]map[int64]bool { + if len(items) == 0 { + return map[int64]map[int64]bool{} + } + result := make(map[int64]map[int64]bool, len(items)) + for _, item := range items { + if item == nil { + continue + } + indexes := make(map[int64]bool, len(item.Indexes)) + for _, index := range item.Indexes { + if index == nil { + continue + } + indexes[index.IndexId] = index.IsPrimary + } + result[item.TableId] = indexes + } + return result +} + +func PitrIngestItemsToProto(items map[int64]map[int64]bool) []*backuppb.PitrIngestItem { + if len(items) == 0 { + return nil + } + result := make([]*backuppb.PitrIngestItem, 0, len(items)) + for tableID, indexMap := range items { + indexes := make([]*backuppb.PitrIngestIndex, 0, len(indexMap)) + for indexID, isPrimary := range indexMap { + indexes = append(indexes, &backuppb.PitrIngestIndex{ + IndexId: indexID, + IsPrimary: isPrimary, + }) + } + result = append(result, &backuppb.PitrIngestItem{ + TableId: tableID, + Indexes: indexes, + }) + } + return result +} + +func PitrTiFlashItemsFromProto(items []*backuppb.PitrTiFlashItem) map[int64]model.TiFlashReplicaInfo { + if len(items) == 0 { + return map[int64]model.TiFlashReplicaInfo{} + } + result := make(map[int64]model.TiFlashReplicaInfo, len(items)) + for _, item := range items { + if item == nil || item.Replica == nil { + continue + } + replica := item.Replica + result[item.TableId] = model.TiFlashReplicaInfo{ + Count: replica.Count, + LocationLabels: append([]string(nil), replica.LocationLabels...), + Available: replica.Available, + AvailablePartitionIDs: append([]int64(nil), replica.AvailablePartitionIds...), + } + } + return result +} + +func PitrTiFlashItemsToProto(items map[int64]model.TiFlashReplicaInfo) []*backuppb.PitrTiFlashItem { + if len(items) == 0 { + return nil + } + result := make([]*backuppb.PitrTiFlashItem, 0, len(items)) + for tableID, replica := range items { + result = append(result, &backuppb.PitrTiFlashItem{ + TableId: tableID, + Replica: &backuppb.PitrTiFlashReplicaInfo{ + Count: replica.Count, + LocationLabels: append([]string(nil), replica.LocationLabels...), + Available: replica.Available, + AvailablePartitionIds: append([]int64(nil), replica.AvailablePartitionIDs...), + }, + }) + } + return result +} + +func PitrIngestItemsFromPayload(payload *backuppb.PitrIdMapPayload) (map[int64]map[int64]bool, error) { + if payload == nil { + return nil, errors.New("pitr id map payload is nil") + } + return PitrIngestItemsFromProto(payload.IngestItems), nil +} + +func PitrTiFlashItemsFromPayload(payload *backuppb.PitrIdMapPayload) (map[int64]model.TiFlashReplicaInfo, error) { + if payload == nil { + return nil, errors.New("pitr id map payload is nil") + } + return PitrTiFlashItemsFromProto(payload.TiflashItems), nil +} + +func decodePitrIdMapPayload(metaData []byte) (*backuppb.PitrIdMapPayload, error) { + payload := &backuppb.PitrIdMapPayload{} + if err := payload.Unmarshal(metaData); err != nil { + return nil, errors.Trace(err) + } + return payload, nil +} + +func newPitrIdMapPayload(dbMaps []*backuppb.PitrDBMap) *backuppb.PitrIdMapPayload { + return &backuppb.PitrIdMapPayload{ + Version: pitrIdMapPayloadVersion, + DbMaps: dbMaps, + } +} + +func NewPitrIdMapPayload(dbMaps []*backuppb.PitrDBMap) 
*backuppb.PitrIdMapPayload { + return newPitrIdMapPayload(dbMaps) +} + +func (rc *LogClient) loadPitrIdMapDataFromTable( + ctx context.Context, + restoredTS uint64, + restoreID uint64, +) ([]byte, bool, error) { + var getPitrIDMapSQL string + var args []any + + if rc.pitrIDMapHasRestoreIDColumn() { + getPitrIDMapSQL = "SELECT segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restore_id = %? and restored_ts = %? and upstream_cluster_id = %? ORDER BY segment_id;" + args = []any{restoreID, restoredTS, rc.upstreamClusterID} + } else { + getPitrIDMapSQL = "SELECT segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %? ORDER BY segment_id;" + args = []any{restoredTS, rc.upstreamClusterID} + } + + execCtx := rc.unsafeSession.GetSessionCtx().GetRestrictedSQLExecutor() + rows, _, errSQL := execCtx.ExecRestrictedSQL( + kv.WithInternalSourceType(ctx, kv.InternalTxnBR), + nil, + getPitrIDMapSQL, + args..., + ) + if errSQL != nil { + return nil, false, errors.Annotatef(errSQL, "failed to get pitr id map from mysql.tidb_pitr_id_map") + } + if len(rows) == 0 { + return nil, false, nil + } + metaData := make([]byte, 0, len(rows)*PITRIdMapBlockSize) + for i, row := range rows { + elementID := row.GetUint64(0) + if uint64(i) != elementID { + return nil, false, errors.Errorf("the part(segment_id = %d) of pitr id map is lost", i) + } + d := row.GetBytes(1) + if len(d) == 0 { + return nil, false, errors.Errorf("get the empty part(segment_id = %d) of pitr id map", i) + } + metaData = append(metaData, d...) + } + return metaData, true, nil +} + +func (rc *LogClient) loadLatestRestoreIDFromTable( + ctx context.Context, + restoredTS uint64, +) (uint64, bool, error) { + if !rc.pitrIDMapHasRestoreIDColumn() { + return 0, false, errors.New("restore_id column is not available") + } + execCtx := rc.unsafeSession.GetSessionCtx().GetRestrictedSQLExecutor() + rows, _, errSQL := execCtx.ExecRestrictedSQL( + kv.WithInternalSourceType(ctx, kv.InternalTxnBR), + nil, + "SELECT restore_id FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %? 
ORDER BY restore_id DESC LIMIT 1;", + restoredTS, rc.upstreamClusterID, + ) + if errSQL != nil { + return 0, false, errors.Annotatef(errSQL, "failed to get latest restore_id from mysql.tidb_pitr_id_map") + } + if len(rows) == 0 { + return 0, false, nil + } + return rows[0].GetUint64(0), true, nil +} + +func (rc *LogClient) resolvePitrIdMapRestoreID( + ctx context.Context, + restoredTS uint64, +) (uint64, bool, error) { + if !rc.pitrIDMapHasRestoreIDColumn() { + return 0, true, nil + } + if restoredTS == rc.restoreTS { + return rc.restoreID, true, nil + } + restoreID, found, err := rc.loadLatestRestoreIDFromTable(ctx, restoredTS) + if err != nil { + return 0, false, errors.Trace(err) + } + if !found { + return 0, false, nil + } + return restoreID, true, nil +} + +func (rc *LogClient) normalizePitrIdMapRestoreID(restoreID uint64) uint64 { + if rc.pitrIDMapHasRestoreIDColumn() { + return restoreID + } + return 0 +} + +func (rc *LogClient) loadPitrIdMapPayloadForSegment( + ctx context.Context, + restoredTS uint64, +) (*backuppb.PitrIdMapPayload, bool, error) { + restoreID, found, err := rc.resolvePitrIdMapRestoreID(ctx, restoredTS) + if err != nil { + return nil, false, errors.Trace(err) + } + if !found { + return nil, false, nil + } + restoreID = rc.normalizePitrIdMapRestoreID(restoreID) + return rc.loadPitrIdMapPayloadFromTable(ctx, restoredTS, restoreID) +} + +func (rc *LogClient) LoadPitrIdMapPayloadForSegment( + ctx context.Context, + restoredTS uint64, +) (*backuppb.PitrIdMapPayload, bool, error) { + if !rc.pitrIDMapTableExists() { + return nil, false, nil + } + return rc.loadPitrIdMapPayloadForSegment(ctx, restoredTS) +} + +func (rc *LogClient) loadPitrIdMapPayloadFromTable( + ctx context.Context, + restoredTS uint64, + restoreID uint64, +) (*backuppb.PitrIdMapPayload, bool, error) { + restoreID = rc.normalizePitrIdMapRestoreID(restoreID) + payload, found, err := rc.loadPitrIdMapPayloadFromTableOnce(ctx, restoredTS, restoreID) + if err != nil { + return nil, false, errors.Trace(err) + } + return payload, found, nil +} + +func (rc *LogClient) loadPitrIdMapPayloadFromTableOnce( + ctx context.Context, + restoredTS uint64, + restoreID uint64, +) (*backuppb.PitrIdMapPayload, bool, error) { + metaData, found, err := rc.loadPitrIdMapDataFromTable(ctx, restoredTS, restoreID) + if err != nil { + return nil, false, errors.Trace(err) + } + if !found { + return nil, false, nil + } + payload, err := decodePitrIdMapPayload(metaData) + if err != nil { + return nil, false, errors.Trace(err) + } + return payload, true, nil +} + +func (rc *LogClient) savePitrIdMapPayloadToTable( + ctx context.Context, + restoredTS uint64, + payload *backuppb.PitrIdMapPayload, +) error { + if payload == nil { + return errors.New("pitr id map payload is nil") + } + if payload.Version == 0 { + payload.Version = pitrIdMapPayloadVersion + } + data, err := proto.Marshal(payload) + if err != nil { + return errors.Trace(err) + } + + hasRestoreIDColumn := rc.pitrIDMapHasRestoreIDColumn() + if hasRestoreIDColumn { + err = rc.unsafeSession.ExecuteInternal(ctx, + "DELETE FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %? 
and restore_id = %?;", + restoredTS, rc.upstreamClusterID, rc.restoreID) + if err != nil { + return errors.Trace(err) + } + replacePitrIDMapSQL := "REPLACE INTO mysql.tidb_pitr_id_map (restore_id, restored_ts, upstream_cluster_id, segment_id, id_map) VALUES (%?, %?, %?, %?, %?);" + for startIdx, segmentID := 0, 0; startIdx < len(data); segmentID += 1 { + endIdx := min(startIdx+PITRIdMapBlockSize, len(data)) + err := rc.unsafeSession.ExecuteInternal(ctx, replacePitrIDMapSQL, rc.restoreID, restoredTS, rc.upstreamClusterID, segmentID, data[startIdx:endIdx]) + if err != nil { + return errors.Trace(err) + } + startIdx = endIdx + } + return nil + } + + log.Info("mysql.tidb_pitr_id_map table does not have restore_id column, using backward compatible mode") + err = rc.unsafeSession.ExecuteInternal(ctx, + "DELETE FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %?;", + restoredTS, rc.upstreamClusterID) + if err != nil { + return errors.Trace(err) + } + replacePitrIDMapSQL := "REPLACE INTO mysql.tidb_pitr_id_map (restored_ts, upstream_cluster_id, segment_id, id_map) VALUES (%?, %?, %?, %?);" + for startIdx, segmentID := 0, 0; startIdx < len(data); segmentID += 1 { + endIdx := min(startIdx+PITRIdMapBlockSize, len(data)) + err := rc.unsafeSession.ExecuteInternal(ctx, replacePitrIDMapSQL, restoredTS, rc.upstreamClusterID, segmentID, data[startIdx:endIdx]) + if err != nil { + return errors.Trace(err) + } + startIdx = endIdx + } + return nil +} diff --git a/br/pkg/restore/log_client/tiflash_items.go b/br/pkg/restore/log_client/tiflash_items.go index 90798b9b8ddef..5e790e7144a72 100644 --- a/br/pkg/restore/log_client/tiflash_items.go +++ b/br/pkg/restore/log_client/tiflash_items.go @@ -30,9 +30,37 @@ func (rc *LogClient) LoadTiFlashRecorderItems( restoredTS uint64, logCheckpointMetaManager checkpoint.SegmentedRestoreStorage, ) (map[int64]model.TiFlashReplicaInfo, error) { + tableExists := rc.pitrIDMapTableExists() + if tableExists { + payload, found, err := rc.loadPitrIdMapPayloadForSegment(ctx, restoredTS) + if err != nil { + return nil, errors.Trace(err) + } + if found { + items, err := PitrTiFlashItemsFromPayload(payload) + if err != nil { + return nil, errors.Trace(err) + } + if items == nil { + items = map[int64]model.TiFlashReplicaInfo{} + } + log.Info("loaded pitr tiflash items", + zap.Uint64("restored-ts", restoredTS), + zap.Int("item-count", len(items))) + return items, nil + } + } + if logCheckpointMetaManager == nil { + if tableExists { + return nil, nil + } return nil, errors.New("checkpoint meta manager is not initialized") } + if tableExists { + log.Info("pitr tiflash items not found in mysql.tidb_pitr_id_map, fallback to checkpoint storage", + zap.Uint64("restored-ts", restoredTS)) + } clusterID := rc.GetClusterID(ctx) items, found, err := logCheckpointMetaManager.LoadPITRTiFlashItems(ctx, clusterID, restoredTS) if err != nil { @@ -57,11 +85,39 @@ func (rc *LogClient) SaveTiFlashRecorderItems( items map[int64]model.TiFlashReplicaInfo, logCheckpointMetaManager checkpoint.SegmentedRestoreStorage, ) error { + if rc.pitrIDMapTableExists() { + payload, found, err := rc.loadPitrIdMapPayloadForSegment(ctx, restoredTS) + if err != nil { + return errors.Trace(err) + } + if !found { + if logManager, ok := logCheckpointMetaManager.(checkpoint.LogMetaManagerT); ok { + dbMaps, err := rc.loadSchemasMap(ctx, restoredTS, logManager) + if err != nil { + return errors.Trace(err) + } + if len(dbMaps) > 0 { + payload = newPitrIdMapPayload(dbMaps) + } + } + } + if payload == nil { + 
log.Warn("pitr id map payload not found when saving tiflash items", + zap.Uint64("restored-ts", restoredTS)) + return errors.New("pitr id map payload not found for tiflash items") + } + payload.TiflashItems = PitrTiFlashItemsToProto(items) + log.Info("saving pitr tiflash items", + zap.Uint64("restored-ts", restoredTS), + zap.Int("item-count", len(items))) + return errors.Trace(rc.savePitrIdMapPayloadToTable(ctx, restoredTS, payload)) + } + if logCheckpointMetaManager == nil { return errors.New("checkpoint meta manager is not initialized") } clusterID := rc.GetClusterID(ctx) - log.Info("saving pitr tiflash items", + log.Info("saving pitr tiflash items to checkpoint storage", zap.Uint64("restored-ts", restoredTS), zap.Int("item-count", len(items))) return errors.Trace(logCheckpointMetaManager.SavePITRTiFlashItems(ctx, clusterID, restoredTS, items)) diff --git a/br/pkg/task/operator/checksum_table.go b/br/pkg/task/operator/checksum_table.go index 592ab34caccbb..682a90cd32482 100644 --- a/br/pkg/task/operator/checksum_table.go +++ b/br/pkg/task/operator/checksum_table.go @@ -258,11 +258,11 @@ func (c *checksumTableCtx) loadPitrIdMap(ctx context.Context, g glue.Glue, resto for _, row := range rows { restoreID, elementID, data := getRowColumns(row) if lastRestoreID != restoreID { - backupMeta := &backup.BackupMeta{} - if err := backupMeta.Unmarshal(metaData); err != nil { + dbMaps, err := logclient.DecodePitrIdMapPayload(metaData) + if err != nil { return nil, errors.Trace(err) } - pitrDBMap = append(pitrDBMap, backupMeta.DbMaps...) + pitrDBMap = append(pitrDBMap, dbMaps...) metaData = make([]byte, 0) lastRestoreID = restoreID nextSegmentID = uint64(0) @@ -277,11 +277,11 @@ func (c *checksumTableCtx) loadPitrIdMap(ctx context.Context, g glue.Glue, resto nextSegmentID += 1 } if len(metaData) > 0 { - backupMeta := &backup.BackupMeta{} - if err := backupMeta.Unmarshal(metaData); err != nil { + dbMaps, err := logclient.DecodePitrIdMapPayload(metaData) + if err != nil { return nil, errors.Trace(err) } - pitrDBMap = append(pitrDBMap, backupMeta.DbMaps...) + pitrDBMap = append(pitrDBMap, dbMaps...) 
} return pitrDBMap, nil } diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index 923f1ff695943..a22718e77ac58 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -1628,21 +1628,48 @@ func restoreStream( } } - // build and save id map - if err := buildAndSaveIDMapIfNeeded(ctx, client, cfg); err != nil { + savedIDMap := isCurrentIdMapSaved(cfg.checkpointTaskInfo) + segmentedPayloads, savedIDMap, err := loadSegmentedRestorePayloads(ctx, client, cfg, savedIDMap) + if err != nil { return errors.Trace(err) } - if err := loadTiFlashRecorderItemsIfNeeded(ctx, client, cfg); err != nil { + if err := buildIDMapWithPayloads(ctx, client, cfg, segmentedPayloads, savedIDMap); err != nil { return errors.Trace(err) } + if cfg.tiflashRecorder != nil && len(cfg.FullBackupStorage) == 0 { + items, loaded, err := loadSegmentedTiFlashItems(ctx, client, cfg, segmentedPayloads) + if err != nil { + return errors.Trace(err) + } + if !loaded { + log.Info("no tiflash items found for previous segment", zap.Uint64("start-ts", cfg.StartTS)) + } else { + cfg.tiflashRecorder.Load(items) + log.Info("loaded tiflash items for previous segment", + zap.Uint64("start-ts", cfg.StartTS), + zap.Int("item-count", len(items))) + } + } // build schema replace schemasReplace, err := buildSchemaReplace(client, cfg) if err != nil { return errors.Trace(err) } - if err := loadIngestRecorderItemsIfNeeded(ctx, client, cfg, schemasReplace.GetIngestRecorder()); err != nil { - return errors.Trace(err) + if recorder := schemasReplace.GetIngestRecorder(); recorder != nil && len(cfg.FullBackupStorage) == 0 { + items, loaded, err := loadSegmentedIngestItems(ctx, client, cfg, segmentedPayloads) + if err != nil { + return errors.Trace(err) + } + if !loaded { + log.Info("no ingest items found for previous segment", zap.Uint64("start-ts", cfg.StartTS)) + } else { + recorder.MergeItems(items) + log.Info("loaded ingest items for previous segment", + zap.Uint64("start-ts", cfg.StartTS), + zap.Int("table-count", len(items)), + zap.Int("index-count", ingestrec.CountItems(items))) + } } importModeSwitcher := restore.NewImportModeSwitcher(mgr.GetPDClient(), @@ -1816,20 +1843,12 @@ func restoreStream( return errors.Annotate(err, "failed to repair ingest index") } } else { - if len(ingestItemsForNextSeg) > 0 { - if err := client.SaveIngestRecorderItems(ctx, cfg.RestoreTS, ingestItemsForNextSeg, cfg.logCheckpointMetaManager); err != nil { - return errors.Annotate(err, "failed to persist ingest items for next segment") - } - } log.Info("skip repairing ingest index until last segment", zap.Uint64("restored-ts", cfg.RestoreTS)) } if cfg.tiflashRecorder != nil { if !cfg.LastRestore { - if err := client.SaveTiFlashRecorderItems(ctx, cfg.RestoreTS, cfg.tiflashRecorder.GetItems(), cfg.logCheckpointMetaManager); err != nil { - return errors.Annotate(err, "failed to persist tiflash items for next segment") - } log.Info("skip restoring TiFlash Replica until last segment", zap.Uint64("restored-ts", cfg.RestoreTS)) } else { @@ -1842,6 +1861,10 @@ func restoreStream( } } + if err := persistSegmentedRestorePayload(ctx, client, cfg, segmentedPayloads, ingestItemsForNextSeg); err != nil { + return errors.Annotate(err, "failed to persist segmented restore payload") + } + failpoint.Inject("do-checksum-with-rewrite-rules", func(_ failpoint.Value) { if err := client.FailpointDoChecksumForLogRestore(ctx, mgr.GetStorage().GetClient(), mgr.GetPDClient(), rewriteRules); err != nil { failpoint.Return(errors.Annotate(err, "failed to do checksum")) @@ -2253,17 
+2276,85 @@ func buildSchemaReplace(client *logclient.LogClient, cfg *LogRestoreConfig) (*st return schemasReplace, nil } -func buildAndSaveIDMapIfNeeded(ctx context.Context, client *logclient.LogClient, cfg *LogRestoreConfig) error { - // get the schemas ID replace information. - saved := isCurrentIdMapSaved(cfg.checkpointTaskInfo) +type segmentedRestorePayloads struct { + previous *backuppb.PitrIdMapPayload + current *backuppb.PitrIdMapPayload +} + +func loadSegmentedRestorePayloads( + ctx context.Context, + client *logclient.LogClient, + cfg *LogRestoreConfig, + savedIDMap bool, +) (*segmentedRestorePayloads, bool, error) { + payloads := &segmentedRestorePayloads{} + if savedIDMap { + payload, found, err := client.LoadPitrIdMapPayloadForSegment(ctx, cfg.RestoreTS) + if err != nil { + return nil, false, errors.Trace(err) + } + if found { + payloads.current = payload + } else { + log.Warn("checkpoint indicates id map saved but payload not found, rebuild it", + zap.Uint64("restore-ts", cfg.RestoreTS), + zap.Uint64("restore-id", cfg.RestoreID)) + savedIDMap = false + } + } + + if len(cfg.FullBackupStorage) == 0 { + payload, found, err := client.LoadPitrIdMapPayloadForSegment(ctx, cfg.StartTS) + if err != nil { + return nil, false, errors.Trace(err) + } + if found { + payloads.previous = payload + } + } + + return payloads, savedIDMap, nil +} + +func buildIDMapWithPayloads( + ctx context.Context, + client *logclient.LogClient, + cfg *LogRestoreConfig, + payloads *segmentedRestorePayloads, + savedIDMap bool, +) error { hasFullBackupStorage := len(cfg.FullBackupStorage) != 0 - err := client.GetBaseIDMapAndMerge(ctx, hasFullBackupStorage, saved, - cfg.logCheckpointMetaManager, cfg.tableMappingManager) - if err != nil { - return errors.Trace(err) + var ( + dbMaps []*backuppb.PitrDBMap + usePrevMap bool + ) + if payloads.current != nil && len(payloads.current.DbMaps) > 0 { + dbMaps = payloads.current.DbMaps + } + if len(dbMaps) == 0 && !hasFullBackupStorage && payloads.previous != nil { + dbMaps = payloads.previous.DbMaps + usePrevMap = true + } + if len(dbMaps) == 0 && !hasFullBackupStorage { + log.Error("no id maps found") + return errors.New("no base id map found from saved id or last restored PiTR") + } + if len(dbMaps) > 0 { + dbReplaces := stream.FromDBMapProto(dbMaps) + stream.LogDBReplaceMap("base db replace info", dbReplaces) + if len(dbReplaces) != 0 { + cfg.tableMappingManager.SetFromPiTRIDMap() + cfg.tableMappingManager.MergeBaseDBReplace(dbReplaces) + } + } + + if usePrevMap { + if err := client.ValidateNoTiFlashReplica(); err != nil { + return errors.Trace(err) + } } - if saved { + if savedIDMap { return nil } @@ -2275,66 +2366,102 @@ func buildAndSaveIDMapIfNeeded(ctx context.Context, client *logclient.LogClient, // reuse existing database ids if it exists in the current cluster cfg.tableMappingManager.ReuseExistingDatabaseIDs(client.GetDomain().InfoSchema()) // replace temp id with read global id - err = cfg.tableMappingManager.ReplaceTemporaryIDs(ctx, client.GenGlobalIDs) - if err != nil { - return errors.Trace(err) - } - if err = client.SaveIdMapWithFailPoints(ctx, cfg.tableMappingManager, cfg.logCheckpointMetaManager); err != nil { + if err := cfg.tableMappingManager.ReplaceTemporaryIDs(ctx, client.GenGlobalIDs); err != nil { return errors.Trace(err) } + payloads.current = logclient.NewPitrIdMapPayload(cfg.tableMappingManager.ToProto()) return nil } -func loadTiFlashRecorderItemsIfNeeded(ctx context.Context, client *logclient.LogClient, cfg *LogRestoreConfig) error { - if 
cfg.tiflashRecorder == nil { - return nil - } - if len(cfg.FullBackupStorage) != 0 { - return nil +func loadSegmentedTiFlashItems( + ctx context.Context, + client *logclient.LogClient, + cfg *LogRestoreConfig, + payloads *segmentedRestorePayloads, +) (map[int64]model.TiFlashReplicaInfo, bool, error) { + if payloads != nil && payloads.previous != nil { + items, err := logclient.PitrTiFlashItemsFromPayload(payloads.previous) + if err != nil { + return nil, false, errors.Trace(err) + } + if items == nil { + items = map[int64]model.TiFlashReplicaInfo{} + } + return items, true, nil } - items, err := client.LoadTiFlashRecorderItems(ctx, cfg.StartTS, cfg.logCheckpointMetaManager) + if cfg.logCheckpointMetaManager == nil { + return nil, false, nil + } + clusterID := client.GetClusterID(ctx) + items, found, err := cfg.logCheckpointMetaManager.LoadPITRTiFlashItems(ctx, clusterID, cfg.StartTS) if err != nil { - return errors.Trace(err) + return nil, false, errors.Trace(err) + } + if !found { + return nil, false, nil } if items == nil { - log.Info("no tiflash items found for previous segment", zap.Uint64("start-ts", cfg.StartTS)) - return nil + items = map[int64]model.TiFlashReplicaInfo{} } - cfg.tiflashRecorder.Load(items) - log.Info("loaded tiflash items for previous segment", - zap.Uint64("start-ts", cfg.StartTS), - zap.Int("item-count", len(items))) - return nil + return items, true, nil } -func loadIngestRecorderItemsIfNeeded( +func loadSegmentedIngestItems( ctx context.Context, client *logclient.LogClient, cfg *LogRestoreConfig, - recorder *ingestrec.IngestRecorder, -) error { - if recorder == nil { - return nil - } - if len(cfg.FullBackupStorage) != 0 { - return nil + payloads *segmentedRestorePayloads, +) (map[int64]map[int64]bool, bool, error) { + if payloads != nil && payloads.previous != nil { + items, err := logclient.PitrIngestItemsFromPayload(payloads.previous) + if err != nil { + return nil, false, errors.Trace(err) + } + if items == nil { + items = map[int64]map[int64]bool{} + } + return items, true, nil } - items, err := client.LoadIngestRecorderItems(ctx, cfg.StartTS, cfg.logCheckpointMetaManager) + if cfg.logCheckpointMetaManager == nil { + return nil, false, nil + } + clusterID := client.GetClusterID(ctx) + items, found, err := cfg.logCheckpointMetaManager.LoadPITRIngestItems(ctx, clusterID, cfg.StartTS) if err != nil { - return errors.Trace(err) + return nil, false, errors.Trace(err) + } + if !found { + return nil, false, nil } if items == nil { - log.Info("no ingest items found for previous segment", zap.Uint64("start-ts", cfg.StartTS)) - return nil + items = map[int64]map[int64]bool{} } - recorder.MergeItems(items) - log.Info("loaded ingest items for previous segment", - zap.Uint64("start-ts", cfg.StartTS), - zap.Int("table-count", len(items)), - zap.Int("index-count", ingestrec.CountItems(items))) - return nil + return items, true, nil +} + +func persistSegmentedRestorePayload( + ctx context.Context, + client *logclient.LogClient, + cfg *LogRestoreConfig, + payloads *segmentedRestorePayloads, + ingestItemsForNextSeg map[int64]map[int64]bool, +) error { + if payloads == nil || payloads.current == nil { + payloads = &segmentedRestorePayloads{ + current: logclient.NewPitrIdMapPayload(cfg.tableMappingManager.ToProto()), + } + } + if !cfg.LastRestore { + if ingestItemsForNextSeg != nil { + payloads.current.IngestItems = logclient.PitrIngestItemsToProto(ingestItemsForNextSeg) + } + if cfg.tiflashRecorder != nil { + payloads.current.TiflashItems = 
logclient.PitrTiFlashItemsToProto(cfg.tiflashRecorder.GetItems()) + } + } + return errors.Trace(client.SavePitrIdMapPayloadWithFailPoints(ctx, cfg.RestoreTS, payloads.current, cfg.logCheckpointMetaManager)) } func getCurrentTSFromCheckpointOrPD(ctx context.Context, mgr *conn.Mgr, cfg *LogRestoreConfig) (uint64, error) { diff --git a/br/tests/br_restore_checkpoint/run.sh b/br/tests/br_restore_checkpoint/run.sh index 6c378db0e6ca7..4310a9b4a7e8a 100644 --- a/br/tests/br_restore_checkpoint/run.sh +++ b/br/tests/br_restore_checkpoint/run.sh @@ -85,7 +85,7 @@ run_sql "select count(*) from \`$latest_db\`.\`cpt_data\`;" check_contains "count(*): 1" # check the log restore save id map into the table mysql.tidb_pitr_id_map -run_sql 'select count(*) from mysql.tidb_pitr_id_map;' +run_sql "select count(*) from mysql.tidb_pitr_id_map;" check_contains "count(*): 1" # PITR with checkpoint but failed in the log restore datakv stage @@ -129,7 +129,7 @@ check_result() { check_result # check mysql.tidb_pitr_id_map has data -count=$(run_sql 'select count(*) from mysql.tidb_pitr_id_map;' | awk '/count/{print $2}') +count=$(run_sql "select count(*) from mysql.tidb_pitr_id_map;" | awk '/count/{print $2}') if [ $count -eq 0 ]; then echo "the number of pitr id map is $count" exit 1 @@ -170,14 +170,17 @@ if [ $restore_fail -ne 1 ]; then exit 1 fi -# check the pitr id map is saved in the checkpoint storage +# check the pitr id map is saved in the checkpoint storage and system table count=$(ls $TEST_DIR/$PREFIX/log/pitr_id_maps | wc -l) if [ $count -ne 0 ]; then echo "the number of pitr id map is $count instead of 0" exit 1 fi -run_sql 'select count(*) from mysql.tidb_pitr_id_map;' -check_contains "count(*): 0" +count=$(run_sql "select count(*) from mysql.tidb_pitr_id_map;" | awk '/count/{print $2}') +if [ $count -eq 0 ]; then + echo "the number of pitr id map is $count" + exit 1 +fi count=$(ls $TEST_DIR/$PREFIX/checkpoints/pitr_id_maps | wc -l) if [ $count -ne 1 ]; then echo "the number of pitr id map is $count instead of 1" diff --git a/go.mod b/go.mod index b677ac577614e..5091bfc1db717 100644 --- a/go.mod +++ b/go.mod @@ -369,6 +369,7 @@ replace ( // Downgrade grpc to v1.63.2, as well as other related modules. github.com/apache/arrow-go/v18 => github.com/joechenrh/arrow-go/v18 v18.0.0-20250911101656-62c34c9a3b82 github.com/go-ldap/ldap/v3 => github.com/YangKeao/ldap/v3 v3.4.5-0.20230421065457-369a3bab1117 + github.com/pingcap/kvproto => /root/workspace/kvproto/worktree/pitr_id_map_payload github.com/pingcap/tidb/pkg/parser => ./pkg/parser // TODO: `sourcegraph.com/sourcegraph/appdash` has been archived, and the original host has been removed. 
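Note on the go.mod hunk above: the replace directive points at a local kvproto worktree, which is assumed to generate the new PitrIdMapPayload message this series relies on (the diffs reference its DbMaps, IngestItems, and TiflashItems fields and call proto.Marshal / Unmarshal / GetDbMaps on it). As a minimal sketch under that assumption — not the published kvproto API — the marshal/unmarshal round trip that carries the id map payload from one restore segment to the next looks roughly like this:

    // Sketch only: assumes the local kvproto worktree generates a gogo/protobuf
    // message with the fields referenced in this series; everything else here
    // (function name, printed message) is illustrative.
    package main

    import (
            "fmt"

            "github.com/gogo/protobuf/proto"
            backuppb "github.com/pingcap/kvproto/pkg/brpb"
    )

    func roundTripPayload(dbMaps []*backuppb.PitrDBMap) error {
            payload := &backuppb.PitrIdMapPayload{DbMaps: dbMaps}
            data, err := proto.Marshal(payload) // persisted in segment_id-sized chunks
            if err != nil {
                    return err
            }
            restored := &backuppb.PitrIdMapPayload{}
            if err := restored.Unmarshal(data); err != nil { // reloaded as the base map of the next segment
                    return err
            }
            fmt.Println("db maps carried to next segment:", len(restored.GetDbMaps()))
            return nil
    }

In the series itself, this is the payload that persistSegmentedRestorePayload hands to SavePitrIdMapPayloadWithFailPoints, and that the next segment reloads to recover the base db maps plus the ingest and TiFlash recorder items.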
diff --git a/pkg/testkit/brhelper/workload/runner.go b/pkg/testkit/brhelper/workload/runner.go index 2e8ae8465f1ec..22984abac7386 100644 --- a/pkg/testkit/brhelper/workload/runner.go +++ b/pkg/testkit/brhelper/workload/runner.go @@ -270,8 +270,6 @@ func (r *Runner) runParallelTick( for _, spec := range selected { wg.Go( func() { - defer wg.Done() - mu.Lock() state, ok := states[spec.Name] mu.Unlock() From 7eca38f809ce72c98d4a242221be759be2c6925a Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Mon, 2 Feb 2026 11:30:28 +0000 Subject: [PATCH 14/18] remove unused files Signed-off-by: Juncen Yu --- br/pkg/restore/log_client/ingest_items.go | 128 --------------------- br/pkg/restore/log_client/tiflash_items.go | 124 -------------------- 2 files changed, 252 deletions(-) delete mode 100644 br/pkg/restore/log_client/ingest_items.go delete mode 100644 br/pkg/restore/log_client/tiflash_items.go diff --git a/br/pkg/restore/log_client/ingest_items.go b/br/pkg/restore/log_client/ingest_items.go deleted file mode 100644 index 1cf7e6507416e..0000000000000 --- a/br/pkg/restore/log_client/ingest_items.go +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2026 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package logclient - -import ( - "context" - - "github.com/pingcap/errors" - "github.com/pingcap/log" - "github.com/pingcap/tidb/br/pkg/checkpoint" - "github.com/pingcap/tidb/br/pkg/restore/ingestrec" - "go.uber.org/zap" -) - -// LoadIngestRecorderItems loads persisted ingest recorder items for a segment. 
-func (rc *LogClient) LoadIngestRecorderItems( - ctx context.Context, - restoredTS uint64, - logCheckpointMetaManager checkpoint.SegmentedRestoreStorage, -) (map[int64]map[int64]bool, error) { - tableExists := rc.pitrIDMapTableExists() - if tableExists { - payload, found, err := rc.loadPitrIdMapPayloadForSegment(ctx, restoredTS) - if err != nil { - return nil, errors.Trace(err) - } - if found { - items, err := PitrIngestItemsFromPayload(payload) - if err != nil { - return nil, errors.Trace(err) - } - if items == nil { - items = map[int64]map[int64]bool{} - } - log.Info("loaded pitr ingest items", - zap.Uint64("restored-ts", restoredTS), - zap.Int("table-count", len(items)), - zap.Int("index-count", ingestrec.CountItems(items))) - return items, nil - } - } - - if logCheckpointMetaManager == nil { - if tableExists { - return nil, nil - } - return nil, errors.New("checkpoint meta manager is not initialized") - } - if tableExists { - log.Info("pitr ingest items not found in mysql.tidb_pitr_id_map, fallback to checkpoint storage", - zap.Uint64("restored-ts", restoredTS)) - } - clusterID := rc.GetClusterID(ctx) - items, found, err := logCheckpointMetaManager.LoadPITRIngestItems(ctx, clusterID, restoredTS) - if err != nil { - return nil, errors.Trace(err) - } - if !found { - return nil, nil - } - if items == nil { - items = map[int64]map[int64]bool{} - } - log.Info("loaded pitr ingest items", - zap.Uint64("restored-ts", restoredTS), - zap.Int("table-count", len(items)), - zap.Int("index-count", ingestrec.CountItems(items))) - return items, nil -} - -// SaveIngestRecorderItems persists ingest recorder items for the next segment. -func (rc *LogClient) SaveIngestRecorderItems( - ctx context.Context, - restoredTS uint64, - items map[int64]map[int64]bool, - logCheckpointMetaManager checkpoint.SegmentedRestoreStorage, -) error { - if rc.pitrIDMapTableExists() { - payload, found, err := rc.loadPitrIdMapPayloadForSegment(ctx, restoredTS) - if err != nil { - return errors.Trace(err) - } - if !found { - if logManager, ok := logCheckpointMetaManager.(checkpoint.LogMetaManagerT); ok { - dbMaps, err := rc.loadSchemasMap(ctx, restoredTS, logManager) - if err != nil { - return errors.Trace(err) - } - if len(dbMaps) > 0 { - payload = newPitrIdMapPayload(dbMaps) - } - } - } - if payload == nil { - log.Warn("pitr id map payload not found when saving ingest items", - zap.Uint64("restored-ts", restoredTS)) - return errors.New("pitr id map payload not found for ingest items") - } - payload.IngestItems = PitrIngestItemsToProto(items) - log.Info("saving pitr ingest items", - zap.Uint64("restored-ts", restoredTS), - zap.Int("table-count", len(items)), - zap.Int("index-count", ingestrec.CountItems(items))) - return errors.Trace(rc.savePitrIdMapPayloadToTable(ctx, restoredTS, payload)) - } - - if logCheckpointMetaManager == nil { - return errors.New("checkpoint meta manager is not initialized") - } - clusterID := rc.GetClusterID(ctx) - log.Info("saving pitr ingest items to checkpoint storage", - zap.Uint64("restored-ts", restoredTS), - zap.Int("table-count", len(items)), - zap.Int("index-count", ingestrec.CountItems(items))) - return errors.Trace(logCheckpointMetaManager.SavePITRIngestItems(ctx, clusterID, restoredTS, items)) -} diff --git a/br/pkg/restore/log_client/tiflash_items.go b/br/pkg/restore/log_client/tiflash_items.go deleted file mode 100644 index 5e790e7144a72..0000000000000 --- a/br/pkg/restore/log_client/tiflash_items.go +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2025 PingCAP, Inc. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package logclient - -import ( - "context" - - "github.com/pingcap/errors" - "github.com/pingcap/log" - "github.com/pingcap/tidb/br/pkg/checkpoint" - "github.com/pingcap/tidb/pkg/meta/model" - "go.uber.org/zap" -) - -// LoadTiFlashRecorderItems loads persisted TiFlash recorder items for a segment. -func (rc *LogClient) LoadTiFlashRecorderItems( - ctx context.Context, - restoredTS uint64, - logCheckpointMetaManager checkpoint.SegmentedRestoreStorage, -) (map[int64]model.TiFlashReplicaInfo, error) { - tableExists := rc.pitrIDMapTableExists() - if tableExists { - payload, found, err := rc.loadPitrIdMapPayloadForSegment(ctx, restoredTS) - if err != nil { - return nil, errors.Trace(err) - } - if found { - items, err := PitrTiFlashItemsFromPayload(payload) - if err != nil { - return nil, errors.Trace(err) - } - if items == nil { - items = map[int64]model.TiFlashReplicaInfo{} - } - log.Info("loaded pitr tiflash items", - zap.Uint64("restored-ts", restoredTS), - zap.Int("item-count", len(items))) - return items, nil - } - } - - if logCheckpointMetaManager == nil { - if tableExists { - return nil, nil - } - return nil, errors.New("checkpoint meta manager is not initialized") - } - if tableExists { - log.Info("pitr tiflash items not found in mysql.tidb_pitr_id_map, fallback to checkpoint storage", - zap.Uint64("restored-ts", restoredTS)) - } - clusterID := rc.GetClusterID(ctx) - items, found, err := logCheckpointMetaManager.LoadPITRTiFlashItems(ctx, clusterID, restoredTS) - if err != nil { - return nil, errors.Trace(err) - } - if !found { - return nil, nil - } - if items == nil { - items = map[int64]model.TiFlashReplicaInfo{} - } - log.Info("loaded pitr tiflash items", - zap.Uint64("restored-ts", restoredTS), - zap.Int("item-count", len(items))) - return items, nil -} - -// SaveTiFlashRecorderItems persists TiFlash recorder items for the next segment. 
-func (rc *LogClient) SaveTiFlashRecorderItems( - ctx context.Context, - restoredTS uint64, - items map[int64]model.TiFlashReplicaInfo, - logCheckpointMetaManager checkpoint.SegmentedRestoreStorage, -) error { - if rc.pitrIDMapTableExists() { - payload, found, err := rc.loadPitrIdMapPayloadForSegment(ctx, restoredTS) - if err != nil { - return errors.Trace(err) - } - if !found { - if logManager, ok := logCheckpointMetaManager.(checkpoint.LogMetaManagerT); ok { - dbMaps, err := rc.loadSchemasMap(ctx, restoredTS, logManager) - if err != nil { - return errors.Trace(err) - } - if len(dbMaps) > 0 { - payload = newPitrIdMapPayload(dbMaps) - } - } - } - if payload == nil { - log.Warn("pitr id map payload not found when saving tiflash items", - zap.Uint64("restored-ts", restoredTS)) - return errors.New("pitr id map payload not found for tiflash items") - } - payload.TiflashItems = PitrTiFlashItemsToProto(items) - log.Info("saving pitr tiflash items", - zap.Uint64("restored-ts", restoredTS), - zap.Int("item-count", len(items))) - return errors.Trace(rc.savePitrIdMapPayloadToTable(ctx, restoredTS, payload)) - } - - if logCheckpointMetaManager == nil { - return errors.New("checkpoint meta manager is not initialized") - } - clusterID := rc.GetClusterID(ctx) - log.Info("saving pitr tiflash items to checkpoint storage", - zap.Uint64("restored-ts", restoredTS), - zap.Int("item-count", len(items))) - return errors.Trace(logCheckpointMetaManager.SavePITRTiFlashItems(ctx, clusterID, restoredTS, items)) -} From cecf40b051d3b06c3de5f7c9ed9a35a957cb9170 Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Tue, 3 Feb 2026 07:35:35 +0000 Subject: [PATCH 15/18] tidy up codes Signed-off-by: Juncen Yu --- br/pkg/checkpoint/manager.go | 9 - br/pkg/checkpoint/pitr_items.go | 219 ------------------ br/pkg/checkpoint/pitr_items_manager.go | 172 -------------- br/pkg/checkpoint/storage.go | 1 + br/pkg/restore/log_client/client.go | 2 +- br/pkg/restore/log_client/id_map.go | 4 +- .../restore/log_client/pitr_id_map_payload.go | 63 +---- br/pkg/task/operator/checksum_table.go | 10 +- br/pkg/task/restore.go | 4 + br/pkg/task/stream.go | 112 +++------ 10 files changed, 45 insertions(+), 551 deletions(-) delete mode 100644 br/pkg/checkpoint/pitr_items.go delete mode 100644 br/pkg/checkpoint/pitr_items_manager.go diff --git a/br/pkg/checkpoint/manager.go b/br/pkg/checkpoint/manager.go index f52774f79ea45..3f0e40b2f80e2 100644 --- a/br/pkg/checkpoint/manager.go +++ b/br/pkg/checkpoint/manager.go @@ -23,7 +23,6 @@ import ( backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/tidb/br/pkg/glue" "github.com/pingcap/tidb/pkg/domain" - "github.com/pingcap/tidb/pkg/meta/model" "github.com/pingcap/tidb/pkg/objstore/storeapi" "github.com/pingcap/tidb/pkg/parser/ast" ) @@ -70,7 +69,6 @@ type MetaManager[K KeyType, SV, LV ValueType, M any] interface { type LogMetaManager[K KeyType, SV, LV ValueType, M any] interface { MetaManager[K, SV, LV, M] - SegmentedRestoreStorage LoadCheckpointProgress(context.Context) (*CheckpointProgress, error) SaveCheckpointProgress(context.Context, *CheckpointProgress) error @@ -83,13 +81,6 @@ type LogMetaManager[K KeyType, SV, LV ValueType, M any] interface { TryGetStorage() storeapi.Storage } -type SegmentedRestoreStorage interface { - LoadPITRIngestItems(context.Context, uint64, uint64) (map[int64]map[int64]bool, bool, error) - SavePITRIngestItems(context.Context, uint64, uint64, map[int64]map[int64]bool) error - LoadPITRTiFlashItems(context.Context, uint64, uint64) 
(map[int64]model.TiFlashReplicaInfo, bool, error) - SavePITRTiFlashItems(context.Context, uint64, uint64, map[int64]model.TiFlashReplicaInfo) error -} - type TableMetaManager[K KeyType, SV, LV ValueType, M any] struct { se glue.Session runnerSe glue.Session diff --git a/br/pkg/checkpoint/pitr_items.go b/br/pkg/checkpoint/pitr_items.go deleted file mode 100644 index ef8b88ca1791b..0000000000000 --- a/br/pkg/checkpoint/pitr_items.go +++ /dev/null @@ -1,219 +0,0 @@ -// Copyright 2026 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package checkpoint - -import ( - "context" - "encoding/json" - "fmt" - - "github.com/pingcap/errors" - "github.com/pingcap/tidb/br/pkg/glue" - "github.com/pingcap/tidb/pkg/domain" - "github.com/pingcap/tidb/pkg/kv" - "github.com/pingcap/tidb/pkg/meta/model" - "github.com/pingcap/tidb/pkg/objstore/storeapi" - "github.com/pingcap/tidb/pkg/parser/ast" - "github.com/pingcap/tidb/pkg/util/sqlexec" -) - -const ( - LogRestorePITRItemsDatabaseName = "__TiDB_BR_Temporary_Log_Restore_PiTR_Items" - - pitrIngestItemsTableName = "pitr_ingest_items" - pitrTiFlashItemsTableName = "pitr_tiflash_items" - - pitrIngestItemsDir = "pitr_ingest_items" - pitrTiFlashItemsDir = "pitr_tiflash_items" - - createPITRItemsTable = ` - CREATE TABLE IF NOT EXISTS %n.%n ( - cluster_id BIGINT NOT NULL, - restored_ts BIGINT NOT NULL, - segment_id BIGINT NOT NULL, - data BLOB(524288) NOT NULL, - update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY(cluster_id, restored_ts, segment_id));` - - insertPITRItemsSQLTemplate = ` - REPLACE INTO %n.%n (cluster_id, restored_ts, segment_id, data) VALUES (%?, %?, %?, %?);` - - selectPITRItemsSQLTemplate = ` - SELECT segment_id, data FROM %n.%n WHERE cluster_id = %? AND restored_ts = %? ORDER BY segment_id;` - - deletePITRItemsSQLTemplate = ` - DELETE FROM %n.%n WHERE cluster_id = %? 
AND restored_ts = %?;` -) - -type pitrIngestItemsPayload struct { - Items map[int64]map[int64]bool `json:"items"` -} - -type pitrTiFlashItemsPayload struct { - Items map[int64]model.TiFlashReplicaInfo `json:"items"` -} - -func marshalPITRIngestItems(items map[int64]map[int64]bool) ([]byte, error) { - if items == nil { - items = map[int64]map[int64]bool{} - } - return json.Marshal(&pitrIngestItemsPayload{Items: items}) -} - -func unmarshalPITRIngestItems(data []byte) (map[int64]map[int64]bool, error) { - var payload pitrIngestItemsPayload - if err := json.Unmarshal(data, &payload); err != nil { - return nil, errors.Trace(err) - } - if payload.Items == nil { - payload.Items = map[int64]map[int64]bool{} - } - return payload.Items, nil -} - -func marshalPITRTiFlashItems(items map[int64]model.TiFlashReplicaInfo) ([]byte, error) { - if items == nil { - items = map[int64]model.TiFlashReplicaInfo{} - } - return json.Marshal(&pitrTiFlashItemsPayload{Items: items}) -} - -func unmarshalPITRTiFlashItems(data []byte) (map[int64]model.TiFlashReplicaInfo, error) { - var payload pitrTiFlashItemsPayload - if err := json.Unmarshal(data, &payload); err != nil { - return nil, errors.Trace(err) - } - if payload.Items == nil { - payload.Items = map[int64]model.TiFlashReplicaInfo{} - } - return payload.Items, nil -} - -func pitrItemsFilename(dir, name string, clusterID, restoredTS uint64) string { - return fmt.Sprintf("%s/%s.cluster_id:%d.restored_ts:%d", dir, name, clusterID, restoredTS) -} - -func pitrIngestItemsPath(clusterID, restoredTS uint64) string { - return pitrItemsFilename(pitrIngestItemsDir, pitrIngestItemsDir, clusterID, restoredTS) -} - -func pitrTiFlashItemsPath(clusterID, restoredTS uint64) string { - return pitrItemsFilename(pitrTiFlashItemsDir, pitrTiFlashItemsDir, clusterID, restoredTS) -} - -func loadPITRItemsFromStorage( - ctx context.Context, - storage storeapi.Storage, - path string, - itemName string, -) ([]byte, bool, error) { - exists, err := storage.FileExists(ctx, path) - if err != nil { - return nil, false, errors.Annotatef(err, "failed to check %s file %s", itemName, path) - } - if !exists { - return nil, false, nil - } - raw, err := storage.ReadFile(ctx, path) - if err != nil { - return nil, false, errors.Annotatef(err, "failed to read %s file %s", itemName, path) - } - return raw, true, nil -} - -func savePITRItemsToStorage( - ctx context.Context, - storage storeapi.Storage, - path string, - itemName string, - data []byte, -) error { - if err := storage.WriteFile(ctx, path, data); err != nil { - return errors.Annotatef(err, "failed to save %s file %s", itemName, path) - } - return nil -} - -func initPITRItemsTable(ctx context.Context, se glue.Session, dbName string, tableNames []string) error { - if err := se.ExecuteInternal(ctx, "CREATE DATABASE IF NOT EXISTS %n;", dbName); err != nil { - return errors.Trace(err) - } - for _, tableName := range tableNames { - if err := se.ExecuteInternal(ctx, createPITRItemsTable, dbName, tableName); err != nil { - return errors.Trace(err) - } - } - return nil -} - -func pitrItemsTableExists(dom *domain.Domain, tableName string) bool { - if dom == nil { - return false - } - return dom.InfoSchema(). 
- TableExists(ast.NewCIStr(LogRestorePITRItemsDatabaseName), ast.NewCIStr(tableName)) -} - -func loadPITRItemsFromTable( - ctx context.Context, - execCtx sqlexec.RestrictedSQLExecutor, - dbName string, - tableName string, - clusterID uint64, - restoredTS uint64, -) ([]byte, bool, error) { - rows, _, errSQL := execCtx.ExecRestrictedSQL( - kv.WithInternalSourceType(ctx, kv.InternalTxnBR), - nil, - selectPITRItemsSQLTemplate, - dbName, tableName, clusterID, restoredTS, - ) - if errSQL != nil { - return nil, false, errors.Annotatef(errSQL, "failed to get pitr items from table %s.%s", dbName, tableName) - } - if len(rows) == 0 { - return nil, false, nil - } - data := make([]byte, 0, len(rows)*CheckpointIdMapBlockSize) - for i, row := range rows { - segmentID, chunk := row.GetUint64(0), row.GetBytes(1) - if uint64(i) != segmentID { - return nil, false, errors.Errorf( - "pitr items table %s.%s is incomplete at segment %d", dbName, tableName, segmentID) - } - data = append(data, chunk...) - } - return data, true, nil -} - -func savePITRItemsToTable( - ctx context.Context, - se glue.Session, - dbName string, - tableName string, - clusterID uint64, - restoredTS uint64, - data []byte, -) error { - if err := initPITRItemsTable(ctx, se, dbName, []string{tableName}); err != nil { - return errors.Trace(err) - } - if err := se.ExecuteInternal(ctx, deletePITRItemsSQLTemplate, dbName, tableName, clusterID, restoredTS); err != nil { - return errors.Trace(err) - } - return errors.Trace(chunkInsertCheckpointData(data, func(segmentID uint64, chunk []byte) error { - return errors.Trace(se.ExecuteInternal(ctx, insertPITRItemsSQLTemplate, dbName, tableName, clusterID, restoredTS, segmentID, chunk)) - })) -} diff --git a/br/pkg/checkpoint/pitr_items_manager.go b/br/pkg/checkpoint/pitr_items_manager.go deleted file mode 100644 index 311e08c2733bd..0000000000000 --- a/br/pkg/checkpoint/pitr_items_manager.go +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright 2026 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package checkpoint - -import ( - "context" - - "github.com/pingcap/errors" - "github.com/pingcap/tidb/pkg/meta/model" -) - -func (manager *TableMetaManager[K, SV, LV, M]) LoadPITRIngestItems( - ctx context.Context, - clusterID uint64, - restoredTS uint64, -) (map[int64]map[int64]bool, bool, error) { - if !pitrItemsTableExists(manager.dom, pitrIngestItemsTableName) { - return nil, false, nil - } - if manager.se == nil { - return nil, false, errors.New("checkpoint session is not initialized") - } - execCtx := manager.se.GetSessionCtx().GetRestrictedSQLExecutor() - data, found, err := loadPITRItemsFromTable(ctx, execCtx, LogRestorePITRItemsDatabaseName, pitrIngestItemsTableName, clusterID, restoredTS) - if err != nil { - return nil, false, errors.Trace(err) - } - if !found { - return nil, false, nil - } - items, err := unmarshalPITRIngestItems(data) - if err != nil { - return nil, false, errors.Trace(err) - } - return items, true, nil -} - -func (manager *TableMetaManager[K, SV, LV, M]) SavePITRIngestItems( - ctx context.Context, - clusterID uint64, - restoredTS uint64, - items map[int64]map[int64]bool, -) error { - if manager.se == nil { - return errors.New("checkpoint session is not initialized") - } - data, err := marshalPITRIngestItems(items) - if err != nil { - return errors.Trace(err) - } - return errors.Trace(savePITRItemsToTable(ctx, manager.se, LogRestorePITRItemsDatabaseName, pitrIngestItemsTableName, clusterID, restoredTS, data)) -} - -func (manager *TableMetaManager[K, SV, LV, M]) LoadPITRTiFlashItems( - ctx context.Context, - clusterID uint64, - restoredTS uint64, -) (map[int64]model.TiFlashReplicaInfo, bool, error) { - if !pitrItemsTableExists(manager.dom, pitrTiFlashItemsTableName) { - return nil, false, nil - } - if manager.se == nil { - return nil, false, errors.New("checkpoint session is not initialized") - } - execCtx := manager.se.GetSessionCtx().GetRestrictedSQLExecutor() - data, found, err := loadPITRItemsFromTable(ctx, execCtx, LogRestorePITRItemsDatabaseName, pitrTiFlashItemsTableName, clusterID, restoredTS) - if err != nil { - return nil, false, errors.Trace(err) - } - if !found { - return nil, false, nil - } - items, err := unmarshalPITRTiFlashItems(data) - if err != nil { - return nil, false, errors.Trace(err) - } - return items, true, nil -} - -func (manager *TableMetaManager[K, SV, LV, M]) SavePITRTiFlashItems( - ctx context.Context, - clusterID uint64, - restoredTS uint64, - items map[int64]model.TiFlashReplicaInfo, -) error { - if manager.se == nil { - return errors.New("checkpoint session is not initialized") - } - data, err := marshalPITRTiFlashItems(items) - if err != nil { - return errors.Trace(err) - } - return errors.Trace(savePITRItemsToTable(ctx, manager.se, LogRestorePITRItemsDatabaseName, pitrTiFlashItemsTableName, clusterID, restoredTS, data)) -} - -func (manager *StorageMetaManager[K, SV, LV, M]) LoadPITRIngestItems( - ctx context.Context, - clusterID uint64, - restoredTS uint64, -) (map[int64]map[int64]bool, bool, error) { - path := pitrIngestItemsPath(clusterID, restoredTS) - data, found, err := loadPITRItemsFromStorage(ctx, manager.storage, path, "ingest items") - if err != nil { - return nil, false, errors.Trace(err) - } - if !found { - return nil, false, nil - } - items, err := unmarshalPITRIngestItems(data) - if err != nil { - return nil, false, errors.Trace(err) - } - return items, true, nil -} - -func (manager *StorageMetaManager[K, SV, LV, M]) SavePITRIngestItems( - ctx context.Context, - clusterID uint64, - restoredTS uint64, - items 
map[int64]map[int64]bool, -) error { - data, err := marshalPITRIngestItems(items) - if err != nil { - return errors.Trace(err) - } - return errors.Trace(savePITRItemsToStorage(ctx, manager.storage, pitrIngestItemsPath(clusterID, restoredTS), "ingest items", data)) -} - -func (manager *StorageMetaManager[K, SV, LV, M]) LoadPITRTiFlashItems( - ctx context.Context, - clusterID uint64, - restoredTS uint64, -) (map[int64]model.TiFlashReplicaInfo, bool, error) { - path := pitrTiFlashItemsPath(clusterID, restoredTS) - data, found, err := loadPITRItemsFromStorage(ctx, manager.storage, path, "tiflash items") - if err != nil { - return nil, false, errors.Trace(err) - } - if !found { - return nil, false, nil - } - items, err := unmarshalPITRTiFlashItems(data) - if err != nil { - return nil, false, errors.Trace(err) - } - return items, true, nil -} - -func (manager *StorageMetaManager[K, SV, LV, M]) SavePITRTiFlashItems( - ctx context.Context, - clusterID uint64, - restoredTS uint64, - items map[int64]model.TiFlashReplicaInfo, -) error { - data, err := marshalPITRTiFlashItems(items) - if err != nil { - return errors.Trace(err) - } - return errors.Trace(savePITRItemsToStorage(ctx, manager.storage, pitrTiFlashItemsPath(clusterID, restoredTS), "tiflash items", data)) -} diff --git a/br/pkg/checkpoint/storage.go b/br/pkg/checkpoint/storage.go index bb00eb5887826..a0daf51f5d1f1 100644 --- a/br/pkg/checkpoint/storage.go +++ b/br/pkg/checkpoint/storage.go @@ -49,6 +49,7 @@ type checkpointStorage interface { // 3. BR regards the checkpoint table as a directory which is managed by metadata table. const ( LogRestoreCheckpointDatabaseName string = "__TiDB_BR_Temporary_Log_Restore_Checkpoint" + LogRestorePITRItemsDatabaseName string = "__TiDB_BR_Temporary_Log_Restore_PiTR_Items" SnapshotRestoreCheckpointDatabaseName string = "__TiDB_BR_Temporary_Snapshot_Restore_Checkpoint" CustomSSTRestoreCheckpointDatabaseName string = "__TiDB_BR_Temporary_Custom_SST_Restore_Checkpoint" diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index 293da264273dc..9b61e81bc6c09 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -1995,7 +1995,7 @@ func (rc *LogClient) SaveIdMapWithFailPoints( manager *stream.TableMappingManager, logCheckpointMetaManager checkpoint.LogMetaManagerT, ) error { - payload := newPitrIdMapPayload(manager.ToProto()) + payload := &backuppb.PitrIdMapPayload{DbMaps: manager.ToProto()} return rc.SavePitrIdMapPayloadWithFailPoints(ctx, rc.restoreTS, payload, logCheckpointMetaManager) } diff --git a/br/pkg/restore/log_client/id_map.go b/br/pkg/restore/log_client/id_map.go index 0d49129527177..80e89b45360d9 100644 --- a/br/pkg/restore/log_client/id_map.go +++ b/br/pkg/restore/log_client/id_map.go @@ -61,7 +61,7 @@ func (rc *LogClient) saveIDMap( manager *stream.TableMappingManager, logCheckpointMetaManager checkpoint.LogMetaManagerT, ) error { - payload := newPitrIdMapPayload(manager.ToProto()) + payload := &backuppb.PitrIdMapPayload{DbMaps: manager.ToProto()} return rc.savePitrIdMapPayload(ctx, rc.restoreTS, payload, logCheckpointMetaManager) } @@ -81,7 +81,7 @@ func (rc *LogClient) saveIDMap2Storage( } func (rc *LogClient) saveIDMap2Table(ctx context.Context, dbMaps []*backuppb.PitrDBMap) error { - payload := newPitrIdMapPayload(dbMaps) + payload := &backuppb.PitrIdMapPayload{DbMaps: dbMaps} if existing, found, err := rc.loadPitrIdMapPayloadFromTable(ctx, rc.restoreTS, rc.restoreID); err != nil { return errors.Trace(err) } else if found 
{ diff --git a/br/pkg/restore/log_client/pitr_id_map_payload.go b/br/pkg/restore/log_client/pitr_id_map_payload.go index 2e7465fc31273..ac385ad0a343a 100644 --- a/br/pkg/restore/log_client/pitr_id_map_payload.go +++ b/br/pkg/restore/log_client/pitr_id_map_payload.go @@ -25,17 +25,6 @@ import ( "github.com/pingcap/tidb/pkg/meta/model" ) -const pitrIdMapPayloadVersion int32 = 1 - -// DecodePitrIdMapPayload parses the payload and returns db maps for callers outside logclient. -func DecodePitrIdMapPayload(metaData []byte) ([]*backuppb.PitrDBMap, error) { - payload, err := decodePitrIdMapPayload(metaData) - if err != nil { - return nil, errors.Trace(err) - } - return payload.GetDbMaps(), nil -} - func PitrIngestItemsFromProto(items []*backuppb.PitrIngestItem) map[int64]map[int64]bool { if len(items) == 0 { return map[int64]map[int64]bool{} @@ -117,39 +106,6 @@ func PitrTiFlashItemsToProto(items map[int64]model.TiFlashReplicaInfo) []*backup return result } -func PitrIngestItemsFromPayload(payload *backuppb.PitrIdMapPayload) (map[int64]map[int64]bool, error) { - if payload == nil { - return nil, errors.New("pitr id map payload is nil") - } - return PitrIngestItemsFromProto(payload.IngestItems), nil -} - -func PitrTiFlashItemsFromPayload(payload *backuppb.PitrIdMapPayload) (map[int64]model.TiFlashReplicaInfo, error) { - if payload == nil { - return nil, errors.New("pitr id map payload is nil") - } - return PitrTiFlashItemsFromProto(payload.TiflashItems), nil -} - -func decodePitrIdMapPayload(metaData []byte) (*backuppb.PitrIdMapPayload, error) { - payload := &backuppb.PitrIdMapPayload{} - if err := payload.Unmarshal(metaData); err != nil { - return nil, errors.Trace(err) - } - return payload, nil -} - -func newPitrIdMapPayload(dbMaps []*backuppb.PitrDBMap) *backuppb.PitrIdMapPayload { - return &backuppb.PitrIdMapPayload{ - Version: pitrIdMapPayloadVersion, - DbMaps: dbMaps, - } -} - -func NewPitrIdMapPayload(dbMaps []*backuppb.PitrDBMap) *backuppb.PitrIdMapPayload { - return newPitrIdMapPayload(dbMaps) -} - func (rc *LogClient) loadPitrIdMapDataFromTable( ctx context.Context, restoredTS uint64, @@ -275,18 +231,6 @@ func (rc *LogClient) loadPitrIdMapPayloadFromTable( restoreID uint64, ) (*backuppb.PitrIdMapPayload, bool, error) { restoreID = rc.normalizePitrIdMapRestoreID(restoreID) - payload, found, err := rc.loadPitrIdMapPayloadFromTableOnce(ctx, restoredTS, restoreID) - if err != nil { - return nil, false, errors.Trace(err) - } - return payload, found, nil -} - -func (rc *LogClient) loadPitrIdMapPayloadFromTableOnce( - ctx context.Context, - restoredTS uint64, - restoreID uint64, -) (*backuppb.PitrIdMapPayload, bool, error) { metaData, found, err := rc.loadPitrIdMapDataFromTable(ctx, restoredTS, restoreID) if err != nil { return nil, false, errors.Trace(err) @@ -294,8 +238,8 @@ func (rc *LogClient) loadPitrIdMapPayloadFromTableOnce( if !found { return nil, false, nil } - payload, err := decodePitrIdMapPayload(metaData) - if err != nil { + payload := &backuppb.PitrIdMapPayload{} + if err := payload.Unmarshal(metaData); err != nil { return nil, false, errors.Trace(err) } return payload, true, nil @@ -309,9 +253,6 @@ func (rc *LogClient) savePitrIdMapPayloadToTable( if payload == nil { return errors.New("pitr id map payload is nil") } - if payload.Version == 0 { - payload.Version = pitrIdMapPayloadVersion - } data, err := proto.Marshal(payload) if err != nil { return errors.Trace(err) diff --git a/br/pkg/task/operator/checksum_table.go b/br/pkg/task/operator/checksum_table.go index 
682a90cd32482..8b578182915d9 100644 --- a/br/pkg/task/operator/checksum_table.go +++ b/br/pkg/task/operator/checksum_table.go @@ -258,10 +258,11 @@ func (c *checksumTableCtx) loadPitrIdMap(ctx context.Context, g glue.Glue, resto for _, row := range rows { restoreID, elementID, data := getRowColumns(row) if lastRestoreID != restoreID { - dbMaps, err := logclient.DecodePitrIdMapPayload(metaData) - if err != nil { + payload := &backup.PitrIdMapPayload{} + if err := payload.Unmarshal(metaData); err != nil { return nil, errors.Trace(err) } + dbMaps := payload.GetDbMaps() pitrDBMap = append(pitrDBMap, dbMaps...) metaData = make([]byte, 0) lastRestoreID = restoreID @@ -277,10 +278,11 @@ func (c *checksumTableCtx) loadPitrIdMap(ctx context.Context, g glue.Glue, resto nextSegmentID += 1 } if len(metaData) > 0 { - dbMaps, err := logclient.DecodePitrIdMapPayload(metaData) - if err != nil { + payload := &backup.PitrIdMapPayload{} + if err := payload.Unmarshal(metaData); err != nil { return nil, errors.Trace(err) } + dbMaps := payload.GetDbMaps() pitrDBMap = append(pitrDBMap, dbMaps...) } return pitrDBMap, nil diff --git a/br/pkg/task/restore.go b/br/pkg/task/restore.go index 35785cdee4591..93175850324a2 100644 --- a/br/pkg/task/restore.go +++ b/br/pkg/task/restore.go @@ -327,6 +327,10 @@ func (cfg *RestoreConfig) LocalEncryptionEnabled() bool { return cfg.CipherInfo.CipherType != encryptionpb.EncryptionMethod_PLAINTEXT } +func (cfg *RestoreConfig) HasFullBackupStorage() bool { + return len(cfg.FullBackupStorage) > 0 +} + type immutableRestoreConfig struct { CmdName string UpstreamClusterID uint64 diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index a22718e77ac58..f2606be23ad3d 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -1633,22 +1633,22 @@ func restoreStream( if err != nil { return errors.Trace(err) } + if !cfg.HasFullBackupStorage() && segmentedPayloads.previous == nil { + return errors.Annotatef(berrors.ErrInvalidArgument, + "missing payload for previous segment; start-ts=%d", cfg.StartTS) + } if err := buildIDMapWithPayloads(ctx, client, cfg, segmentedPayloads, savedIDMap); err != nil { return errors.Trace(err) } - if cfg.tiflashRecorder != nil && len(cfg.FullBackupStorage) == 0 { - items, loaded, err := loadSegmentedTiFlashItems(ctx, client, cfg, segmentedPayloads) + if cfg.tiflashRecorder != nil && !cfg.HasFullBackupStorage() { + items, err := loadSegmentedTiFlashItems(segmentedPayloads.previous) if err != nil { return errors.Trace(err) } - if !loaded { - log.Info("no tiflash items found for previous segment", zap.Uint64("start-ts", cfg.StartTS)) - } else { - cfg.tiflashRecorder.Load(items) - log.Info("loaded tiflash items for previous segment", - zap.Uint64("start-ts", cfg.StartTS), - zap.Int("item-count", len(items))) - } + cfg.tiflashRecorder.Load(items) + log.Info("loaded tiflash items for previous segment", + zap.Uint64("start-ts", cfg.StartTS), + zap.Int("item-count", len(items))) } // build schema replace @@ -1656,20 +1656,16 @@ func restoreStream( if err != nil { return errors.Trace(err) } - if recorder := schemasReplace.GetIngestRecorder(); recorder != nil && len(cfg.FullBackupStorage) == 0 { - items, loaded, err := loadSegmentedIngestItems(ctx, client, cfg, segmentedPayloads) + if recorder := schemasReplace.GetIngestRecorder(); recorder != nil && !cfg.HasFullBackupStorage() { + items, err := loadSegmentedIngestItems(segmentedPayloads.previous) if err != nil { return errors.Trace(err) } - if !loaded { - log.Info("no ingest items found for previous 
segment", zap.Uint64("start-ts", cfg.StartTS)) - } else { - recorder.MergeItems(items) - log.Info("loaded ingest items for previous segment", - zap.Uint64("start-ts", cfg.StartTS), - zap.Int("table-count", len(items)), - zap.Int("index-count", ingestrec.CountItems(items))) - } + recorder.MergeItems(items) + log.Info("loaded ingest items for previous segment", + zap.Uint64("start-ts", cfg.StartTS), + zap.Int("table-count", len(items)), + zap.Int("index-count", ingestrec.CountItems(items))) } importModeSwitcher := restore.NewImportModeSwitcher(mgr.GetPDClient(), @@ -2369,76 +2365,26 @@ func buildIDMapWithPayloads( if err := cfg.tableMappingManager.ReplaceTemporaryIDs(ctx, client.GenGlobalIDs); err != nil { return errors.Trace(err) } - payloads.current = logclient.NewPitrIdMapPayload(cfg.tableMappingManager.ToProto()) + payloads.current = &backuppb.PitrIdMapPayload{DbMaps: cfg.tableMappingManager.ToProto()} return nil } func loadSegmentedTiFlashItems( - ctx context.Context, - client *logclient.LogClient, - cfg *LogRestoreConfig, - payloads *segmentedRestorePayloads, -) (map[int64]model.TiFlashReplicaInfo, bool, error) { - if payloads != nil && payloads.previous != nil { - items, err := logclient.PitrTiFlashItemsFromPayload(payloads.previous) - if err != nil { - return nil, false, errors.Trace(err) - } - if items == nil { - items = map[int64]model.TiFlashReplicaInfo{} - } - return items, true, nil - } - - if cfg.logCheckpointMetaManager == nil { - return nil, false, nil - } - clusterID := client.GetClusterID(ctx) - items, found, err := cfg.logCheckpointMetaManager.LoadPITRTiFlashItems(ctx, clusterID, cfg.StartTS) - if err != nil { - return nil, false, errors.Trace(err) - } - if !found { - return nil, false, nil - } - if items == nil { - items = map[int64]model.TiFlashReplicaInfo{} + payload *backuppb.PitrIdMapPayload, +) (map[int64]model.TiFlashReplicaInfo, error) { + if payload == nil { + return nil, errors.New("pitr id map payload is nil") } - return items, true, nil + return logclient.PitrTiFlashItemsFromProto(payload.TiflashItems), nil } func loadSegmentedIngestItems( - ctx context.Context, - client *logclient.LogClient, - cfg *LogRestoreConfig, - payloads *segmentedRestorePayloads, -) (map[int64]map[int64]bool, bool, error) { - if payloads != nil && payloads.previous != nil { - items, err := logclient.PitrIngestItemsFromPayload(payloads.previous) - if err != nil { - return nil, false, errors.Trace(err) - } - if items == nil { - items = map[int64]map[int64]bool{} - } - return items, true, nil - } - - if cfg.logCheckpointMetaManager == nil { - return nil, false, nil - } - clusterID := client.GetClusterID(ctx) - items, found, err := cfg.logCheckpointMetaManager.LoadPITRIngestItems(ctx, clusterID, cfg.StartTS) - if err != nil { - return nil, false, errors.Trace(err) - } - if !found { - return nil, false, nil - } - if items == nil { - items = map[int64]map[int64]bool{} + payload *backuppb.PitrIdMapPayload, +) (map[int64]map[int64]bool, error) { + if payload == nil { + return nil, errors.New("pitr id map payload is nil") } - return items, true, nil + return logclient.PitrIngestItemsFromProto(payload.IngestItems), nil } func persistSegmentedRestorePayload( @@ -2450,7 +2396,7 @@ func persistSegmentedRestorePayload( ) error { if payloads == nil || payloads.current == nil { payloads = &segmentedRestorePayloads{ - current: logclient.NewPitrIdMapPayload(cfg.tableMappingManager.ToProto()), + current: &backuppb.PitrIdMapPayload{DbMaps: cfg.tableMappingManager.ToProto()}, } } if !cfg.LastRestore { From 
ddd4205ae692b3d0002018253ac430c883f08f1f Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Tue, 3 Feb 2026 09:55:38 +0000 Subject: [PATCH 16/18] go back Signed-off-by: Juncen Yu --- br/pkg/checkpoint/storage.go | 2 - br/pkg/errors/errors.go | 1 - br/pkg/restore/ingestrec/ingest_recorder.go | 63 ---- br/pkg/restore/log_client/client.go | 87 +---- br/pkg/restore/log_client/id_map.go | 156 ++++++--- .../restore/log_client/pitr_id_map_payload.go | 298 ------------------ br/pkg/task/operator/checksum_table.go | 14 +- br/pkg/task/restore.go | 18 -- br/pkg/task/stream.go | 208 ++---------- br/tests/br_restore_checkpoint/run.sh | 13 +- 10 files changed, 151 insertions(+), 709 deletions(-) delete mode 100644 br/pkg/restore/log_client/pitr_id_map_payload.go diff --git a/br/pkg/checkpoint/storage.go b/br/pkg/checkpoint/storage.go index a0daf51f5d1f1..59f319057ec6d 100644 --- a/br/pkg/checkpoint/storage.go +++ b/br/pkg/checkpoint/storage.go @@ -49,7 +49,6 @@ type checkpointStorage interface { // 3. BR regards the checkpoint table as a directory which is managed by metadata table. const ( LogRestoreCheckpointDatabaseName string = "__TiDB_BR_Temporary_Log_Restore_Checkpoint" - LogRestorePITRItemsDatabaseName string = "__TiDB_BR_Temporary_Log_Restore_PiTR_Items" SnapshotRestoreCheckpointDatabaseName string = "__TiDB_BR_Temporary_Snapshot_Restore_Checkpoint" CustomSSTRestoreCheckpointDatabaseName string = "__TiDB_BR_Temporary_Custom_SST_Restore_Checkpoint" @@ -94,7 +93,6 @@ const ( func IsCheckpointDB(dbname string) bool { // Check if the database name starts with any of the checkpoint database name prefixes return strings.HasPrefix(dbname, LogRestoreCheckpointDatabaseName) || - strings.HasPrefix(dbname, LogRestorePITRItemsDatabaseName) || strings.HasPrefix(dbname, SnapshotRestoreCheckpointDatabaseName) || strings.HasPrefix(dbname, CustomSSTRestoreCheckpointDatabaseName) } diff --git a/br/pkg/errors/errors.go b/br/pkg/errors/errors.go index ca5cc3b06f1ae..a21855f95c589 100644 --- a/br/pkg/errors/errors.go +++ b/br/pkg/errors/errors.go @@ -88,7 +88,6 @@ var ( ErrPiTRTaskNotFound = errors.Normalize("task not found", errors.RFCCodeText("BR:PiTR:ErrTaskNotFound")) ErrPiTRInvalidTaskInfo = errors.Normalize("task info is invalid", errors.RFCCodeText("BR:PiTR:ErrInvalidTaskInfo")) ErrPiTRMalformedMetadata = errors.Normalize("malformed metadata", errors.RFCCodeText("BR:PiTR:ErrMalformedMetadata")) - ErrPiTRIDMapTableNotFound = errors.Normalize("id map table not found", errors.RFCCodeText("BR:PiTR:IDMapTableNotFound")) ErrStorageUnknown = errors.Normalize("unknown external storage error", errors.RFCCodeText("BR:ExternalStorage:ErrStorageUnknown")) ErrStorageInvalidConfig = errors.Normalize("invalid external storage config", errors.RFCCodeText("BR:ExternalStorage:ErrStorageInvalidConfig")) diff --git a/br/pkg/restore/ingestrec/ingest_recorder.go b/br/pkg/restore/ingestrec/ingest_recorder.go index 78f22c5adff65..f406d36b0d680 100644 --- a/br/pkg/restore/ingestrec/ingest_recorder.go +++ b/br/pkg/restore/ingestrec/ingest_recorder.go @@ -247,66 +247,3 @@ func (i *IngestRecorder) IterateForeignKeys(f func(*ForeignKeyRecord) error) err } return nil } - -// CountItems counts the total ingested indexes across all tables. -func CountItems(items map[int64]map[int64]bool) int { - total := 0 - for _, indexMap := range items { - total += len(indexMap) - } - return total -} - -// ExportItems returns a snapshot of ingest items keyed by table ID and index ID. 
-func (i *IngestRecorder) ExportItems() map[int64]map[int64]bool { - items := make(map[int64]map[int64]bool, len(i.items)) - for tableID, indexes := range i.items { - if len(indexes) == 0 { - continue - } - tableItems := make(map[int64]bool, len(indexes)) - for indexID, info := range indexes { - if info == nil { - continue - } - tableItems[indexID] = info.IsPrimary - } - if len(tableItems) > 0 { - items[tableID] = tableItems - } - } - return items -} - -// MergeItems merges the provided ingest items into the recorder. -func (i *IngestRecorder) MergeItems(items map[int64]map[int64]bool) { - if len(items) == 0 { - return - } - if i.items == nil { - i.items = make(map[int64]map[int64]*IngestIndexInfo) - } - for tableID, indexMap := range items { - if len(indexMap) == 0 { - continue - } - tableIndexes, exists := i.items[tableID] - if !exists { - tableIndexes = make(map[int64]*IngestIndexInfo, len(indexMap)) - i.items[tableID] = tableIndexes - } - for indexID, isPrimary := range indexMap { - info, exists := tableIndexes[indexID] - if !exists { - tableIndexes[indexID] = &IngestIndexInfo{ - IsPrimary: isPrimary, - Updated: false, - } - continue - } - if isPrimary && !info.IsPrimary { - info.IsPrimary = true - } - } - } -} diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index 9b61e81bc6c09..07d77db537a56 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -217,7 +217,6 @@ type LogClient struct { upstreamClusterID uint64 restoreID uint64 - lastRestore bool // the query to insert rows into table `gc_delete_range`, lack of ts. deleteRangeQuery []*stream.PreDelRangeQuery @@ -235,18 +234,6 @@ func (rc *LogClient) SetRestoreID(restoreID uint64) { rc.restoreID = restoreID } -func (rc *LogClient) SetRestoreToLast(restoreToLast bool) { - rc.lastRestore = restoreToLast -} - -func (rc *LogClient) LastOne() bool { - return rc.lastRestore -} - -func (rc *LogClient) ValidateNoTiFlashReplica() error { - return rc.validateNoTiFlashReplica() -} - type restoreStatistics struct { // restoreSSTKVSize is the total size (Original KV length) of KV pairs restored from SST files. restoreSSTKVSize uint64 @@ -1059,7 +1046,7 @@ func (rc *LogClient) GetBaseIDMapAndMerge( // schemas map whose `restore-ts`` is the task's `start-ts`. 
if len(dbMaps) <= 0 && !hasFullBackupStorageConfig { log.Info("try to load pitr id maps of the previous task", zap.Uint64("start-ts", rc.startTS)) - dbMaps, err = rc.loadSchemasMapFromLastTask(ctx, rc.startTS) + dbMaps, err = rc.loadSchemasMap(ctx, rc.startTS, logCheckpointMetaManager) if err != nil { return errors.Trace(err) } @@ -1994,22 +1981,12 @@ func (rc *LogClient) SaveIdMapWithFailPoints( ctx context.Context, manager *stream.TableMappingManager, logCheckpointMetaManager checkpoint.LogMetaManagerT, -) error { - payload := &backuppb.PitrIdMapPayload{DbMaps: manager.ToProto()} - return rc.SavePitrIdMapPayloadWithFailPoints(ctx, rc.restoreTS, payload, logCheckpointMetaManager) -} - -func (rc *LogClient) SavePitrIdMapPayloadWithFailPoints( - ctx context.Context, - restoredTS uint64, - payload *backuppb.PitrIdMapPayload, - logCheckpointMetaManager checkpoint.LogMetaManagerT, ) error { failpoint.Inject("failed-before-id-maps-saved", func(_ failpoint.Value) { failpoint.Return(errors.New("failpoint: failed before id maps saved")) }) - if err := rc.savePitrIdMapPayload(ctx, restoredTS, payload, logCheckpointMetaManager); err != nil { + if err := rc.saveIDMap(ctx, manager, logCheckpointMetaManager); err != nil { return errors.Trace(err) } @@ -2169,23 +2146,22 @@ func (rc *LogClient) RefreshMetaForTables(ctx context.Context, schemasReplace *s return errors.Errorf("the deleted table(upstream ID: %d) has no record in replace map", upstreamTableID) } - involvedDB, involvedTable := rc.resolveInvolvingNames(ctx, dbReplace, tableReplace) args := &model.RefreshMetaArgs{ SchemaID: dbReplace.DbID, TableID: tableReplace.TableID, - InvolvedDB: involvedDB, - InvolvedTable: involvedTable, + InvolvedDB: dbReplace.Name, + InvolvedTable: tableReplace.Name, } log.Info("refreshing deleted table meta", zap.Int64("schemaID", dbReplace.DbID), - zap.String("dbName", involvedDB), + zap.String("dbName", dbReplace.Name), zap.Any("tableID", tableReplace.TableID), - zap.String("tableName", involvedTable)) + zap.String("tableName", tableReplace.Name)) if err := rc.unsafeSession.RefreshMeta(ctx, args); err != nil { return errors.Annotatef(err, "failed to refresh meta for deleted table with schemaID=%d, tableID=%d, dbName=%s, tableName=%s", - dbReplace.DbID, tableReplace.TableID, involvedDB, involvedTable) + dbReplace.DbID, tableReplace.TableID, dbReplace.Name, tableReplace.Name) } deletedTableCount++ } @@ -2266,21 +2242,20 @@ func (rc *LogClient) RefreshMetaForTables(ctx context.Context, schemasReplace *s } } - involvedDB, involvedTable := rc.resolveInvolvingNames(ctx, dbReplace, tableReplace) args := &model.RefreshMetaArgs{ SchemaID: dbReplace.DbID, TableID: tableReplace.TableID, - InvolvedDB: involvedDB, - InvolvedTable: involvedTable, + InvolvedDB: dbReplace.Name, + InvolvedTable: tableReplace.Name, } log.Info("refreshing regular table meta", zap.Int64("schemaID", dbReplace.DbID), - zap.String("dbName", involvedDB), + zap.String("dbName", dbReplace.Name), zap.Any("tableID", tableReplace.TableID), - zap.String("tableName", involvedTable)) + zap.String("tableName", tableReplace.Name)) if err := rc.unsafeSession.RefreshMeta(ctx, args); err != nil { return errors.Annotatef(err, "failed to refresh meta for table with schemaID=%d, tableID=%d, dbName=%s, tableName=%s", - dbReplace.DbID, tableReplace.TableID, involvedDB, involvedTable) + dbReplace.DbID, tableReplace.TableID, dbReplace.Name, tableReplace.Name) } regularCount++ } @@ -2291,41 +2266,3 @@ func (rc *LogClient) RefreshMetaForTables(ctx context.Context, schemasReplace 
*s zap.Int("regularTableCount", regularCount)) return nil } - -func (rc *LogClient) resolveInvolvingNames( - ctx context.Context, - dbReplace *stream.DBReplace, - tableReplace *stream.TableReplace, -) (string, string) { - dbName := "" - if dbReplace != nil { - dbName = dbReplace.Name - } - tableName := "" - if tableReplace != nil { - tableName = tableReplace.Name - } - - infoSchema := rc.dom.InfoSchema() - if dbName == "" && dbReplace != nil { - if dbInfo, ok := infoSchema.SchemaByID(dbReplace.DbID); ok { - dbName = dbInfo.Name.O - } - } - if tableName == "" && tableReplace != nil && tableReplace.TableID != 0 { - if tbl, ok := infoSchema.TableByID(ctx, tableReplace.TableID); ok { - tableName = tbl.Meta().Name.O - } - } - - if dbName == "" { - dbName = model.InvolvingAll - } - if tableName == "" { - tableName = model.InvolvingAll - } - if dbName == model.InvolvingAll && tableName != model.InvolvingAll { - tableName = model.InvolvingAll - } - return dbName, tableName -} diff --git a/br/pkg/restore/log_client/id_map.go b/br/pkg/restore/log_client/id_map.go index 80e89b45360d9..0ad5bb92ee9d0 100644 --- a/br/pkg/restore/log_client/id_map.go +++ b/br/pkg/restore/log_client/id_map.go @@ -18,14 +18,15 @@ import ( "context" "fmt" + "github.com/gogo/protobuf/proto" "github.com/pingcap/errors" backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/checkpoint" - berrors "github.com/pingcap/tidb/br/pkg/errors" "github.com/pingcap/tidb/br/pkg/metautil" "github.com/pingcap/tidb/br/pkg/restore" "github.com/pingcap/tidb/br/pkg/stream" + "github.com/pingcap/tidb/pkg/kv" "github.com/pingcap/tidb/pkg/objstore/storeapi" "github.com/pingcap/tidb/pkg/parser/ast" "go.uber.org/zap" @@ -61,8 +62,32 @@ func (rc *LogClient) saveIDMap( manager *stream.TableMappingManager, logCheckpointMetaManager checkpoint.LogMetaManagerT, ) error { - payload := &backuppb.PitrIdMapPayload{DbMaps: manager.ToProto()} - return rc.savePitrIdMapPayload(ctx, rc.restoreTS, payload, logCheckpointMetaManager) + dbmaps := manager.ToProto() + if checkpointStorage := rc.tryGetCheckpointStorage(logCheckpointMetaManager); checkpointStorage != nil { + log.Info("checkpoint storage is specified, load pitr id map from the checkpoint storage.") + if err := rc.saveIDMap2Storage(ctx, checkpointStorage, dbmaps); err != nil { + return errors.Trace(err) + } + } else if rc.pitrIDMapTableExists() { + if err := rc.saveIDMap2Table(ctx, dbmaps); err != nil { + return errors.Trace(err) + } + } else { + log.Info("the table mysql.tidb_pitr_id_map does not exist, maybe the cluster version is old.") + if err := rc.saveIDMap2Storage(ctx, rc.storage, dbmaps); err != nil { + return errors.Trace(err) + } + } + + if rc.useCheckpoint { + log.Info("save checkpoint task info with InLogRestoreAndIdMapPersist status") + if err := logCheckpointMetaManager.SaveCheckpointProgress(ctx, &checkpoint.CheckpointProgress{ + Progress: checkpoint.InLogRestoreAndIdMapPersisted, + }); err != nil { + return errors.Trace(err) + } + } + return nil } func (rc *LogClient) saveIDMap2Storage( @@ -81,49 +106,48 @@ func (rc *LogClient) saveIDMap2Storage( } func (rc *LogClient) saveIDMap2Table(ctx context.Context, dbMaps []*backuppb.PitrDBMap) error { - payload := &backuppb.PitrIdMapPayload{DbMaps: dbMaps} - if existing, found, err := rc.loadPitrIdMapPayloadFromTable(ctx, rc.restoreTS, rc.restoreID); err != nil { + backupmeta := &backuppb.BackupMeta{DbMaps: dbMaps} + data, err := proto.Marshal(backupmeta) + if err != nil { return errors.Trace(err) - } 
else if found { - payload.IngestItems = existing.IngestItems - payload.TiflashItems = existing.TiflashItems } - return errors.Trace(rc.savePitrIdMapPayloadToTable(ctx, rc.restoreTS, payload)) -} -func (rc *LogClient) savePitrIdMapPayload( - ctx context.Context, - restoredTS uint64, - payload *backuppb.PitrIdMapPayload, - logCheckpointMetaManager checkpoint.LogMetaManagerT, -) error { - if payload == nil { - return errors.New("pitr id map payload is nil") - } - tableExists := rc.pitrIDMapTableExists() - if tableExists { - if err := rc.savePitrIdMapPayloadToTable(ctx, restoredTS, payload); err != nil { + hasRestoreIDColumn := rc.pitrIDMapHasRestoreIDColumn() + + if hasRestoreIDColumn { + // new version with restore_id column + // clean the dirty id map at first + err = rc.unsafeSession.ExecuteInternal(ctx, "DELETE FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %? and restore_id = %?;", + rc.restoreTS, rc.upstreamClusterID, rc.restoreID) + if err != nil { return errors.Trace(err) } - } - if checkpointStorage := rc.tryGetCheckpointStorage(logCheckpointMetaManager); checkpointStorage != nil { - log.Info("checkpoint storage is specified, save pitr id map to the checkpoint storage.") - if err := rc.saveIDMap2Storage(ctx, checkpointStorage, payload.GetDbMaps()); err != nil { - return errors.Trace(err) + replacePitrIDMapSQL := "REPLACE INTO mysql.tidb_pitr_id_map (restore_id, restored_ts, upstream_cluster_id, segment_id, id_map) VALUES (%?, %?, %?, %?, %?);" + for startIdx, segmentId := 0, 0; startIdx < len(data); segmentId += 1 { + endIdx := min(startIdx+PITRIdMapBlockSize, len(data)) + err := rc.unsafeSession.ExecuteInternal(ctx, replacePitrIDMapSQL, rc.restoreID, rc.restoreTS, rc.upstreamClusterID, segmentId, data[startIdx:endIdx]) + if err != nil { + return errors.Trace(err) + } + startIdx = endIdx } - } else if !tableExists { - log.Info("the table mysql.tidb_pitr_id_map does not exist, maybe the cluster version is old.") - if err := rc.saveIDMap2Storage(ctx, rc.storage, payload.GetDbMaps()); err != nil { + } else { + // old version without restore_id column - use default value 0 for restore_id + log.Info("mysql.tidb_pitr_id_map table does not have restore_id column, using backward compatible mode") + // clean the dirty id map at first (without restore_id filter) + err = rc.unsafeSession.ExecuteInternal(ctx, "DELETE FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? 
and upstream_cluster_id = %?;", + rc.restoreTS, rc.upstreamClusterID) + if err != nil { return errors.Trace(err) } - } - - if rc.useCheckpoint { - log.Info("save checkpoint task info with InLogRestoreAndIdMapPersist status") - if err := logCheckpointMetaManager.SaveCheckpointProgress(ctx, &checkpoint.CheckpointProgress{ - Progress: checkpoint.InLogRestoreAndIdMapPersisted, - }); err != nil { - return errors.Trace(err) + replacePitrIDMapSQL := "REPLACE INTO mysql.tidb_pitr_id_map (restored_ts, upstream_cluster_id, segment_id, id_map) VALUES (%?, %?, %?, %?);" + for startIdx, segmentId := 0, 0; startIdx < len(data); segmentId += 1 { + endIdx := min(startIdx+PITRIdMapBlockSize, len(data)) + err := rc.unsafeSession.ExecuteInternal(ctx, replacePitrIDMapSQL, rc.restoreTS, rc.upstreamClusterID, segmentId, data[startIdx:endIdx]) + if err != nil { + return errors.Trace(err) + } + startIdx = endIdx } } return nil @@ -148,13 +172,6 @@ func (rc *LogClient) loadSchemasMap( return dbMaps, errors.Trace(err) } -func (rc *LogClient) loadSchemasMapFromLastTask(ctx context.Context, lastRestoredTS uint64) ([]*backuppb.PitrDBMap, error) { - if !rc.pitrIDMapTableExists() { - return nil, errors.Annotatef(berrors.ErrPiTRIDMapTableNotFound, "segmented restore is impossible") - } - return rc.loadSchemasMapFromTable(ctx, lastRestoredTS) -} - func (rc *LogClient) loadSchemasMapFromStorage( ctx context.Context, storage storeapi.Storage, @@ -186,13 +203,52 @@ func (rc *LogClient) loadSchemasMapFromTable( ctx context.Context, restoredTS uint64, ) ([]*backuppb.PitrDBMap, error) { - payload, found, err := rc.loadPitrIdMapPayloadForSegment(ctx, restoredTS) - if err != nil { - return nil, errors.Trace(err) + hasRestoreIDColumn := rc.pitrIDMapHasRestoreIDColumn() + + var getPitrIDMapSQL string + var args []any + + if hasRestoreIDColumn { + // new version with restore_id column + getPitrIDMapSQL = "SELECT segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restore_id = %? and restored_ts = %? and upstream_cluster_id = %? ORDER BY segment_id;" + args = []any{rc.restoreID, restoredTS, rc.upstreamClusterID} + } else { + // old version without restore_id column + log.Info("mysql.tidb_pitr_id_map table does not have restore_id column, using backward compatible mode") + getPitrIDMapSQL = "SELECT segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %? ORDER BY segment_id;" + args = []any{restoredTS, rc.upstreamClusterID} } - if !found { + + execCtx := rc.unsafeSession.GetSessionCtx().GetRestrictedSQLExecutor() + rows, _, errSQL := execCtx.ExecRestrictedSQL( + kv.WithInternalSourceType(ctx, kv.InternalTxnBR), + nil, + getPitrIDMapSQL, + args..., + ) + if errSQL != nil { + return nil, errors.Annotatef(errSQL, "failed to get pitr id map from mysql.tidb_pitr_id_map") + } + if len(rows) == 0 { log.Info("pitr id map does not exist", zap.Uint64("restored ts", restoredTS)) return nil, nil } - return payload.GetDbMaps(), nil + metaData := make([]byte, 0, len(rows)*PITRIdMapBlockSize) + for i, row := range rows { + elementID := row.GetUint64(0) + if uint64(i) != elementID { + return nil, errors.Errorf("the part(segment_id = %d) of pitr id map is lost", i) + } + d := row.GetBytes(1) + if len(d) == 0 { + return nil, errors.Errorf("get the empty part(segment_id = %d) of pitr id map", i) + } + metaData = append(metaData, d...) 
+ } + backupMeta := &backuppb.BackupMeta{} + if err := backupMeta.Unmarshal(metaData); err != nil { + return nil, errors.Trace(err) + } + + return backupMeta.GetDbMaps(), nil } diff --git a/br/pkg/restore/log_client/pitr_id_map_payload.go b/br/pkg/restore/log_client/pitr_id_map_payload.go deleted file mode 100644 index ac385ad0a343a..0000000000000 --- a/br/pkg/restore/log_client/pitr_id_map_payload.go +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright 2026 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package logclient - -import ( - "context" - - "github.com/gogo/protobuf/proto" - "github.com/pingcap/errors" - backuppb "github.com/pingcap/kvproto/pkg/brpb" - "github.com/pingcap/log" - "github.com/pingcap/tidb/pkg/kv" - "github.com/pingcap/tidb/pkg/meta/model" -) - -func PitrIngestItemsFromProto(items []*backuppb.PitrIngestItem) map[int64]map[int64]bool { - if len(items) == 0 { - return map[int64]map[int64]bool{} - } - result := make(map[int64]map[int64]bool, len(items)) - for _, item := range items { - if item == nil { - continue - } - indexes := make(map[int64]bool, len(item.Indexes)) - for _, index := range item.Indexes { - if index == nil { - continue - } - indexes[index.IndexId] = index.IsPrimary - } - result[item.TableId] = indexes - } - return result -} - -func PitrIngestItemsToProto(items map[int64]map[int64]bool) []*backuppb.PitrIngestItem { - if len(items) == 0 { - return nil - } - result := make([]*backuppb.PitrIngestItem, 0, len(items)) - for tableID, indexMap := range items { - indexes := make([]*backuppb.PitrIngestIndex, 0, len(indexMap)) - for indexID, isPrimary := range indexMap { - indexes = append(indexes, &backuppb.PitrIngestIndex{ - IndexId: indexID, - IsPrimary: isPrimary, - }) - } - result = append(result, &backuppb.PitrIngestItem{ - TableId: tableID, - Indexes: indexes, - }) - } - return result -} - -func PitrTiFlashItemsFromProto(items []*backuppb.PitrTiFlashItem) map[int64]model.TiFlashReplicaInfo { - if len(items) == 0 { - return map[int64]model.TiFlashReplicaInfo{} - } - result := make(map[int64]model.TiFlashReplicaInfo, len(items)) - for _, item := range items { - if item == nil || item.Replica == nil { - continue - } - replica := item.Replica - result[item.TableId] = model.TiFlashReplicaInfo{ - Count: replica.Count, - LocationLabels: append([]string(nil), replica.LocationLabels...), - Available: replica.Available, - AvailablePartitionIDs: append([]int64(nil), replica.AvailablePartitionIds...), - } - } - return result -} - -func PitrTiFlashItemsToProto(items map[int64]model.TiFlashReplicaInfo) []*backuppb.PitrTiFlashItem { - if len(items) == 0 { - return nil - } - result := make([]*backuppb.PitrTiFlashItem, 0, len(items)) - for tableID, replica := range items { - result = append(result, &backuppb.PitrTiFlashItem{ - TableId: tableID, - Replica: &backuppb.PitrTiFlashReplicaInfo{ - Count: replica.Count, - LocationLabels: append([]string(nil), replica.LocationLabels...), - Available: replica.Available, - AvailablePartitionIds: 
append([]int64(nil), replica.AvailablePartitionIDs...), - }, - }) - } - return result -} - -func (rc *LogClient) loadPitrIdMapDataFromTable( - ctx context.Context, - restoredTS uint64, - restoreID uint64, -) ([]byte, bool, error) { - var getPitrIDMapSQL string - var args []any - - if rc.pitrIDMapHasRestoreIDColumn() { - getPitrIDMapSQL = "SELECT segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restore_id = %? and restored_ts = %? and upstream_cluster_id = %? ORDER BY segment_id;" - args = []any{restoreID, restoredTS, rc.upstreamClusterID} - } else { - getPitrIDMapSQL = "SELECT segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %? ORDER BY segment_id;" - args = []any{restoredTS, rc.upstreamClusterID} - } - - execCtx := rc.unsafeSession.GetSessionCtx().GetRestrictedSQLExecutor() - rows, _, errSQL := execCtx.ExecRestrictedSQL( - kv.WithInternalSourceType(ctx, kv.InternalTxnBR), - nil, - getPitrIDMapSQL, - args..., - ) - if errSQL != nil { - return nil, false, errors.Annotatef(errSQL, "failed to get pitr id map from mysql.tidb_pitr_id_map") - } - if len(rows) == 0 { - return nil, false, nil - } - metaData := make([]byte, 0, len(rows)*PITRIdMapBlockSize) - for i, row := range rows { - elementID := row.GetUint64(0) - if uint64(i) != elementID { - return nil, false, errors.Errorf("the part(segment_id = %d) of pitr id map is lost", i) - } - d := row.GetBytes(1) - if len(d) == 0 { - return nil, false, errors.Errorf("get the empty part(segment_id = %d) of pitr id map", i) - } - metaData = append(metaData, d...) - } - return metaData, true, nil -} - -func (rc *LogClient) loadLatestRestoreIDFromTable( - ctx context.Context, - restoredTS uint64, -) (uint64, bool, error) { - if !rc.pitrIDMapHasRestoreIDColumn() { - return 0, false, errors.New("restore_id column is not available") - } - execCtx := rc.unsafeSession.GetSessionCtx().GetRestrictedSQLExecutor() - rows, _, errSQL := execCtx.ExecRestrictedSQL( - kv.WithInternalSourceType(ctx, kv.InternalTxnBR), - nil, - "SELECT restore_id FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %? 
ORDER BY restore_id DESC LIMIT 1;", - restoredTS, rc.upstreamClusterID, - ) - if errSQL != nil { - return 0, false, errors.Annotatef(errSQL, "failed to get latest restore_id from mysql.tidb_pitr_id_map") - } - if len(rows) == 0 { - return 0, false, nil - } - return rows[0].GetUint64(0), true, nil -} - -func (rc *LogClient) resolvePitrIdMapRestoreID( - ctx context.Context, - restoredTS uint64, -) (uint64, bool, error) { - if !rc.pitrIDMapHasRestoreIDColumn() { - return 0, true, nil - } - if restoredTS == rc.restoreTS { - return rc.restoreID, true, nil - } - restoreID, found, err := rc.loadLatestRestoreIDFromTable(ctx, restoredTS) - if err != nil { - return 0, false, errors.Trace(err) - } - if !found { - return 0, false, nil - } - return restoreID, true, nil -} - -func (rc *LogClient) normalizePitrIdMapRestoreID(restoreID uint64) uint64 { - if rc.pitrIDMapHasRestoreIDColumn() { - return restoreID - } - return 0 -} - -func (rc *LogClient) loadPitrIdMapPayloadForSegment( - ctx context.Context, - restoredTS uint64, -) (*backuppb.PitrIdMapPayload, bool, error) { - restoreID, found, err := rc.resolvePitrIdMapRestoreID(ctx, restoredTS) - if err != nil { - return nil, false, errors.Trace(err) - } - if !found { - return nil, false, nil - } - restoreID = rc.normalizePitrIdMapRestoreID(restoreID) - return rc.loadPitrIdMapPayloadFromTable(ctx, restoredTS, restoreID) -} - -func (rc *LogClient) LoadPitrIdMapPayloadForSegment( - ctx context.Context, - restoredTS uint64, -) (*backuppb.PitrIdMapPayload, bool, error) { - if !rc.pitrIDMapTableExists() { - return nil, false, nil - } - return rc.loadPitrIdMapPayloadForSegment(ctx, restoredTS) -} - -func (rc *LogClient) loadPitrIdMapPayloadFromTable( - ctx context.Context, - restoredTS uint64, - restoreID uint64, -) (*backuppb.PitrIdMapPayload, bool, error) { - restoreID = rc.normalizePitrIdMapRestoreID(restoreID) - metaData, found, err := rc.loadPitrIdMapDataFromTable(ctx, restoredTS, restoreID) - if err != nil { - return nil, false, errors.Trace(err) - } - if !found { - return nil, false, nil - } - payload := &backuppb.PitrIdMapPayload{} - if err := payload.Unmarshal(metaData); err != nil { - return nil, false, errors.Trace(err) - } - return payload, true, nil -} - -func (rc *LogClient) savePitrIdMapPayloadToTable( - ctx context.Context, - restoredTS uint64, - payload *backuppb.PitrIdMapPayload, -) error { - if payload == nil { - return errors.New("pitr id map payload is nil") - } - data, err := proto.Marshal(payload) - if err != nil { - return errors.Trace(err) - } - - hasRestoreIDColumn := rc.pitrIDMapHasRestoreIDColumn() - if hasRestoreIDColumn { - err = rc.unsafeSession.ExecuteInternal(ctx, - "DELETE FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %? 
and restore_id = %?;", - restoredTS, rc.upstreamClusterID, rc.restoreID) - if err != nil { - return errors.Trace(err) - } - replacePitrIDMapSQL := "REPLACE INTO mysql.tidb_pitr_id_map (restore_id, restored_ts, upstream_cluster_id, segment_id, id_map) VALUES (%?, %?, %?, %?, %?);" - for startIdx, segmentID := 0, 0; startIdx < len(data); segmentID += 1 { - endIdx := min(startIdx+PITRIdMapBlockSize, len(data)) - err := rc.unsafeSession.ExecuteInternal(ctx, replacePitrIDMapSQL, rc.restoreID, restoredTS, rc.upstreamClusterID, segmentID, data[startIdx:endIdx]) - if err != nil { - return errors.Trace(err) - } - startIdx = endIdx - } - return nil - } - - log.Info("mysql.tidb_pitr_id_map table does not have restore_id column, using backward compatible mode") - err = rc.unsafeSession.ExecuteInternal(ctx, - "DELETE FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %?;", - restoredTS, rc.upstreamClusterID) - if err != nil { - return errors.Trace(err) - } - replacePitrIDMapSQL := "REPLACE INTO mysql.tidb_pitr_id_map (restored_ts, upstream_cluster_id, segment_id, id_map) VALUES (%?, %?, %?, %?);" - for startIdx, segmentID := 0, 0; startIdx < len(data); segmentID += 1 { - endIdx := min(startIdx+PITRIdMapBlockSize, len(data)) - err := rc.unsafeSession.ExecuteInternal(ctx, replacePitrIDMapSQL, restoredTS, rc.upstreamClusterID, segmentID, data[startIdx:endIdx]) - if err != nil { - return errors.Trace(err) - } - startIdx = endIdx - } - return nil -} diff --git a/br/pkg/task/operator/checksum_table.go b/br/pkg/task/operator/checksum_table.go index 8b578182915d9..592ab34caccbb 100644 --- a/br/pkg/task/operator/checksum_table.go +++ b/br/pkg/task/operator/checksum_table.go @@ -258,12 +258,11 @@ func (c *checksumTableCtx) loadPitrIdMap(ctx context.Context, g glue.Glue, resto for _, row := range rows { restoreID, elementID, data := getRowColumns(row) if lastRestoreID != restoreID { - payload := &backup.PitrIdMapPayload{} - if err := payload.Unmarshal(metaData); err != nil { + backupMeta := &backup.BackupMeta{} + if err := backupMeta.Unmarshal(metaData); err != nil { return nil, errors.Trace(err) } - dbMaps := payload.GetDbMaps() - pitrDBMap = append(pitrDBMap, dbMaps...) + pitrDBMap = append(pitrDBMap, backupMeta.DbMaps...) metaData = make([]byte, 0) lastRestoreID = restoreID nextSegmentID = uint64(0) @@ -278,12 +277,11 @@ func (c *checksumTableCtx) loadPitrIdMap(ctx context.Context, g glue.Glue, resto nextSegmentID += 1 } if len(metaData) > 0 { - payload := &backup.PitrIdMapPayload{} - if err := payload.Unmarshal(metaData); err != nil { + backupMeta := &backup.BackupMeta{} + if err := backupMeta.Unmarshal(metaData); err != nil { return nil, errors.Trace(err) } - dbMaps := payload.GetDbMaps() - pitrDBMap = append(pitrDBMap, dbMaps...) + pitrDBMap = append(pitrDBMap, backupMeta.DbMaps...) } return pitrDBMap, nil } diff --git a/br/pkg/task/restore.go b/br/pkg/task/restore.go index 93175850324a2..3916e4292fcca 100644 --- a/br/pkg/task/restore.go +++ b/br/pkg/task/restore.go @@ -105,8 +105,6 @@ const ( // FlagStreamStartTS and FlagStreamRestoreTS is used for log restore timestamp range. FlagStreamStartTS = "start-ts" FlagStreamRestoreTS = "restored-ts" - // FlagStreamLast is used for log restore, represents restore to the last available TS. - FlagStreamLast = "last" // FlagStreamFullBackupStorage is used for log restore, represents the full backup storage. 
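Aside (not part of the patch): a standalone Go sketch of the restore_id resolution rule implemented by resolvePitrIdMapRestoreID above — the current segment reads back the map it wrote under its own restore_id, while a previous segment (looked up by the task's start-ts) falls back to the newest restore_id recorded for that restored_ts. latestRestoreIDFor is a hypothetical stand-in for the SQL lookup.

package main

import "fmt"

func resolveRestoreID(
	restoredTS, currentRestoreTS, currentRestoreID uint64,
	latestRestoreIDFor func(restoredTS uint64) (uint64, bool),
) (uint64, bool) {
	if restoredTS == currentRestoreTS {
		// reading the id map this restore wrote itself
		return currentRestoreID, true
	}
	// reading the map left by a previous segment: take the newest restore_id
	return latestRestoreIDFor(restoredTS)
}

func main() {
	latest := func(ts uint64) (uint64, bool) {
		if ts == 100 {
			return 7, true
		}
		return 0, false
	}
	fmt.Println(resolveRestoreID(200, 200, 9, latest)) // 9 true: current segment
	fmt.Println(resolveRestoreID(100, 200, 9, latest)) // 7 true: previous segment, newest restore_id
	fmt.Println(resolveRestoreID(50, 200, 9, latest))  // 0 false: nothing recorded for that ts
}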
FlagStreamFullBackupStorage = "full-backup-storage" // FlagPiTRBatchCount and FlagPiTRBatchSize are used for restore log with batch method. @@ -280,10 +278,6 @@ type RestoreConfig struct { RestoreTS uint64 `json:"restore-ts" toml:"restore-ts"` // whether RestoreTS was explicitly specified by user vs auto-detected IsRestoredTSUserSpecified bool `json:"-" toml:"-"` - // LastRestore represents whether restore is the last one. - LastRestore bool `json:"last" toml:"last"` - // whether LastRestore was explicitly specified by user vs default - IsLastRestoreUserSpecified bool `json:"-" toml:"-"` // rewriteTS is the rewritten timestamp of meta kvs. RewriteTS uint64 `json:"-" toml:"-"` tiflashRecorder *tiflashrec.TiFlashRecorder `json:"-" toml:"-"` @@ -327,10 +321,6 @@ func (cfg *RestoreConfig) LocalEncryptionEnabled() bool { return cfg.CipherInfo.CipherType != encryptionpb.EncryptionMethod_PLAINTEXT } -func (cfg *RestoreConfig) HasFullBackupStorage() bool { - return len(cfg.FullBackupStorage) > 0 -} - type immutableRestoreConfig struct { CmdName string UpstreamClusterID uint64 @@ -393,7 +383,6 @@ func DefineStreamRestoreFlags(command *cobra.Command) { "support TSO or datetime, e.g. '400036290571534337' or '2018-05-11 01:42:23+0800'") command.Flags().String(FlagStreamRestoreTS, "", "the point of restore, used for log restore.\n"+ "support TSO or datetime, e.g. '400036290571534337' or '2018-05-11 01:42:23+0800'") - command.Flags().Bool(FlagStreamLast, true, "restore to the last available commit timestamp") command.Flags().String(FlagStreamFullBackupStorage, "", "specify the backup full storage. "+ "fill it if want restore full backup before restore log.") command.Flags().Uint32(FlagPiTRBatchCount, defaultPiTRBatchCount, "specify the batch count to restore log.") @@ -421,13 +410,6 @@ func (cfg *RestoreConfig) ParseStreamRestoreFlags(flags *pflag.FlagSet) error { // check if RestoreTS was explicitly specified by user cfg.IsRestoredTSUserSpecified = flags.Changed(FlagStreamRestoreTS) - // check if LastRestore was explicitly specified by user - cfg.IsLastRestoreUserSpecified = flags.Changed(FlagStreamLast) - cfg.LastRestore, err = flags.GetBool(FlagStreamLast) - if err != nil { - return errors.Trace(err) - } - if cfg.FullBackupStorage, err = flags.GetString(FlagStreamFullBackupStorage); err != nil { return errors.Trace(err) } diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index f2606be23ad3d..611d7d391b2c2 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -1373,12 +1373,6 @@ func RunStreamRestore( if err := checkLogRange(cfg.StartTS, cfg.RestoreTS, logInfo.logMinTS, logInfo.logMaxTS); err != nil { return errors.Trace(err) } - if cfg.LastRestore && !cfg.IsLastRestoreUserSpecified && cfg.IsRestoredTSUserSpecified && cfg.RestoreTS < logInfo.logMaxTS { - log.Info("restore-ts is before log max and --last not specified; treating as non-final segment", - zap.Uint64("restore-ts", cfg.RestoreTS), - zap.Uint64("log-max-ts", logInfo.logMaxTS)) - cfg.LastRestore = false - } // register task if needed // will potentially override restoredTS @@ -1500,7 +1494,6 @@ func restoreStream( checkpointTotalSize uint64 currentTS uint64 extraFields []zapcore.Field - ingestItemsForNextSeg map[int64]map[int64]bool mu sync.Mutex startTime = time.Now() ) @@ -1543,10 +1536,6 @@ func restoreStream( } client := cfg.logClient - if !cfg.LastRestore && cfg.logCheckpointMetaManager == nil { - return errors.Annotatef(berrors.ErrInvalidArgument, - "segmented log restore requires checkpoint storage (table or 
external), enable checkpoint or configure --checkpoint-storage") - } migs, err := client.GetLockedMigrations(ctx) if err != nil { return errors.Trace(err) @@ -1628,45 +1617,16 @@ func restoreStream( } } - savedIDMap := isCurrentIdMapSaved(cfg.checkpointTaskInfo) - segmentedPayloads, savedIDMap, err := loadSegmentedRestorePayloads(ctx, client, cfg, savedIDMap) - if err != nil { - return errors.Trace(err) - } - if !cfg.HasFullBackupStorage() && segmentedPayloads.previous == nil { - return errors.Annotatef(berrors.ErrInvalidArgument, - "missing payload for previous segment; start-ts=%d", cfg.StartTS) - } - if err := buildIDMapWithPayloads(ctx, client, cfg, segmentedPayloads, savedIDMap); err != nil { + // build and save id map + if err := buildAndSaveIDMapIfNeeded(ctx, client, cfg); err != nil { return errors.Trace(err) } - if cfg.tiflashRecorder != nil && !cfg.HasFullBackupStorage() { - items, err := loadSegmentedTiFlashItems(segmentedPayloads.previous) - if err != nil { - return errors.Trace(err) - } - cfg.tiflashRecorder.Load(items) - log.Info("loaded tiflash items for previous segment", - zap.Uint64("start-ts", cfg.StartTS), - zap.Int("item-count", len(items))) - } // build schema replace schemasReplace, err := buildSchemaReplace(client, cfg) if err != nil { return errors.Trace(err) } - if recorder := schemasReplace.GetIngestRecorder(); recorder != nil && !cfg.HasFullBackupStorage() { - items, err := loadSegmentedIngestItems(segmentedPayloads.previous) - if err != nil { - return errors.Trace(err) - } - recorder.MergeItems(items) - log.Info("loaded ingest items for previous segment", - zap.Uint64("start-ts", cfg.StartTS), - zap.Int("table-count", len(items)), - zap.Int("index-count", ingestrec.CountItems(items))) - } importModeSwitcher := restore.NewImportModeSwitcher(mgr.GetPDClient(), cfg.Config.SwitchModeInterval, mgr.GetTLSConfig()) @@ -1719,9 +1679,6 @@ func restoreStream( rewriteRules := buildRewriteRules(schemasReplace) ingestRecorder := schemasReplace.GetIngestRecorder() - if !cfg.LastRestore { - ingestItemsForNextSeg = ingestRecorder.ExportItems() - } if err := rangeFilterFromIngestRecorder(ingestRecorder, rewriteRules); err != nil { return errors.Trace(err) } @@ -1834,33 +1791,19 @@ func restoreStream( } // index ingestion is not captured by regular log backup, so we need to manually ingest again - if cfg.LastRestore { - if err = client.RepairIngestIndex(ctx, ingestRecorder, cfg.logCheckpointMetaManager, g); err != nil { - return errors.Annotate(err, "failed to repair ingest index") - } - } else { - log.Info("skip repairing ingest index until last segment", - zap.Uint64("restored-ts", cfg.RestoreTS)) + if err = client.RepairIngestIndex(ctx, ingestRecorder, cfg.logCheckpointMetaManager, g); err != nil { + return errors.Annotate(err, "failed to repair ingest index") } if cfg.tiflashRecorder != nil { - if !cfg.LastRestore { - log.Info("skip restoring TiFlash Replica until last segment", - zap.Uint64("restored-ts", cfg.RestoreTS)) - } else { - sqls := cfg.tiflashRecorder.GenerateAlterTableDDLs(mgr.GetDomain().InfoSchema()) - log.Info("Generating SQLs for restoring TiFlash Replica", - zap.Strings("sqls", sqls)) - if err := client.ResetTiflashReplicas(ctx, sqls, g); err != nil { - return errors.Annotate(err, "failed to reset tiflash replicas") - } + sqls := cfg.tiflashRecorder.GenerateAlterTableDDLs(mgr.GetDomain().InfoSchema()) + log.Info("Generating SQLs for restoring TiFlash Replica", + zap.Strings("sqls", sqls)) + if err := client.ResetTiflashReplicas(ctx, sqls, g); err != nil { + 
return errors.Annotate(err, "failed to reset tiflash replicas") } } - if err := persistSegmentedRestorePayload(ctx, client, cfg, segmentedPayloads, ingestItemsForNextSeg); err != nil { - return errors.Annotate(err, "failed to persist segmented restore payload") - } - failpoint.Inject("do-checksum-with-rewrite-rules", func(_ failpoint.Value) { if err := client.FailpointDoChecksumForLogRestore(ctx, mgr.GetStorage().GetClient(), mgr.GetPDClient(), rewriteRules); err != nil { failpoint.Return(errors.Annotate(err, "failed to do checksum")) @@ -1899,7 +1842,6 @@ func createLogClient(ctx context.Context, g glue.Glue, cfg *RestoreConfig, mgr * } client.SetCrypter(&cfg.CipherInfo) client.SetUpstreamClusterID(cfg.UpstreamClusterID) - client.SetRestoreToLast(cfg.LastRestore) err = client.InitClients(ctx, u, cfg.logCheckpointMetaManager, cfg.sstCheckpointMetaManager, uint(cfg.PitrConcurrency), cfg.ConcurrencyPerStore.Value) if err != nil { @@ -2272,85 +2214,17 @@ func buildSchemaReplace(client *logclient.LogClient, cfg *LogRestoreConfig) (*st return schemasReplace, nil } -type segmentedRestorePayloads struct { - previous *backuppb.PitrIdMapPayload - current *backuppb.PitrIdMapPayload -} - -func loadSegmentedRestorePayloads( - ctx context.Context, - client *logclient.LogClient, - cfg *LogRestoreConfig, - savedIDMap bool, -) (*segmentedRestorePayloads, bool, error) { - payloads := &segmentedRestorePayloads{} - if savedIDMap { - payload, found, err := client.LoadPitrIdMapPayloadForSegment(ctx, cfg.RestoreTS) - if err != nil { - return nil, false, errors.Trace(err) - } - if found { - payloads.current = payload - } else { - log.Warn("checkpoint indicates id map saved but payload not found, rebuild it", - zap.Uint64("restore-ts", cfg.RestoreTS), - zap.Uint64("restore-id", cfg.RestoreID)) - savedIDMap = false - } - } - - if len(cfg.FullBackupStorage) == 0 { - payload, found, err := client.LoadPitrIdMapPayloadForSegment(ctx, cfg.StartTS) - if err != nil { - return nil, false, errors.Trace(err) - } - if found { - payloads.previous = payload - } - } - - return payloads, savedIDMap, nil -} - -func buildIDMapWithPayloads( - ctx context.Context, - client *logclient.LogClient, - cfg *LogRestoreConfig, - payloads *segmentedRestorePayloads, - savedIDMap bool, -) error { +func buildAndSaveIDMapIfNeeded(ctx context.Context, client *logclient.LogClient, cfg *LogRestoreConfig) error { + // get the schemas ID replace information. 
+ saved := isCurrentIdMapSaved(cfg.checkpointTaskInfo) hasFullBackupStorage := len(cfg.FullBackupStorage) != 0 - var ( - dbMaps []*backuppb.PitrDBMap - usePrevMap bool - ) - if payloads.current != nil && len(payloads.current.DbMaps) > 0 { - dbMaps = payloads.current.DbMaps - } - if len(dbMaps) == 0 && !hasFullBackupStorage && payloads.previous != nil { - dbMaps = payloads.previous.DbMaps - usePrevMap = true - } - if len(dbMaps) == 0 && !hasFullBackupStorage { - log.Error("no id maps found") - return errors.New("no base id map found from saved id or last restored PiTR") - } - if len(dbMaps) > 0 { - dbReplaces := stream.FromDBMapProto(dbMaps) - stream.LogDBReplaceMap("base db replace info", dbReplaces) - if len(dbReplaces) != 0 { - cfg.tableMappingManager.SetFromPiTRIDMap() - cfg.tableMappingManager.MergeBaseDBReplace(dbReplaces) - } - } - - if usePrevMap { - if err := client.ValidateNoTiFlashReplica(); err != nil { - return errors.Trace(err) - } + err := client.GetBaseIDMapAndMerge(ctx, hasFullBackupStorage, saved, + cfg.logCheckpointMetaManager, cfg.tableMappingManager) + if err != nil { + return errors.Trace(err) } - if savedIDMap { + if saved { return nil } @@ -2362,52 +2236,14 @@ func buildIDMapWithPayloads( // reuse existing database ids if it exists in the current cluster cfg.tableMappingManager.ReuseExistingDatabaseIDs(client.GetDomain().InfoSchema()) // replace temp id with read global id - if err := cfg.tableMappingManager.ReplaceTemporaryIDs(ctx, client.GenGlobalIDs); err != nil { + err = cfg.tableMappingManager.ReplaceTemporaryIDs(ctx, client.GenGlobalIDs) + if err != nil { return errors.Trace(err) } - payloads.current = &backuppb.PitrIdMapPayload{DbMaps: cfg.tableMappingManager.ToProto()} - return nil -} - -func loadSegmentedTiFlashItems( - payload *backuppb.PitrIdMapPayload, -) (map[int64]model.TiFlashReplicaInfo, error) { - if payload == nil { - return nil, errors.New("pitr id map payload is nil") - } - return logclient.PitrTiFlashItemsFromProto(payload.TiflashItems), nil -} - -func loadSegmentedIngestItems( - payload *backuppb.PitrIdMapPayload, -) (map[int64]map[int64]bool, error) { - if payload == nil { - return nil, errors.New("pitr id map payload is nil") - } - return logclient.PitrIngestItemsFromProto(payload.IngestItems), nil -} - -func persistSegmentedRestorePayload( - ctx context.Context, - client *logclient.LogClient, - cfg *LogRestoreConfig, - payloads *segmentedRestorePayloads, - ingestItemsForNextSeg map[int64]map[int64]bool, -) error { - if payloads == nil || payloads.current == nil { - payloads = &segmentedRestorePayloads{ - current: &backuppb.PitrIdMapPayload{DbMaps: cfg.tableMappingManager.ToProto()}, - } - } - if !cfg.LastRestore { - if ingestItemsForNextSeg != nil { - payloads.current.IngestItems = logclient.PitrIngestItemsToProto(ingestItemsForNextSeg) - } - if cfg.tiflashRecorder != nil { - payloads.current.TiflashItems = logclient.PitrTiFlashItemsToProto(cfg.tiflashRecorder.GetItems()) - } + if err = client.SaveIdMapWithFailPoints(ctx, cfg.tableMappingManager, cfg.logCheckpointMetaManager); err != nil { + return errors.Trace(err) } - return errors.Trace(client.SavePitrIdMapPayloadWithFailPoints(ctx, cfg.RestoreTS, payloads.current, cfg.logCheckpointMetaManager)) + return nil } func getCurrentTSFromCheckpointOrPD(ctx context.Context, mgr *conn.Mgr, cfg *LogRestoreConfig) (uint64, error) { diff --git a/br/tests/br_restore_checkpoint/run.sh b/br/tests/br_restore_checkpoint/run.sh index 4310a9b4a7e8a..6c378db0e6ca7 100644 --- 
a/br/tests/br_restore_checkpoint/run.sh +++ b/br/tests/br_restore_checkpoint/run.sh @@ -85,7 +85,7 @@ run_sql "select count(*) from \`$latest_db\`.\`cpt_data\`;" check_contains "count(*): 1" # check the log restore save id map into the table mysql.tidb_pitr_id_map -run_sql "select count(*) from mysql.tidb_pitr_id_map;" +run_sql 'select count(*) from mysql.tidb_pitr_id_map;' check_contains "count(*): 1" # PITR with checkpoint but failed in the log restore datakv stage @@ -129,7 +129,7 @@ check_result() { check_result # check mysql.tidb_pitr_id_map has data -count=$(run_sql "select count(*) from mysql.tidb_pitr_id_map;" | awk '/count/{print $2}') +count=$(run_sql 'select count(*) from mysql.tidb_pitr_id_map;' | awk '/count/{print $2}') if [ $count -eq 0 ]; then echo "the number of pitr id map is $count" exit 1 @@ -170,17 +170,14 @@ if [ $restore_fail -ne 1 ]; then exit 1 fi -# check the pitr id map is saved in the checkpoint storage and system table +# check the pitr id map is saved in the checkpoint storage count=$(ls $TEST_DIR/$PREFIX/log/pitr_id_maps | wc -l) if [ $count -ne 0 ]; then echo "the number of pitr id map is $count instead of 0" exit 1 fi -count=$(run_sql "select count(*) from mysql.tidb_pitr_id_map;" | awk '/count/{print $2}') -if [ $count -eq 0 ]; then - echo "the number of pitr id map is $count" - exit 1 -fi +run_sql 'select count(*) from mysql.tidb_pitr_id_map;' +check_contains "count(*): 0" count=$(ls $TEST_DIR/$PREFIX/checkpoints/pitr_id_maps | wc -l) if [ $count -ne 1 ]; then echo "the number of pitr id map is $count instead of 1" From 33457da5e649334e42cbf8e35d32902234303b27 Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Tue, 3 Feb 2026 17:22:14 +0000 Subject: [PATCH 17/18] reimplement segmented restore Signed-off-by: Juncen Yu --- br/pkg/restore/ingestrec/ingest_recorder.go | 68 ++++++++ br/pkg/restore/log_client/client.go | 28 ++-- br/pkg/restore/log_client/client_test.go | 6 +- br/pkg/restore/log_client/export_test.go | 9 +- br/pkg/restore/log_client/id_map.go | 157 +++++++++++++------ br/pkg/restore/log_client/segmented_state.go | 93 +++++++++++ br/pkg/task/restore.go | 23 ++- br/pkg/task/stream.go | 51 ++++-- 8 files changed, 355 insertions(+), 80 deletions(-) create mode 100644 br/pkg/restore/log_client/segmented_state.go diff --git a/br/pkg/restore/ingestrec/ingest_recorder.go b/br/pkg/restore/ingestrec/ingest_recorder.go index f406d36b0d680..772c18635476b 100644 --- a/br/pkg/restore/ingestrec/ingest_recorder.go +++ b/br/pkg/restore/ingestrec/ingest_recorder.go @@ -247,3 +247,71 @@ func (i *IngestRecorder) IterateForeignKeys(f func(*ForeignKeyRecord) error) err } return nil } + +// RecorderState is a serializable snapshot of ingest recorder data. +type RecorderState struct { + Items map[int64]map[int64]IndexState `json:"items,omitempty"` +} + +// IndexState is a minimal representation of an ingested index. +type IndexState struct { + IsPrimary bool `json:"is_primary,omitempty"` +} + +// ExportState returns a snapshot of the ingest recorder state. 
+func (i *IngestRecorder) ExportState() *RecorderState { + if i == nil || len(i.items) == 0 { + return nil + } + state := &RecorderState{ + Items: make(map[int64]map[int64]IndexState, len(i.items)), + } + for tableID, indexes := range i.items { + if len(indexes) == 0 { + continue + } + tableIndexes := make(map[int64]IndexState, len(indexes)) + for indexID, info := range indexes { + if info == nil { + continue + } + tableIndexes[indexID] = IndexState{IsPrimary: info.IsPrimary} + } + if len(tableIndexes) > 0 { + state.Items[tableID] = tableIndexes + } + } + if len(state.Items) == 0 { + return nil + } + return state +} + +// MergeState merges a snapshot into the ingest recorder. +func (i *IngestRecorder) MergeState(state *RecorderState) { + if i == nil || state == nil || len(state.Items) == 0 { + return + } + if i.items == nil { + i.items = make(map[int64]map[int64]*IngestIndexInfo) + } + for tableID, indexes := range state.Items { + if len(indexes) == 0 { + continue + } + tableIndexes, exists := i.items[tableID] + if !exists { + tableIndexes = make(map[int64]*IngestIndexInfo, len(indexes)) + i.items[tableID] = tableIndexes + } + for indexID, info := range indexes { + if _, ok := tableIndexes[indexID]; ok { + continue + } + tableIndexes[indexID] = &IngestIndexInfo{ + IsPrimary: info.IsPrimary, + Updated: false, + } + } + } +} diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index 07d77db537a56..5f5988f48133a 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -1026,9 +1026,10 @@ func (rc *LogClient) GetBaseIDMapAndMerge( loadSavedIDMap bool, logCheckpointMetaManager checkpoint.LogMetaManagerT, tableMappingManager *stream.TableMappingManager, -) error { +) (*SegmentedPiTRState, error) { var ( err error + state *SegmentedPiTRState dbMaps []*backuppb.PitrDBMap dbReplaces map[stream.UpstreamID]*stream.DBReplace ) @@ -1036,9 +1037,12 @@ func (rc *LogClient) GetBaseIDMapAndMerge( // this is a retry, id map saved last time, load it from external storage if loadSavedIDMap { log.Info("try to load previously saved pitr id maps") - dbMaps, err = rc.loadSchemasMap(ctx, rc.restoreTS, logCheckpointMetaManager) + state, err = rc.loadSegmentedPiTRState(ctx, rc.restoreTS, logCheckpointMetaManager, true) if err != nil { - return errors.Trace(err) + return nil, errors.Trace(err) + } + if state != nil { + dbMaps = state.DbMaps } } @@ -1046,19 +1050,23 @@ func (rc *LogClient) GetBaseIDMapAndMerge( // schemas map whose `restore-ts`` is the task's `start-ts`. 
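Aside (not part of the patch): a standalone Go sketch of how the RecorderState snapshot added in ingest_recorder.go above survives a segment boundary — it is a plain JSON-serializable value (which is how segmented_state.go embeds it), exported on an intermediate segment and merged back on the next one via (*IngestRecorder).MergeState on the recorder obtained from the schema replace. Only the round trip is shown here.

package main

import (
	"encoding/json"
	"fmt"

	"github.com/pingcap/tidb/br/pkg/restore/ingestrec"
)

func main() {
	// snapshot exported by an intermediate segment (the shape ExportState returns)
	exported := &ingestrec.RecorderState{
		Items: map[int64]map[int64]ingestrec.IndexState{
			10: {1: {IsPrimary: true}, 2: {IsPrimary: false}},
		},
	}

	// persisted as part of the segmented PiTR state
	raw, err := json.Marshal(exported)
	if err != nil {
		panic(err)
	}

	// reloaded by the next segment before it calls MergeState on its recorder
	var reloaded ingestrec.RecorderState
	if err := json.Unmarshal(raw, &reloaded); err != nil {
		panic(err)
	}
	fmt.Println(string(raw), len(reloaded.Items))
}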
if len(dbMaps) <= 0 && !hasFullBackupStorageConfig { log.Info("try to load pitr id maps of the previous task", zap.Uint64("start-ts", rc.startTS)) - dbMaps, err = rc.loadSchemasMap(ctx, rc.startTS, logCheckpointMetaManager) + state, err = rc.loadSegmentedPiTRState(ctx, rc.startTS, logCheckpointMetaManager, false) if err != nil { - return errors.Trace(err) + return nil, errors.Trace(err) } - err := rc.validateNoTiFlashReplica() - if err != nil { - return errors.Trace(err) + if state != nil { + dbMaps = state.DbMaps + } + if len(dbMaps) > 0 { + if err := rc.validateNoTiFlashReplica(); err != nil { + return nil, errors.Trace(err) + } } } if len(dbMaps) <= 0 && !hasFullBackupStorageConfig { log.Error("no id maps found") - return errors.New("no base id map found from saved id or last restored PiTR") + return nil, errors.New("no base id map found from saved id or last restored PiTR") } dbReplaces = stream.FromDBMapProto(dbMaps) @@ -1067,7 +1075,7 @@ func (rc *LogClient) GetBaseIDMapAndMerge( tableMappingManager.SetFromPiTRIDMap() tableMappingManager.MergeBaseDBReplace(dbReplaces) } - return nil + return state, nil } func SortMetaKVFiles(files []*backuppb.DataFileInfo) []*backuppb.DataFileInfo { diff --git a/br/pkg/restore/log_client/client_test.go b/br/pkg/restore/log_client/client_test.go index 2563510038b09..64f875af2b2c5 100644 --- a/br/pkg/restore/log_client/client_test.go +++ b/br/pkg/restore/log_client/client_test.go @@ -1346,7 +1346,7 @@ func TestInitSchemasReplaceForDDL(t *testing.T) { require.NoError(t, err) err = stg.WriteFile(ctx, logclient.PitrIDMapsFilename(123, 1), []byte("123")) require.NoError(t, err) - err = client.GetBaseIDMapAndMerge(ctx, false, false, nil, stream.NewTableMappingManager()) + _, err = client.GetBaseIDMapAndMerge(ctx, false, false, nil, stream.NewTableMappingManager()) require.Error(t, err) require.Contains(t, err.Error(), "proto: wrong") err = stg.DeleteFile(ctx, logclient.PitrIDMapsFilename(123, 1)) @@ -1358,7 +1358,7 @@ func TestInitSchemasReplaceForDDL(t *testing.T) { client.SetStorage(ctx, backend, nil) err := stg.WriteFile(ctx, logclient.PitrIDMapsFilename(123, 2), []byte("123")) require.NoError(t, err) - err = client.GetBaseIDMapAndMerge(ctx, false, true, nil, stream.NewTableMappingManager()) + _, err = client.GetBaseIDMapAndMerge(ctx, false, true, nil, stream.NewTableMappingManager()) require.Error(t, err) require.Contains(t, err.Error(), "proto: wrong") err = stg.DeleteFile(ctx, logclient.PitrIDMapsFilename(123, 2)) @@ -1373,7 +1373,7 @@ func TestInitSchemasReplaceForDDL(t *testing.T) { se, err := g.CreateSession(s.Mock.Storage) require.NoError(t, err) client := logclient.TEST_NewLogClient(123, 1, 2, 1, s.Mock.Domain, se) - err = client.GetBaseIDMapAndMerge(ctx, false, true, nil, stream.NewTableMappingManager()) + _, err = client.GetBaseIDMapAndMerge(ctx, false, true, nil, stream.NewTableMappingManager()) require.Error(t, err) require.Contains(t, err.Error(), "no base id map found from saved id or last restored PiTR") } diff --git a/br/pkg/restore/log_client/export_test.go b/br/pkg/restore/log_client/export_test.go index db5104b59d9b0..743b2d97cab37 100644 --- a/br/pkg/restore/log_client/export_test.go +++ b/br/pkg/restore/log_client/export_test.go @@ -77,7 +77,14 @@ func (rc *LogClient) TEST_initSchemasMap( restoreTS uint64, logCheckpointMetaManager checkpoint.LogMetaManagerT, ) ([]*backuppb.PitrDBMap, error) { - return rc.loadSchemasMap(ctx, restoreTS, logCheckpointMetaManager) + state, err := rc.loadSegmentedPiTRState(ctx, restoreTS, 
logCheckpointMetaManager, true) + if err != nil { + return nil, err + } + if state == nil { + return nil, nil + } + return state.DbMaps, nil } // readStreamMetaByTS is used for streaming task. collect all meta file by TS, it is for test usage. diff --git a/br/pkg/restore/log_client/id_map.go b/br/pkg/restore/log_client/id_map.go index 0ad5bb92ee9d0..1d2df0e2291de 100644 --- a/br/pkg/restore/log_client/id_map.go +++ b/br/pkg/restore/log_client/id_map.go @@ -23,7 +23,6 @@ import ( backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/checkpoint" - "github.com/pingcap/tidb/br/pkg/metautil" "github.com/pingcap/tidb/br/pkg/restore" "github.com/pingcap/tidb/br/pkg/stream" "github.com/pingcap/tidb/pkg/kv" @@ -62,52 +61,90 @@ func (rc *LogClient) saveIDMap( manager *stream.TableMappingManager, logCheckpointMetaManager checkpoint.LogMetaManagerT, ) error { - dbmaps := manager.ToProto() + state := &SegmentedPiTRState{ + DbMaps: manager.ToProto(), + } + existingState, err := rc.loadSegmentedPiTRState(ctx, rc.restoreTS, logCheckpointMetaManager, true) + if err != nil { + return errors.Trace(err) + } + if existingState != nil { + state.TiFlashItems = existingState.TiFlashItems + state.IngestRecorderState = existingState.IngestRecorderState + } + return rc.saveSegmentedPiTRState(ctx, state, logCheckpointMetaManager) +} + +// SaveSegmentedPiTRState saves segmented PiTR state for later restores. +func (rc *LogClient) SaveSegmentedPiTRState( + ctx context.Context, + state *SegmentedPiTRState, + logCheckpointMetaManager checkpoint.LogMetaManagerT, +) error { + return rc.saveSegmentedPiTRState(ctx, state, logCheckpointMetaManager) +} + +func (rc *LogClient) saveSegmentedPiTRState( + ctx context.Context, + state *SegmentedPiTRState, + logCheckpointMetaManager checkpoint.LogMetaManagerT, +) error { + if state == nil { + return errors.New("segmented pitr state is nil") + } + pbState, err := state.toProto() + if err != nil { + return errors.Trace(err) + } if checkpointStorage := rc.tryGetCheckpointStorage(logCheckpointMetaManager); checkpointStorage != nil { - log.Info("checkpoint storage is specified, load pitr id map from the checkpoint storage.") - if err := rc.saveIDMap2Storage(ctx, checkpointStorage, dbmaps); err != nil { + log.Info("checkpoint storage is specified, save pitr id map to the checkpoint storage.") + if err := rc.saveSegmentedPiTRStateToStorage(ctx, checkpointStorage, pbState); err != nil { return errors.Trace(err) } } else if rc.pitrIDMapTableExists() { - if err := rc.saveIDMap2Table(ctx, dbmaps); err != nil { + if err := rc.saveSegmentedPiTRStateToTable(ctx, pbState); err != nil { return errors.Trace(err) } } else { log.Info("the table mysql.tidb_pitr_id_map does not exist, maybe the cluster version is old.") - if err := rc.saveIDMap2Storage(ctx, rc.storage, dbmaps); err != nil { + if err := rc.saveSegmentedPiTRStateToStorage(ctx, rc.storage, pbState); err != nil { return errors.Trace(err) } } if rc.useCheckpoint { - log.Info("save checkpoint task info with InLogRestoreAndIdMapPersist status") - if err := logCheckpointMetaManager.SaveCheckpointProgress(ctx, &checkpoint.CheckpointProgress{ - Progress: checkpoint.InLogRestoreAndIdMapPersisted, - }); err != nil { + exists, err := logCheckpointMetaManager.ExistsCheckpointProgress(ctx) + if err != nil { return errors.Trace(err) } + if !exists { + log.Info("save checkpoint task info with InLogRestoreAndIdMapPersist status") + if err := logCheckpointMetaManager.SaveCheckpointProgress(ctx, 
&checkpoint.CheckpointProgress{ + Progress: checkpoint.InLogRestoreAndIdMapPersisted, + }); err != nil { + return errors.Trace(err) + } + } } return nil } -func (rc *LogClient) saveIDMap2Storage( +func (rc *LogClient) saveSegmentedPiTRStateToStorage( ctx context.Context, storage storeapi.Storage, - dbMaps []*backuppb.PitrDBMap, + state *backuppb.SegmentedPiTRState, ) error { clusterID := rc.GetClusterID(ctx) metaFileName := PitrIDMapsFilename(clusterID, rc.restoreTS) - metaWriter := metautil.NewMetaWriter(storage, metautil.MetaFileSize, false, metaFileName, nil) - metaWriter.Update(func(m *backuppb.BackupMeta) { - m.ClusterId = clusterID - m.DbMaps = dbMaps - }) - return metaWriter.FlushBackupMeta(ctx) + data, err := proto.Marshal(state) + if err != nil { + return errors.Trace(err) + } + return storage.WriteFile(ctx, metaFileName, data) } -func (rc *LogClient) saveIDMap2Table(ctx context.Context, dbMaps []*backuppb.PitrDBMap) error { - backupmeta := &backuppb.BackupMeta{DbMaps: dbMaps} - data, err := proto.Marshal(backupmeta) +func (rc *LogClient) saveSegmentedPiTRStateToTable(ctx context.Context, state *backuppb.SegmentedPiTRState) error { + data, err := proto.Marshal(state) if err != nil { return errors.Trace(err) } @@ -153,30 +190,31 @@ func (rc *LogClient) saveIDMap2Table(ctx context.Context, dbMaps []*backuppb.Pit return nil } -func (rc *LogClient) loadSchemasMap( +func (rc *LogClient) loadSegmentedPiTRState( ctx context.Context, restoredTS uint64, logCheckpointMetaManager checkpoint.LogMetaManagerT, -) ([]*backuppb.PitrDBMap, error) { + onlyThisRestore bool, +) (*SegmentedPiTRState, error) { if checkpointStorage := rc.tryGetCheckpointStorage(logCheckpointMetaManager); checkpointStorage != nil { log.Info("checkpoint storage is specified, load pitr id map from the checkpoint storage.") - dbMaps, err := rc.loadSchemasMapFromStorage(ctx, checkpointStorage, restoredTS) - return dbMaps, errors.Trace(err) + state, err := rc.loadSegmentedPiTRStateFromStorage(ctx, checkpointStorage, restoredTS) + return state, errors.Trace(err) } if rc.pitrIDMapTableExists() { - dbMaps, err := rc.loadSchemasMapFromTable(ctx, restoredTS) - return dbMaps, errors.Trace(err) + state, err := rc.loadSegmentedPiTRStateFromTable(ctx, restoredTS, onlyThisRestore) + return state, errors.Trace(err) } log.Info("the table mysql.tidb_pitr_id_map does not exist, maybe the cluster version is old.") - dbMaps, err := rc.loadSchemasMapFromStorage(ctx, rc.storage, restoredTS) - return dbMaps, errors.Trace(err) + state, err := rc.loadSegmentedPiTRStateFromStorage(ctx, rc.storage, restoredTS) + return state, errors.Trace(err) } -func (rc *LogClient) loadSchemasMapFromStorage( +func (rc *LogClient) loadSegmentedPiTRStateFromStorage( ctx context.Context, storage storeapi.Storage, restoredTS uint64, -) ([]*backuppb.PitrDBMap, error) { +) (*SegmentedPiTRState, error) { clusterID := rc.GetClusterID(ctx) metaFileName := PitrIDMapsFilename(clusterID, restoredTS) exist, err := storage.FileExists(ctx, metaFileName) @@ -192,26 +230,34 @@ func (rc *LogClient) loadSchemasMapFromStorage( if err != nil { return nil, errors.Trace(err) } - backupMeta := &backuppb.BackupMeta{} - if err := backupMeta.Unmarshal(metaData); err != nil { + state := &backuppb.SegmentedPiTRState{} + if err := state.Unmarshal(metaData); err != nil { return nil, errors.Trace(err) } - return backupMeta.GetDbMaps(), nil + return segmentedPiTRStateFromProto(state) } -func (rc *LogClient) loadSchemasMapFromTable( +func (rc *LogClient) loadSegmentedPiTRStateFromTable( ctx 
context.Context, restoredTS uint64, -) ([]*backuppb.PitrDBMap, error) { + onlyThisRestore bool, +) (*SegmentedPiTRState, error) { hasRestoreIDColumn := rc.pitrIDMapHasRestoreIDColumn() var getPitrIDMapSQL string var args []any + var withRestoreID bool if hasRestoreIDColumn { - // new version with restore_id column - getPitrIDMapSQL = "SELECT segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restore_id = %? and restored_ts = %? and upstream_cluster_id = %? ORDER BY segment_id;" - args = []any{rc.restoreID, restoredTS, rc.upstreamClusterID} + if onlyThisRestore { + // new version with restore_id column + getPitrIDMapSQL = "SELECT segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restore_id = %? and restored_ts = %? and upstream_cluster_id = %? ORDER BY segment_id;" + args = []any{rc.restoreID, restoredTS, rc.upstreamClusterID} + } else { + getPitrIDMapSQL = "SELECT restore_id, segment_id, id_map FROM mysql.tidb_pitr_id_map WHERE restored_ts = %? and upstream_cluster_id = %? ORDER BY restore_id, segment_id;" + args = []any{restoredTS, rc.upstreamClusterID} + withRestoreID = true + } } else { // old version without restore_id column log.Info("mysql.tidb_pitr_id_map table does not have restore_id column, using backward compatible mode") @@ -234,21 +280,38 @@ func (rc *LogClient) loadSchemasMapFromTable( return nil, nil } metaData := make([]byte, 0, len(rows)*PITRIdMapBlockSize) + var expectedSegmentID uint64 + var selectedRestoreID uint64 for i, row := range rows { - elementID := row.GetUint64(0) - if uint64(i) != elementID { + var elementID uint64 + var data []byte + if withRestoreID { + restoreID := row.GetUint64(0) + if i == 0 { + selectedRestoreID = restoreID + } else if restoreID != selectedRestoreID { + return nil, errors.Errorf("multiple restore_id values found for restored_ts=%d and upstream_cluster_id=%d: %d, %d", + restoredTS, rc.upstreamClusterID, selectedRestoreID, restoreID) + } + elementID = row.GetUint64(1) + data = row.GetBytes(2) + } else { + elementID = row.GetUint64(0) + data = row.GetBytes(1) + } + if expectedSegmentID != elementID { return nil, errors.Errorf("the part(segment_id = %d) of pitr id map is lost", i) } - d := row.GetBytes(1) - if len(d) == 0 { + if len(data) == 0 { return nil, errors.Errorf("get the empty part(segment_id = %d) of pitr id map", i) } - metaData = append(metaData, d...) + metaData = append(metaData, data...) + expectedSegmentID++ } - backupMeta := &backuppb.BackupMeta{} - if err := backupMeta.Unmarshal(metaData); err != nil { + state := &backuppb.SegmentedPiTRState{} + if err := state.Unmarshal(metaData); err != nil { return nil, errors.Trace(err) } - return backupMeta.GetDbMaps(), nil + return segmentedPiTRStateFromProto(state) } diff --git a/br/pkg/restore/log_client/segmented_state.go b/br/pkg/restore/log_client/segmented_state.go new file mode 100644 index 0000000000000..277ecafd2ff12 --- /dev/null +++ b/br/pkg/restore/log_client/segmented_state.go @@ -0,0 +1,93 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package logclient + +import ( + "encoding/json" + + "github.com/pingcap/errors" + backuppb "github.com/pingcap/kvproto/pkg/brpb" + "github.com/pingcap/tidb/br/pkg/restore/ingestrec" + "github.com/pingcap/tidb/pkg/meta/model" +) + +const segmentedPiTRStatePayloadVersion uint64 = 1 + +// SegmentedPiTRState is the decoded segmented restore state stored in tidb_pitr_id_map. +type SegmentedPiTRState struct { + DbMaps []*backuppb.PitrDBMap + TiFlashItems map[int64]model.TiFlashReplicaInfo + IngestRecorderState *ingestrec.RecorderState +} + +type segmentedPiTRStatePayload struct { + TiFlashItems map[int64]model.TiFlashReplicaInfo `json:"tiflash_items"` + IngestRecorderState *ingestrec.RecorderState `json:"ingest_recorder,omitempty"` +} + +func (s *SegmentedPiTRState) hasPayload() bool { + if s == nil { + return false + } + if s.TiFlashItems != nil { + return true + } + return s.IngestRecorderState != nil +} + +func (s *SegmentedPiTRState) toProto() (*backuppb.SegmentedPiTRState, error) { + if s == nil { + return nil, errors.New("segmented pitr state is nil") + } + state := &backuppb.SegmentedPiTRState{ + DbMaps: s.DbMaps, + } + if !s.hasPayload() { + return state, nil + } + payload := segmentedPiTRStatePayload{ + TiFlashItems: s.TiFlashItems, + IngestRecorderState: s.IngestRecorderState, + } + data, err := json.Marshal(payload) + if err != nil { + return nil, errors.Trace(err) + } + state.SegmentedPitrStateVer = segmentedPiTRStatePayloadVersion + state.SegmentedPitrState = [][]byte{data} + return state, nil +} + +func segmentedPiTRStateFromProto(state *backuppb.SegmentedPiTRState) (*SegmentedPiTRState, error) { + if state == nil { + return nil, nil + } + result := &SegmentedPiTRState{ + DbMaps: state.GetDbMaps(), + } + if state.GetSegmentedPitrStateVer() == 0 || len(state.GetSegmentedPitrState()) == 0 { + return result, nil + } + if state.GetSegmentedPitrStateVer() != segmentedPiTRStatePayloadVersion { + return nil, errors.Errorf("unsupported segmented pitr state version: %d", state.GetSegmentedPitrStateVer()) + } + var payload segmentedPiTRStatePayload + if err := json.Unmarshal(state.GetSegmentedPitrState()[0], &payload); err != nil { + return nil, errors.Trace(err) + } + result.TiFlashItems = payload.TiFlashItems + result.IngestRecorderState = payload.IngestRecorderState + return result, nil +} diff --git a/br/pkg/task/restore.go b/br/pkg/task/restore.go index 3916e4292fcca..d386933c17689 100644 --- a/br/pkg/task/restore.go +++ b/br/pkg/task/restore.go @@ -105,6 +105,8 @@ const ( // FlagStreamStartTS and FlagStreamRestoreTS is used for log restore timestamp range. FlagStreamStartTS = "start-ts" FlagStreamRestoreTS = "restored-ts" + // FlagLastSegment indicates whether this restore is the last segment. + FlagLastSegment = "last-segment" // FlagStreamFullBackupStorage is used for log restore, represents the full backup storage. FlagStreamFullBackupStorage = "full-backup-storage" // FlagPiTRBatchCount and FlagPiTRBatchSize are used for restore log with batch method. @@ -279,11 +281,13 @@ type RestoreConfig struct { // whether RestoreTS was explicitly specified by user vs auto-detected IsRestoredTSUserSpecified bool `json:"-" toml:"-"` // rewriteTS is the rewritten timestamp of meta kvs. 
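Aside (not part of the patch): a standalone Go sketch of the version gate used by segmented_state.go above when decoding the opaque JSON payload carried next to DbMaps — version 0 or an empty payload means "db maps only", the current version decodes, and any newer version is rejected rather than silently misread. The local wireState and payload types are stand-ins for the protobuf message.

package main

import (
	"encoding/json"
	"errors"
	"fmt"
)

const currentPayloadVersion uint64 = 1

type payload struct {
	Note string `json:"note"`
}

type wireState struct {
	payloadVer uint64
	payload    [][]byte
}

// decodePayload accepts older writers (no payload), the current version, and
// rejects anything newer so a future format cannot be misinterpreted.
func decodePayload(w wireState) (*payload, error) {
	if w.payloadVer == 0 || len(w.payload) == 0 {
		return nil, nil // old writer: nothing beyond the id maps
	}
	if w.payloadVer != currentPayloadVersion {
		return nil, fmt.Errorf("unsupported segmented pitr state version: %d", w.payloadVer)
	}
	var p payload
	if err := json.Unmarshal(w.payload[0], &p); err != nil {
		return nil, errors.New("corrupted segmented pitr state payload: " + err.Error())
	}
	return &p, nil
}

func main() {
	data, _ := json.Marshal(payload{Note: "tiflash and ingest snapshots"})
	fmt.Println(decodePayload(wireState{payloadVer: 1, payload: [][]byte{data}}))
	fmt.Println(decodePayload(wireState{}))                                      // nil, nil
	fmt.Println(decodePayload(wireState{payloadVer: 2, payload: [][]byte{data}})) // version error
}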
- RewriteTS uint64 `json:"-" toml:"-"` - tiflashRecorder *tiflashrec.TiFlashRecorder `json:"-" toml:"-"` - PitrBatchCount uint32 `json:"pitr-batch-count" toml:"pitr-batch-count"` - PitrBatchSize uint32 `json:"pitr-batch-size" toml:"pitr-batch-size"` - PitrConcurrency uint32 `json:"-" toml:"-"` + RewriteTS uint64 `json:"-" toml:"-"` + tiflashRecorder *tiflashrec.TiFlashRecorder `json:"-" toml:"-"` + LastRestore bool `json:"last-segment" toml:"last-segment"` + IsLastRestoreUserSpecified bool `json:"-" toml:"-"` + PitrBatchCount uint32 `json:"pitr-batch-count" toml:"pitr-batch-count"` + PitrBatchSize uint32 `json:"pitr-batch-size" toml:"pitr-batch-size"` + PitrConcurrency uint32 `json:"-" toml:"-"` UseCheckpoint bool `json:"use-checkpoint" toml:"use-checkpoint"` CheckpointStorage string `json:"checkpoint-storage" toml:"checkpoint-storage"` @@ -383,6 +387,7 @@ func DefineStreamRestoreFlags(command *cobra.Command) { "support TSO or datetime, e.g. '400036290571534337' or '2018-05-11 01:42:23+0800'") command.Flags().String(FlagStreamRestoreTS, "", "the point of restore, used for log restore.\n"+ "support TSO or datetime, e.g. '400036290571534337' or '2018-05-11 01:42:23+0800'") + command.Flags().Bool(FlagLastSegment, true, "whether this restore is the last segment of a segmented PiTR task") command.Flags().String(FlagStreamFullBackupStorage, "", "specify the backup full storage. "+ "fill it if want restore full backup before restore log.") command.Flags().Uint32(FlagPiTRBatchCount, defaultPiTRBatchCount, "specify the batch count to restore log.") @@ -406,6 +411,11 @@ func (cfg *RestoreConfig) ParseStreamRestoreFlags(flags *pflag.FlagSet) error { if cfg.RestoreTS, err = ParseTSString(tsString, true); err != nil { return errors.Trace(err) } + cfg.LastRestore, err = flags.GetBool(FlagLastSegment) + if err != nil { + return errors.Trace(err) + } + cfg.IsLastRestoreUserSpecified = flags.Changed(FlagLastSegment) // check if RestoreTS was explicitly specified by user cfg.IsRestoredTSUserSpecified = flags.Changed(FlagStreamRestoreTS) @@ -614,6 +624,9 @@ func (cfg *RestoreConfig) Adjust() { } func (cfg *RestoreConfig) adjustRestoreConfigForStreamRestore() { + if !cfg.IsLastRestoreUserSpecified { + cfg.LastRestore = true + } if cfg.PitrConcurrency == 0 { cfg.PitrConcurrency = defaultPiTRConcurrency } diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index 611d7d391b2c2..753b2c718b249 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -1478,6 +1478,7 @@ type LogRestoreConfig struct { tableMappingManager *stream.TableMappingManager logClient *logclient.LogClient ddlFiles []logclient.Log + ingestRecorderState *ingestrec.RecorderState } // restoreStream starts the log restore @@ -1627,6 +1628,9 @@ func restoreStream( if err != nil { return errors.Trace(err) } + if cfg.ingestRecorderState != nil { + schemasReplace.GetIngestRecorder().MergeState(cfg.ingestRecorderState) + } importModeSwitcher := restore.NewImportModeSwitcher(mgr.GetPDClient(), cfg.Config.SwitchModeInterval, mgr.GetTLSConfig()) @@ -1679,9 +1683,6 @@ func restoreStream( rewriteRules := buildRewriteRules(schemasReplace) ingestRecorder := schemasReplace.GetIngestRecorder() - if err := rangeFilterFromIngestRecorder(ingestRecorder, rewriteRules); err != nil { - return errors.Trace(err) - } logFilesIter, err := client.LoadDMLFiles(ctx) if err != nil { @@ -1790,17 +1791,33 @@ func restoreStream( return errors.Annotate(err, "failed to insert rows into gc_delete_range") } - // index ingestion is not captured by regular log backup, 
so we need to manually ingest again - if err = client.RepairIngestIndex(ctx, ingestRecorder, cfg.logCheckpointMetaManager, g); err != nil { - return errors.Annotate(err, "failed to repair ingest index") - } + if !cfg.LastRestore { + state := &logclient.SegmentedPiTRState{ + DbMaps: cfg.tableMappingManager.ToProto(), + IngestRecorderState: ingestRecorder.ExportState(), + } + if cfg.tiflashRecorder != nil { + state.TiFlashItems = cfg.tiflashRecorder.GetItems() + } + if err := client.SaveSegmentedPiTRState(ctx, state, cfg.logCheckpointMetaManager); err != nil { + return errors.Annotate(err, "failed to save segmented pitr state") + } + } else { + if err := rangeFilterFromIngestRecorder(ingestRecorder, rewriteRules); err != nil { + return errors.Trace(err) + } + // index ingestion is not captured by regular log backup, so we need to manually ingest again + if err = client.RepairIngestIndex(ctx, ingestRecorder, cfg.logCheckpointMetaManager, g); err != nil { + return errors.Annotate(err, "failed to repair ingest index") + } - if cfg.tiflashRecorder != nil { - sqls := cfg.tiflashRecorder.GenerateAlterTableDDLs(mgr.GetDomain().InfoSchema()) - log.Info("Generating SQLs for restoring TiFlash Replica", - zap.Strings("sqls", sqls)) - if err := client.ResetTiflashReplicas(ctx, sqls, g); err != nil { - return errors.Annotate(err, "failed to reset tiflash replicas") + if cfg.tiflashRecorder != nil { + sqls := cfg.tiflashRecorder.GenerateAlterTableDDLs(mgr.GetDomain().InfoSchema()) + log.Info("Generating SQLs for restoring TiFlash Replica", + zap.Strings("sqls", sqls)) + if err := client.ResetTiflashReplicas(ctx, sqls, g); err != nil { + return errors.Annotate(err, "failed to reset tiflash replicas") + } } } @@ -2218,11 +2235,17 @@ func buildAndSaveIDMapIfNeeded(ctx context.Context, client *logclient.LogClient, // get the schemas ID replace information. saved := isCurrentIdMapSaved(cfg.checkpointTaskInfo) hasFullBackupStorage := len(cfg.FullBackupStorage) != 0 - err := client.GetBaseIDMapAndMerge(ctx, hasFullBackupStorage, saved, + state, err := client.GetBaseIDMapAndMerge(ctx, hasFullBackupStorage, saved, cfg.logCheckpointMetaManager, cfg.tableMappingManager) if err != nil { return errors.Trace(err) } + if state != nil { + if state.TiFlashItems != nil && cfg.tiflashRecorder != nil { + cfg.tiflashRecorder.Load(state.TiFlashItems) + } + cfg.ingestRecorderState = state.IngestRecorderState + } if saved { return nil From dea4ab019b6fe3173b4b27d6bedf992aca90aefc Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Tue, 3 Feb 2026 18:20:40 +0000 Subject: [PATCH 18/18] added some tests Signed-off-by: Juncen Yu --- br/pkg/restore/log_client/client.go | 27 +++--------- br/pkg/restore/log_client/client_test.go | 55 +++++++++++++++++++----- br/pkg/restore/log_client/export_test.go | 13 ++++-- br/pkg/restore/log_client/id_map.go | 30 ------------- br/pkg/task/stream.go | 17 +++++++- 5 files changed, 75 insertions(+), 67 deletions(-) diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index 5f5988f48133a..78f5d93c1b76f 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -1008,14 +1008,6 @@ type FullBackupStorageConfig struct { Opts *storeapi.Options } -type GetIDMapConfig struct { - // required - LoadSavedIDMap bool - - // optional - TableMappingManager *stream.TableMappingManager -} - // GetBaseIDMapAndMerge get the id map from following ways // 1. 
from previously saved id map if the same task has been running and built/saved id map already but failed later // 2. from previous different task. A PiTR job might be split into multiple runs/tasks and each task only restores @@ -1025,13 +1017,11 @@ func (rc *LogClient) GetBaseIDMapAndMerge( hasFullBackupStorageConfig, loadSavedIDMap bool, logCheckpointMetaManager checkpoint.LogMetaManagerT, - tableMappingManager *stream.TableMappingManager, ) (*SegmentedPiTRState, error) { var ( - err error - state *SegmentedPiTRState - dbMaps []*backuppb.PitrDBMap - dbReplaces map[stream.UpstreamID]*stream.DBReplace + err error + state *SegmentedPiTRState + dbMaps []*backuppb.PitrDBMap ) // this is a retry, id map saved last time, load it from external storage @@ -1068,13 +1058,6 @@ func (rc *LogClient) GetBaseIDMapAndMerge( log.Error("no id maps found") return nil, errors.New("no base id map found from saved id or last restored PiTR") } - dbReplaces = stream.FromDBMapProto(dbMaps) - - stream.LogDBReplaceMap("base db replace info", dbReplaces) - if len(dbReplaces) != 0 { - tableMappingManager.SetFromPiTRIDMap() - tableMappingManager.MergeBaseDBReplace(dbReplaces) - } return state, nil } @@ -1987,14 +1970,14 @@ func (rc *LogClient) GetGCRows() []*stream.PreDelRangeQuery { func (rc *LogClient) SaveIdMapWithFailPoints( ctx context.Context, - manager *stream.TableMappingManager, + state *SegmentedPiTRState, logCheckpointMetaManager checkpoint.LogMetaManagerT, ) error { failpoint.Inject("failed-before-id-maps-saved", func(_ failpoint.Value) { failpoint.Return(errors.New("failpoint: failed before id maps saved")) }) - if err := rc.saveIDMap(ctx, manager, logCheckpointMetaManager); err != nil { + if err := rc.SaveSegmentedPiTRState(ctx, state, logCheckpointMetaManager); err != nil { return errors.Trace(err) } diff --git a/br/pkg/restore/log_client/client_test.go b/br/pkg/restore/log_client/client_test.go index 64f875af2b2c5..07b8073022e4d 100644 --- a/br/pkg/restore/log_client/client_test.go +++ b/br/pkg/restore/log_client/client_test.go @@ -20,6 +20,7 @@ import ( "fmt" "math" "path/filepath" + "strings" "sync" "testing" "time" @@ -1346,9 +1347,8 @@ func TestInitSchemasReplaceForDDL(t *testing.T) { require.NoError(t, err) err = stg.WriteFile(ctx, logclient.PitrIDMapsFilename(123, 1), []byte("123")) require.NoError(t, err) - _, err = client.GetBaseIDMapAndMerge(ctx, false, false, nil, stream.NewTableMappingManager()) - require.Error(t, err) - require.Contains(t, err.Error(), "proto: wrong") + _, err = client.GetBaseIDMapAndMerge(ctx, false, false, nil) + requireInvalidProtoError(t, err) err = stg.DeleteFile(ctx, logclient.PitrIDMapsFilename(123, 1)) require.NoError(t, err) } @@ -1358,9 +1358,8 @@ func TestInitSchemasReplaceForDDL(t *testing.T) { client.SetStorage(ctx, backend, nil) err := stg.WriteFile(ctx, logclient.PitrIDMapsFilename(123, 2), []byte("123")) require.NoError(t, err) - _, err = client.GetBaseIDMapAndMerge(ctx, false, true, nil, stream.NewTableMappingManager()) - require.Error(t, err) - require.Contains(t, err.Error(), "proto: wrong") + _, err = client.GetBaseIDMapAndMerge(ctx, false, true, nil) + requireInvalidProtoError(t, err) err = stg.DeleteFile(ctx, logclient.PitrIDMapsFilename(123, 2)) require.NoError(t, err) } @@ -1373,12 +1372,20 @@ func TestInitSchemasReplaceForDDL(t *testing.T) { se, err := g.CreateSession(s.Mock.Storage) require.NoError(t, err) client := logclient.TEST_NewLogClient(123, 1, 2, 1, s.Mock.Domain, se) - _, err = client.GetBaseIDMapAndMerge(ctx, false, true, nil, 
stream.NewTableMappingManager()) + _, err = client.GetBaseIDMapAndMerge(ctx, false, true, nil) require.Error(t, err) require.Contains(t, err.Error(), "no base id map found from saved id or last restored PiTR") } } +func requireInvalidProtoError(t *testing.T, err error) { + t.Helper() + require.Error(t, err) + errMsg := err.Error() + require.True(t, strings.Contains(errMsg, "proto") || strings.Contains(errMsg, "EOF"), + "unexpected error: %s", errMsg) +} + func downstreamID(upstreamID int64) int64 { return upstreamID + 10000000 } @@ -1446,8 +1453,30 @@ func TestPITRIDMap(t *testing.T) { baseTableMappingManager := &stream.TableMappingManager{ DBReplaceMap: getDBMap(), } - err = client.TEST_saveIDMap(ctx, baseTableMappingManager, nil) + tiflashItems := map[int64]model.TiFlashReplicaInfo{ + 1: {Count: 1, Available: true}, + 2: {Count: 2, LocationLabels: []string{"zone", "rack"}, AvailablePartitionIDs: []int64{3, 4}}, + } + ingestState := &ingestrec.RecorderState{ + Items: map[int64]map[int64]ingestrec.IndexState{ + 10: { + 1: {IsPrimary: true}, + 2: {IsPrimary: false}, + }, + }, + } + state := &logclient.SegmentedPiTRState{ + DbMaps: baseTableMappingManager.ToProto(), + TiFlashItems: tiflashItems, + IngestRecorderState: ingestState, + } + err = client.TEST_saveIDMap(ctx, state, nil) require.NoError(t, err) + loadedState, err := client.TEST_loadSegmentedPiTRState(ctx, 2, nil) + require.NoError(t, err) + require.NotNil(t, loadedState) + require.Equal(t, tiflashItems, loadedState.TiFlashItems) + require.Equal(t, ingestState, loadedState.IngestRecorderState) newSchemaReplaces, err := client.TEST_initSchemasMap(ctx, 1, nil) require.NoError(t, err) require.Nil(t, newSchemaReplaces) @@ -1496,7 +1525,10 @@ func TestPITRIDMapOnStorage(t *testing.T) { baseTableMappingManager := &stream.TableMappingManager{ DBReplaceMap: getDBMap(), } - err = client.TEST_saveIDMap(ctx, baseTableMappingManager, nil) + state := &logclient.SegmentedPiTRState{ + DbMaps: baseTableMappingManager.ToProto(), + } + err = client.TEST_saveIDMap(ctx, state, nil) require.NoError(t, err) newSchemaReplaces, err := client.TEST_initSchemasMap(ctx, 1, nil) require.NoError(t, err) @@ -1552,7 +1584,10 @@ func TestPITRIDMapOnCheckpointStorage(t *testing.T) { baseTableMappingManager := &stream.TableMappingManager{ DBReplaceMap: getDBMap(), } - err = client.TEST_saveIDMap(ctx, baseTableMappingManager, logCheckpointMetaManager) + state := &logclient.SegmentedPiTRState{ + DbMaps: baseTableMappingManager.ToProto(), + } + err = client.TEST_saveIDMap(ctx, state, logCheckpointMetaManager) require.NoError(t, err) newSchemaReplaces, err := client.TEST_initSchemasMap(ctx, 1, logCheckpointMetaManager) require.NoError(t, err) diff --git a/br/pkg/restore/log_client/export_test.go b/br/pkg/restore/log_client/export_test.go index 743b2d97cab37..c6f6a630cad4b 100644 --- a/br/pkg/restore/log_client/export_test.go +++ b/br/pkg/restore/log_client/export_test.go @@ -22,7 +22,6 @@ import ( "github.com/pingcap/kvproto/pkg/encryptionpb" "github.com/pingcap/tidb/br/pkg/checkpoint" "github.com/pingcap/tidb/br/pkg/glue" - "github.com/pingcap/tidb/br/pkg/stream" "github.com/pingcap/tidb/br/pkg/utils/iter" "github.com/pingcap/tidb/pkg/domain" "github.com/pingcap/tidb/pkg/objstore/storeapi" @@ -66,10 +65,10 @@ func (m *PhysicalWithMigrations) Physical() *backuppb.DataFileGroup { func (rc *LogClient) TEST_saveIDMap( ctx context.Context, - m *stream.TableMappingManager, + state *SegmentedPiTRState, logCheckpointMetaManager checkpoint.LogMetaManagerT, ) error { - return 
rc.SaveIdMapWithFailPoints(ctx, m, logCheckpointMetaManager) + return rc.SaveIdMapWithFailPoints(ctx, state, logCheckpointMetaManager) } func (rc *LogClient) TEST_initSchemasMap( @@ -87,6 +86,14 @@ func (rc *LogClient) TEST_initSchemasMap( return state.DbMaps, nil } +func (rc *LogClient) TEST_loadSegmentedPiTRState( + ctx context.Context, + restoreTS uint64, + logCheckpointMetaManager checkpoint.LogMetaManagerT, +) (*SegmentedPiTRState, error) { + return rc.loadSegmentedPiTRState(ctx, restoreTS, logCheckpointMetaManager, true) +} + // readStreamMetaByTS is used for streaming task. collect all meta file by TS, it is for test usage. func (lm *LogFileManager) ReadStreamMeta(ctx context.Context) ([]*MetaName, error) { metas, err := lm.streamingMeta(ctx) diff --git a/br/pkg/restore/log_client/id_map.go b/br/pkg/restore/log_client/id_map.go index 1d2df0e2291de..4fdb45d2b38db 100644 --- a/br/pkg/restore/log_client/id_map.go +++ b/br/pkg/restore/log_client/id_map.go @@ -24,7 +24,6 @@ import ( "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/checkpoint" "github.com/pingcap/tidb/br/pkg/restore" - "github.com/pingcap/tidb/br/pkg/stream" "github.com/pingcap/tidb/pkg/kv" "github.com/pingcap/tidb/pkg/objstore/storeapi" "github.com/pingcap/tidb/pkg/parser/ast" @@ -55,39 +54,10 @@ func (rc *LogClient) tryGetCheckpointStorage( return logCheckpointMetaManager.TryGetStorage() } -// saveIDMap saves the id mapping information. -func (rc *LogClient) saveIDMap( - ctx context.Context, - manager *stream.TableMappingManager, - logCheckpointMetaManager checkpoint.LogMetaManagerT, -) error { - state := &SegmentedPiTRState{ - DbMaps: manager.ToProto(), - } - existingState, err := rc.loadSegmentedPiTRState(ctx, rc.restoreTS, logCheckpointMetaManager, true) - if err != nil { - return errors.Trace(err) - } - if existingState != nil { - state.TiFlashItems = existingState.TiFlashItems - state.IngestRecorderState = existingState.IngestRecorderState - } - return rc.saveSegmentedPiTRState(ctx, state, logCheckpointMetaManager) -} - -// SaveSegmentedPiTRState saves segmented PiTR state for later restores. 
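+// SaveSegmentedPiTRState saves the segmented PiTR state (the schema id maps plus,
+// when recorded, the TiFlash replica items and the ingest recorder state) so that a
+// later restore run of the same PiTR job can load it as its base.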
func (rc *LogClient) SaveSegmentedPiTRState( ctx context.Context, state *SegmentedPiTRState, logCheckpointMetaManager checkpoint.LogMetaManagerT, -) error { - return rc.saveSegmentedPiTRState(ctx, state, logCheckpointMetaManager) -} - -func (rc *LogClient) saveSegmentedPiTRState( - ctx context.Context, - state *SegmentedPiTRState, - logCheckpointMetaManager checkpoint.LogMetaManagerT, ) error { if state == nil { return errors.New("segmented pitr state is nil") diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index 753b2c718b249..93cdc95206c5e 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -2236,11 +2236,15 @@ func buildAndSaveIDMapIfNeeded(ctx context.Context, client *logclient.LogClient, saved := isCurrentIdMapSaved(cfg.checkpointTaskInfo) hasFullBackupStorage := len(cfg.FullBackupStorage) != 0 state, err := client.GetBaseIDMapAndMerge(ctx, hasFullBackupStorage, saved, - cfg.logCheckpointMetaManager, cfg.tableMappingManager) + cfg.logCheckpointMetaManager) if err != nil { return errors.Trace(err) } if state != nil { + if len(state.DbMaps) > 0 { + cfg.tableMappingManager.SetFromPiTRIDMap() + cfg.tableMappingManager.MergeBaseDBReplace(stream.FromDBMapProto(state.DbMaps)) + } if state.TiFlashItems != nil && cfg.tiflashRecorder != nil { cfg.tiflashRecorder.Load(state.TiFlashItems) } @@ -2263,7 +2267,16 @@ func buildAndSaveIDMapIfNeeded(ctx context.Context, client *logclient.LogClient, if err != nil { return errors.Trace(err) } - if err = client.SaveIdMapWithFailPoints(ctx, cfg.tableMappingManager, cfg.logCheckpointMetaManager); err != nil { + newState := &logclient.SegmentedPiTRState{ + DbMaps: cfg.tableMappingManager.ToProto(), + IngestRecorderState: cfg.ingestRecorderState, + } + if cfg.tiflashRecorder != nil { + newState.TiFlashItems = cfg.tiflashRecorder.GetItems() + } else if state != nil && state.TiFlashItems != nil { + newState.TiFlashItems = state.TiFlashItems + } + if err = client.SaveIdMapWithFailPoints(ctx, newState, cfg.logCheckpointMetaManager); err != nil { return errors.Trace(err) } return nil