Skip to content

Commit 0e40980

Browse files
OliverS929ti-chi-bot
authored andcommitted
This is an automated cherry-pick of #61346
Signed-off-by: ti-chi-bot <[email protected]>
1 parent 469af9d commit 0e40980

File tree

6 files changed

+304
-2
lines changed

6 files changed

+304
-2
lines changed

br/pkg/lightning/backend/local/region_job.go

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,11 +204,15 @@ func (local *Backend) writeToTiKV(ctx context.Context, j *regionJob) error {
204204
return err
205205
}
206206

207+
<<<<<<< HEAD:br/pkg/lightning/backend/local/region_job.go
207208
func (local *Backend) doWrite(ctx context.Context, j *regionJob) error {
208209
if j.stage != regionScanned {
209210
return nil
210211
}
211212

213+
=======
214+
func (local *Backend) doWrite(ctx context.Context, j *regionJob) (ret *tikvWriteResult, err error) {
215+
>>>>>>> cc0925eeafc (Lightning: Attempt to return writeTooSlow when we experience write timeout. (#61346)):pkg/lightning/backend/local/region_job.go
212216
failpoint.Inject("fakeRegionJobs", func() {
213217
front := j.injected[0]
214218
j.injected = j.injected[1:]
@@ -221,9 +225,33 @@ func (local *Backend) doWrite(ctx context.Context, j *regionJob) error {
221225
})
222226

223227
var cancel context.CancelFunc
224-
ctx, cancel = context.WithTimeoutCause(ctx, 15*time.Minute, common.ErrWriteTooSlow)
228+
// set a timeout for the write operation, if it takes too long, we will return with common.ErrWriteTooSlow and let caller retry the whole job instead of being stuck forever.
229+
timeout := 15 * time.Minute
230+
ctx, cancel = context.WithTimeoutCause(ctx, timeout, common.ErrWriteTooSlow)
225231
defer cancel()
226232

233+
// A defer function to handle all DeadlineExceeded errors that may occur
234+
// during the write operation using this context with 15 minutes timeout.
235+
// When the error is "context deadline exceeded", we will check if the cause
236+
// is common.ErrWriteTooSlow and return the common.ErrWriteTooSlow instead so
237+
// our caller would be able to retry this doWrite operation. By doing this
238+
// defer we are hoping to handle all DeadlineExceeded error during this
239+
// write, either from gRPC stream or write limiter WaitN operation.
240+
wctx := ctx
241+
defer func() {
242+
if err == nil {
243+
return
244+
}
245+
if errors.Cause(err) == context.DeadlineExceeded {
246+
if cause := context.Cause(wctx); goerrors.Is(cause, common.ErrWriteTooSlow) {
247+
log.FromContext(ctx).Info("Experiencing a wait timeout while writing to tikv",
248+
zap.Int("store-write-bwlimit", local.BackendConfig.StoreWriteBWLimit),
249+
zap.Int("limit-size", local.writeLimiter.Limit()))
250+
err = errors.Trace(cause) // return the common.ErrWriteTooSlow instead to let caller retry it
251+
}
252+
}
253+
}()
254+
227255
apiVersion := local.tikvCodec.GetAPIVersion()
228256
clientFactory := local.importClientFactory
229257
kvBatchSize := local.KVWriteBatchSize
@@ -341,6 +369,19 @@ func (local *Backend) doWrite(ctx context.Context, j *regionJob) error {
341369
regionMaxSize = j.regionSplitSize * 4 / 3
342370
}
343371

372+
// preparation work for the write timeout fault injection, only enabled if the following failpoint is enabled
373+
wcancel := func() {}
374+
failpoint.Inject("shortWaitNTimeout", func(val failpoint.Value) {
375+
var innerTimeout time.Duration
376+
// GO_FAILPOINTS action supplies the duration in
377+
ms, _ := val.(int)
378+
innerTimeout = time.Duration(ms) * time.Millisecond
379+
log.FromContext(ctx).Info("Injecting a timeout to write context.")
380+
wctx, wcancel = context.WithTimeoutCause(
381+
ctx, innerTimeout, common.ErrWriteTooSlow)
382+
})
383+
defer wcancel()
384+
344385
flushKVs := func() error {
345386
req.Chunk.(*sst.WriteRequest_Batch).Batch.Pairs = pairs[:count]
346387
preparedMsg := &grpc.PreparedMsg{}
@@ -351,7 +392,25 @@ func (local *Backend) doWrite(ctx context.Context, j *regionJob) error {
351392
}
352393

353394
for i := range clients {
354-
if err := writeLimiter.WaitN(ctx, allPeers[i].StoreId, int(size)); err != nil {
395+
// original ctx would be used when failpoint is not enabled
396+
// that new context would be used when failpoint is enabled
397+
err := writeLimiter.WaitN(wctx, allPeers[i].StoreId, int(size))
398+
if err != nil {
399+
// We expect to encounter two types of errors here:
400+
// 1. context.DeadlineExceeded — occurs when the calculated delay is
401+
// less than the remaining time in the context, but the context
402+
// expires while sleeping.
403+
// 2. "rate: Wait(n=%d) would exceed context deadline" — a fast-fail
404+
// path triggered when the delay already exceeds the remaining
405+
// time for context before sleeping.
406+
//
407+
// Unfortunately, we cannot precisely control when the context will
408+
// expire, so both scenarios are valid and expected.
409+
// Fortunately, the "rate: Wait" error is already treated as
410+
// retryable, so we only need to explicitly handle
411+
// context.DeadlineExceeded here.
412+
// We rely on the defer function at the top of doWrite to handle it
413+
// for us in general.
355414
return errors.Trace(err)
356415
}
357416
if err := clients[i].SendMsg(preparedMsg); err != nil {
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/bin/sh
2+
#
3+
# Copyright 2025 PingCAP, Inc.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
set -eu
18+
19+
log_file="${TEST_DIR}/lightning.log"
20+
if [ $# -gt 1 ]; then
21+
log_file=$2
22+
fi
23+
24+
if ! grep -Fq "$1" "$log_file"; then
25+
echo "TEST FAILED: LIGHTNING LOG DOES NOT CONTAIN '$1'"
26+
echo "____________________________________"
27+
cat "$log_file"
28+
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
29+
exit 1
30+
fi

lightning/tests/README.md

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Integration tests
2+
3+
This folder contains all tests which relies on external processes such as TiDB.
4+
5+
## Preparations
6+
7+
1. The following executables must be copied or linked into these locations:
8+
9+
* `bin/tidb-server`
10+
* `bin/tikv-server`
11+
* `bin/pd-server`
12+
* `bin/tiflash`
13+
* `bin/minio`
14+
* `bin/mc`
15+
16+
The versions must be ≥2.1.0.
17+
18+
**Only some tests requires `minio`/`mc`** which can be downloaded from the official site, you can skip them if you don't need to run those cases.
19+
20+
You can use `tiup` to download binaries related to TiDB cluster, and then link them to the `bin` directory:
21+
```shell
22+
cluster_version=v8.1.0 # change to the version you need
23+
tiup install tidb:$cluster_version tikv:$cluster_version pd:$cluster_version tiflash:$cluster_version
24+
ln -s ~/.tiup/components/tidb/$cluster_version/tidb-server bin/tidb-server
25+
ln -s ~/.tiup/components/tikv/$cluster_version/tikv-server bin/tikv-server
26+
ln -s ~/.tiup/components/pd/$cluster_version/pd-server bin/pd-server
27+
ln -s ~/.tiup/components/tiflash/$cluster_version/tiflash/tiflash bin/tiflash
28+
```
29+
30+
2. `make build_for_lightning_integration_test`
31+
32+
`make server` to build the latest TiDB server if your test requires it.
33+
34+
3. The following programs must be installed:
35+
36+
* `mysql` (the CLI client)
37+
* `curl`
38+
* `openssl`
39+
* `wget`
40+
* `lsof`
41+
42+
4. The user executing the tests must have permission to create the folder
43+
`/tmp/lightning_test`. All test artifacts will be written into this folder.
44+
45+
## Running
46+
47+
Run `make lightning_integration_test` to execute all the integration tests.
48+
- Logs will be written into `/tmp/lightning_test` directory.
49+
50+
Run `tests/run.sh --debug` to pause immediately after all servers are started.
51+
52+
If you only want to run some tests, you can use:
53+
```shell
54+
TEST_NAME="lightning_gcs lightning_view" lightning/tests/run.sh
55+
```
56+
57+
Case names are separated by spaces.
58+
59+
## Writing new tests
60+
61+
1. New integration tests can be written as shell scripts in `tests/TEST_NAME/run.sh`.
62+
- `TEST_NAME` should start with `lightning_`.
63+
- The script should exit with a nonzero error code on failure.
64+
2. Add TEST_NAME to existing group in [run_group_lightning_tests.sh](./run_group_lightning_tests.sh)(Recommended), or add a new group for it.
65+
3. If you add a new group, the name of the new group must be added to CI [lightning-integration-test](https://github.com/PingCAP-QE/ci/blob/main/pipelines/pingcap/tidb/latest/pull_lightning_integration_test.groovy).
66+
67+
Several convenient commands are provided in [utils](../../tests/_utils/):
68+
69+
* `run_sql <SQL>` — Executes an SQL query on the TiDB database
70+
* `run_lightning [CONFIG]` — Starts `tidb-lightning` using `tests/TEST_NAME/CONFIG.toml`
71+
* `check_contains <TEXT>` — Checks if the previous `run_sql` result contains the given text
72+
(in `-E` format)
73+
* `check_not_contains <TEXT>` — Checks if the previous `run_sql` result does not contain the given
74+
text (in `-E` format)
75+
* `check_lightning_log_contains <TEXT>` — Checks if the current lightning log contains the given text
76+
(in `-E` format)
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
[tikv-importer]
2+
store-write-bwlimit = "1MiB"
3+
4+
[mydumper.csv]
5+
header = false
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/bin/bash
2+
#
3+
# Copyright 2025 PingCAP, Inc.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
set -eux
18+
19+
CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
20+
21+
mkdir -p "$TEST_DIR/data"
22+
23+
run_sql "DROP DATABASE IF EXISTS test;"
24+
run_sql "DROP TABLE IF EXISTS test.t;"
25+
26+
cat <<EOF >"$TEST_DIR/data/test-schema-create.sql"
27+
CREATE DATABASE test;
28+
EOF
29+
cat <<EOF >"$TEST_DIR/data/test.t-schema.sql"
30+
CREATE TABLE test.t (
31+
id int,
32+
a int,
33+
b int,
34+
c int
35+
);
36+
EOF
37+
38+
# Generate 200k rows. Total size is about 5MiB.
39+
set +x
40+
for i in {1..200000}; do
41+
echo "$i,$i,$i,$i" >>"$TEST_DIR/data/test.t.0.csv"
42+
done
43+
set -x
44+
45+
export GO_FAILPOINTS="github.com/pingcap/tidb/pkg/lightning/backend/local/shortWaitNTimeout=100*return(1)"
46+
47+
run_lightning --backend local -d "$TEST_DIR/data" --config "$CUR/config.toml"
48+
check_lightning_log_contains 'Experiencing a wait timeout while writing to tikv'
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#!/usr/bin/env bash
2+
#
3+
# Copyright 2022 PingCAP, Inc.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
# This script split the integration tests into 9 groups to support parallel group tests execution.
18+
# all the integration tests are located in br/tests directory. only the directories
19+
# containing run.sh will be considered as valid lightning integration tests. the script will print the total case number
20+
21+
set -eo pipefail
22+
23+
# Step 1
24+
CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
25+
group=$1
26+
export COV_DIR="/tmp/group_cover"
27+
rm -rf $COV_DIR
28+
mkdir -p $COV_DIR
29+
30+
# Define groups
31+
# Note: If new group is added, the group name must also be added to CI
32+
# * https://github.com/PingCAP-QE/ci/blob/main/pipelines/pingcap/tidb/latest/pull_lightning_integration_test.groovy
33+
# Each group of tests consumes as much time as possible, thus reducing CI waiting time.
34+
# Putting multiple light tests together and heavy tests in a separate group.
35+
declare -A groups
36+
groups=(
37+
["G00"]='lightning_auto_random_default lightning_bom_file lightning_character_sets lightning_check_partial_imported lightning_checkpoint lightning_checkpoint_chunks lightning_checkpoint_columns lightning_checkpoint_dirty_tableid'
38+
["G01"]='lightning_checkpoint_engines lightning_checkpoint_engines_order lightning_checkpoint_error_destroy lightning_checkpoint_parquet lightning_checkpoint_timestamp lightning_checksum_mismatch lightning_cmdline_override lightning_column_permutation lightning_common_handle lightning_compress lightning_concurrent-restore'
39+
["G02"]='lightning_config_max_error lightning_config_skip_csv_header lightning_csv lightning_default-columns lightning_disable_scheduler_by_key_range lightning_disk_quota lightning_distributed_import lightning_drop_other_tables_halfway lightning_duplicate_detection lightning_duplicate_detection_new lightning_duplicate_resolution_error lightning_duplicate_resolution_error_pk_multiple_files lightning_duplicate_resolution_error_uk_multiple_files lightning_duplicate_resolution_error_uk_multiple_files_multicol_index lightning_duplicate_resolution_incremental'
40+
["G03"]='lightning_duplicate_resolution_merge lightning_duplicate_resolution_replace_multiple_keys_clustered_pk lightning_duplicate_resolution_replace_multiple_keys_nonclustered_pk lightning_duplicate_resolution_replace_multiple_unique_keys_clustered_pk lightning_duplicate_resolution_replace_multiple_unique_keys_nonclustered_pk lightning_duplicate_resolution_replace_one_key lightning_duplicate_resolution_replace_one_key_multiple_conflicts_clustered_pk lightning_duplicate_resolution_replace_one_key_multiple_conflicts_nonclustered_pk'
41+
["G04"]='lightning_duplicate_resolution_replace_one_unique_key_clustered_pk lightning_duplicate_resolution_replace_one_unique_key_multiple_conflicts_clustered_pk lightning_duplicate_resolution_replace_one_unique_key_multiple_conflicts_nonclustered_pk lightning_duplicate_resolution_replace_one_unique_key_nonclustered_varchar_pk lightning_error_summary lightning_examples lightning_exotic_filenames lightning_extend_routes'
42+
["G05"]='lightning_fail_fast lightning_fail_fast_on_nonretry_err lightning_file_routing lightning_foreign_key lightning_gcs lightning_generated_columns lightning_ignore_columns lightning_import_compress lightning_incremental lightning_issue_282 lightning_issue_40657 lightning_issue_410 lightning_issue_519 lightning_local_backend lightning_max_incr'
43+
["G06"]='lightning_max_random lightning_multi_valued_index lightning_new_collation lightning_no_schema lightning_parquet lightning_partition_incremental lightning_partitioned-table lightning_record_network lightning_reload_cert lightning_restore lightning_routes lightning_routes_panic lightning_row-format-v2 lightning_s3'
44+
["G07"]='lightning_shard_rowid lightning_source_linkfile lightning_sqlmode lightning_tidb_duplicate_data lightning_tidb_rowid lightning_tiflash lightning_tikv_multi_rocksdb lightning_too_many_columns lightning_tool_135'
45+
["G08"]='lightning_pd_leader_switch lightning_tool_1420 lightning_tool_1472 lightning_tool_241 lightning_ttl lightning_unused_config_keys lightning_various_types lightning_view lightning_write_batch lightning_write_limit lightning_write_timeout lightning_add_index lightning_alter_random lightning_auto_columns'
46+
)
47+
48+
# Get other lightning cases not in groups, to avoid missing any case
49+
others=()
50+
for script in "$CUR"/*/run.sh; do
51+
test_name="$(basename "$(dirname "$script")")"
52+
if [[ $test_name != lightning_* ]]; then
53+
continue
54+
fi
55+
# shellcheck disable=SC2076
56+
if [[ ! " ${groups[*]} " =~ " ${test_name} " ]]; then
57+
others=("${others[@]} ${test_name}")
58+
fi
59+
done
60+
61+
if [[ "$group" == "others" ]]; then
62+
if [[ -z $others ]]; then
63+
echo "All lightning test cases have been added to groups"
64+
exit 0
65+
fi
66+
echo "Error: "$others" is not added to any group in br/tests/run_group_lightning_tests.sh"
67+
exit 1
68+
elif [[ " ${!groups[*]} " =~ " ${group} " ]]; then
69+
test_names="${groups[${group}]}"
70+
# Run test cases
71+
if [[ -n $test_names ]]; then
72+
echo ""
73+
echo "Run cases: ${test_names}"
74+
for case_name in $test_names; do
75+
echo "Run cases: ${case_name}"
76+
rm -rf /tmp/lightning_test
77+
mkdir -p /tmp/lightning_test
78+
TEST_NAME=${case_name} ${CUR}/run.sh
79+
done
80+
fi
81+
else
82+
echo "Error: invalid group name: ${group}"
83+
exit 1
84+
fi

0 commit comments

Comments
 (0)