Skip to content

Commit 4fc7848

Browse files
szhengac and Shuai Zheng authored
add local rank explicitly for mpirun (deepspeedai#72)
Co-authored-by: Shuai Zheng <[email protected]>
1 parent 78d69cb commit 4fc7848

File tree

3 files changed

+5
-1
lines changed

3 files changed

+5
-1
lines changed

BingBertSquad/nvidia_run_squad_deepspeed.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -742,6 +742,7 @@ def main():
742742
parser = get_argument_parser()
743743

744744
deepspeed.init_distributed(dist_backend='nccl')
745+
args.local_rank = int(os.environ['LOCAL_RANK'])
745746

746747
# Include DeepSpeed configuration arguments
747748
parser = deepspeed.add_config_arguments(parser)

bing_bert/deepspeed_train.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -391,6 +391,7 @@ def prepare_optimizer_parameters(args, model):
391391
def prepare_model_optimizer(args):
392392
# Initialize torch distributed
393393
deepspeed.init_distributed(dist_backend='nccl')
394+
args.local_rank = int(os.environ['LOCAL_RANK'])
394395

395396
# Loading Model
396397
model = BertMultiTask(args)

pipeline_parallelism/train.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
#!/usr/bin/env python3
22

3+
import os
34
import argparse
45

56
import torch
@@ -148,8 +149,9 @@ def train_pipe(args, part='parameters'):
148149
if __name__ == '__main__':
149150
args = get_args()
150151

151-
torch.cuda.set_device(args.local_rank)
152152
deepspeed.init_distributed(dist_backend=args.backend)
153+
args.local_rank = int(os.environ['LOCAL_RANK'])
154+
torch.cuda.set_device(args.local_rank)
153155

154156
if args.pipeline_parallel_size == 0:
155157
train_base(args)

0 commit comments

Comments
 (0)