
Commit f0bf91c

[Not for landing] piggy back on titan for scale init test

ghstack-source-id: 8852e63604fc363c10327b8cd997a5a40c3a7533
Pull Request resolved: #841
Parent: fb0a942

1 file changed: +13 −4

train.py (+13 −4)
@@ -430,7 +430,16 @@ def loss_fn(pred, labels):
 
 
 if __name__ == "__main__":
-    config = JobConfig()
-    config.parse_args()
-    main(config)
-    torch.distributed.destroy_process_group()
+    # The first iteration is just for warm-up.
+    for root_size in [128, 8, 16, 32, 64, 128, 256]:
+        os.environ["TORCH_NCCL_RANKS_PER_ROOT"] = str(root_size)
+        start = time.perf_counter()
+        torch.distributed.init_process_group(backend="nccl")
+        torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+        torch.distributed.barrier()
+        end = time.perf_counter()
+        torch.distributed.destroy_process_group()
+        print(f"Time to init process group: {end - start:.6f} seconds for {root_size} ranks per root")
+    # config = JobConfig()
+    # config.parse_args()
+    # main(config)
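
The hunk above relies on train.py already importing os and time. For reference, a minimal standalone sketch of the same timing loop, runnable on its own under a torchrun-style launcher (torchrun sets LOCAL_RANK and the rendezvous env vars per worker). The file name bench_init.py and the reading of TORCH_NCCL_RANKS_PER_ROOT as a knob for how many ranks share one NCCL bootstrap root are assumptions here, not taken from the commit.

# bench_init.py -- hypothetical standalone version of the loop in this commit.
import os
import time

import torch
import torch.distributed as dist

if __name__ == "__main__":
    # 128 appears first so the initial iteration serves as warm-up only.
    for root_size in [128, 8, 16, 32, 64, 128, 256]:
        # Assumption: this env var is read at init time to set the NCCL
        # bootstrap fan-out (ranks per root).
        os.environ["TORCH_NCCL_RANKS_PER_ROOT"] = str(root_size)
        start = time.perf_counter()
        dist.init_process_group(backend="nccl")  # rendezvous via env vars
        # LOCAL_RANK is provided to each worker by torchrun.
        torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
        dist.barrier()  # wait until every rank finishes init before stopping the clock
        end = time.perf_counter()
        dist.destroy_process_group()
        print(f"init took {end - start:.6f} s with {root_size} ranks per root")

Launched with, e.g., torchrun --nproc_per_node=8 bench_init.py, each rank prints one timing line per tested root size.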
