
Commit 36212dd

add prmoe to cifar10 example (deepspeedai#163)
1 parent 9c48e36 commit 36212dd

File tree

2 files changed: +54 -15 lines


cifar/cifar10_deepspeed.py

+32 -15
@@ -51,9 +51,18 @@ def add_argument():
                         type=int,
                         help='(moe) expert parallel world size')
     parser.add_argument('--num-experts',
-                        default=1,
                         type=int,
-                        help='(moe) number of total experts')
+                        nargs='+',
+                        default=[
+                            1,
+                        ],
+                        help='number of experts list, MoE related.')
+    parser.add_argument(
+        '--mlp-type',
+        type=str,
+        default='standard',
+        help=
+        'Only applicable when num-experts > 1, accepts [standard, residual]')
     parser.add_argument('--top-k',
                         default=1,
                         type=int,
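
The switch to nargs='+' is what enables PR-MoE here: --num-experts now collects one or more integers into a list, and the model builds one MoE layer per entry. A minimal standalone sketch of the new parsing behavior (plain argparse, nothing DeepSpeed-specific; the demo argv is ours):

import argparse

parser = argparse.ArgumentParser()
# nargs='+' turns '--num-experts 2 4' into the list [2, 4]
parser.add_argument('--num-experts', type=int, nargs='+', default=[1])
parser.add_argument('--mlp-type', type=str, default='standard')

args = parser.parse_args(['--num-experts', '2', '4', '--mlp-type', 'residual'])
print(args.num_experts)  # [2, 4]
print(args.mlp_type)     # residual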
@@ -168,9 +177,6 @@ def imshow(img):
 
 args = add_argument()
 
-if args.moe:
-    deepspeed.utils.groups.initialize(ep_size=args.ep_world_size)
-
 
 class Net(nn.Module):
     def __init__(self):
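
Note what the removal above implies: expert-parallel groups are no longer initialized globally via deepspeed.utils.groups.initialize. The expert-parallel size is instead passed directly to each MoE layer through the new ep_size constructor argument in the hunk below.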
@@ -181,14 +187,21 @@ def __init__(self):
         self.fc1 = nn.Linear(16 * 5 * 5, 120)
         self.fc2 = nn.Linear(120, 84)
         if args.moe:
-            self.fc3 = nn.Linear(84, 84)
-            self.fc3 = deepspeed.moe.layer.MoE(
-                hidden_size=84,
-                expert=self.fc3,
-                num_experts=args.num_experts,
-                k=args.top_k,
-                min_capacity=args.min_capacity,
-                noisy_gate_policy=args.noisy_gate_policy)
+            fc3 = nn.Linear(84, 84)
+            self.moe_layer_list = []
+            for n_e in args.num_experts:
+                # create moe layers based on the number of experts
+                self.moe_layer_list.append(
+                    deepspeed.moe.layer.MoE(
+                        hidden_size=84,
+                        expert=fc3,
+                        num_experts=n_e,
+                        ep_size=args.ep_world_size,
+                        use_residual=args.mlp_type == 'residual',
+                        k=args.top_k,
+                        min_capacity=args.min_capacity,
+                        noisy_gate_policy=args.noisy_gate_policy))
+            self.moe_layer_list = nn.ModuleList(self.moe_layer_list)
             self.fc4 = nn.Linear(84, 10)
         else:
             self.fc3 = nn.Linear(84, 10)
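
With a list of expert counts, the constructor stacks one MoE layer per entry, so --num-experts 2 4 yields a two-layer pyramid of 2 and then 4 experts, all wrapping the same nn.Linear expert template. The sketch below shows the stacking pattern; to stay runnable without a distributed launcher it uses a hypothetical make_moe stand-in where the real code constructs deepspeed.moe.layer.MoE with ep_size, use_residual, k, min_capacity and noisy_gate_policy:

import torch
import torch.nn as nn

def make_moe(hidden_size, expert, num_experts):
    # stand-in for deepspeed.moe.layer.MoE so the sketch runs without
    # torch.distributed; the real layer routes each token to the top-k of
    # num_experts copies of `expert`, here we just apply the template once
    return expert

num_experts = [2, 4]        # what --num-experts 2 4 parses to
expert = nn.Linear(84, 84)  # shared expert template, as in the diff

moe_layer_list = nn.ModuleList(
    [make_moe(84, expert, n_e) for n_e in num_experts])

x = torch.randn(8, 84)
for layer in moe_layer_list:
    x = layer(x)            # the real MoE returns a tuple, see below
print(x.shape)              # torch.Size([8, 84])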
@@ -200,7 +213,8 @@ def forward(self, x):
         x = F.relu(self.fc1(x))
         x = F.relu(self.fc2(x))
         if args.moe:
-            x, _, _ = self.fc3(x)
+            for layer in self.moe_layer_list:
+                x, _, _ = layer(x)
             x = self.fc4(x)
         else:
             x = self.fc3(x)
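
The unpacking x, _, _ = layer(x) reflects that a DeepSpeed MoE layer returns a tuple rather than a bare tensor: the transformed output plus auxiliary values (the gating loss and expert counts, in the API as of this commit). The example keeps only the output and discards the auxiliary values.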
@@ -213,7 +227,10 @@ def forward(self, x):
 def create_moe_param_groups(model):
     from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer
 
-    parameters = {'params': model.parameters(), 'name': 'parameters'}
+    parameters = {
+        'params': [p for p in model.parameters()],
+        'name': 'parameters'
+    }
 
     return split_params_into_different_moe_groups_for_optimizer(parameters)
 
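The dict now materializes model.parameters() into a list before handing it to the helper; a generator would be exhausted after a single pass when the helper partitions MoE and non-MoE weights into separate optimizer groups. For context, a sketch of how --moe-param-group feeds this into deepspeed.initialize elsewhere in the script (names follow cifar10_deepspeed.py; treat the exact call site as an assumption, not part of this diff):

# sketch: wiring create_moe_param_groups into the engine setup
parameters = filter(lambda p: p.requires_grad, net.parameters())
if args.moe_param_group:
    parameters = create_moe_param_groups(net)

model_engine, optimizer, trainloader, _ = deepspeed.initialize(
    args=args,
    model=net,
    model_parameters=parameters,
    training_data=trainset)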

cifar/run_ds_prmoe.sh

+22
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Number of nodes
+NUM_NODES=1
+# Number of GPUs per node
+NUM_GPUS=2
+# Size of the expert-parallel world (should be no larger than the total world size)
+EP_SIZE=2
+# Number of total experts; PR-MoE needs two or more values here (they may differ)
+EXPERTS='2 4'
+
+deepspeed --num_nodes=${NUM_NODES} --num_gpus=${NUM_GPUS} cifar10_deepspeed.py \
+	--log-interval 100 \
+	--deepspeed \
+	--deepspeed_config ds_config.json \
+	--moe \
+	--ep-world-size ${EP_SIZE} \
+	--num-experts ${EXPERTS} \
+	--top-k 1 \
+	--mlp-type 'residual' \
+	--noisy-gate-policy 'RSample' \
+	--moe-param-group
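
Because ${EXPERTS} is expanded unquoted, the shell splits '2 4' into two arguments, which argparse's nargs='+' gathers back into the list [2, 4]: two stacked MoE layers with different expert counts (the pyramid half of PR-MoE), while --mlp-type 'residual' switches on the residual half.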
