Skip to content

Commit f617051

Browse files
committed
Adding the circulant graph queued variable ring algorithm for Bcast.
This algorithm achieves better performance than existing algorithms for both small and large message sizes. The algorithm is based on the circulant graph abstraction and Jesper Larsson Traff's recent paper: https://dl.acm.org/doi/full/10.1145/3735139. It creates communication schedules around various rings in the circulant graph, then repeats the schedule to pipeline message chunks. We introduce a FIFO queue for overlapping sends and receives across communication rounds, which particularly benefits small messages. In the graph below, we show the algorithm's performance for a fixed chunk size (256k) and queue length (24) for various scales on ANL Aurora (N, PPN). The baseline for this graph is the best-performing algorithm currently in MPICH, so all speedups represent improvements over all algorithms currently in the library. We note that the performance drops for message sizes near our selected chunk size (256k). By tuning the chunk size near this message size, it is possible to achieve a speedup across all message sizes for all scales.
1 parent 7fcdc20 commit f617051

File tree

6 files changed

+49
-1
lines changed

6 files changed

+49
-1
lines changed

src/mpi/coll/bcast/Makefile.mk

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ mpi_core_sources += \
1313
src/mpi/coll/bcast/bcast_intra_binomial.c \
1414
src/mpi/coll/bcast/bcast_intra_scatter_recursive_doubling_allgather.c \
1515
src/mpi/coll/bcast/bcast_intra_scatter_ring_allgather.c \
16+
src/mpi/coll/bcast/bcast_intra_circ_qvring.c \
1617
src/mpi/coll/bcast/bcast_intra_smp.c \
1718
src/mpi/coll/bcast/bcast_intra_tree.c \
1819
src/mpi/coll/bcast/bcast_intra_pipelined_tree.c \

src/mpi/coll/coll_algorithms.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ bcast-intra:
6666
binomial
6767
scatter_recursive_doubling_allgather
6868
scatter_ring_allgather
69+
circ_qvring
70+
extra_params: chunk_size, q_len
71+
cvar_params: CIRC_CHUNK_SIZE, CIRC_Q_LEN
6972
smp
7073
restrictions: parent-comm
7174
tree

src/mpi/coll/cvars.txt

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,12 +197,33 @@ cvars:
197197
auto - Internal algorithm selection (can be overridden with MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE)
198198
binomial - Force Binomial Tree
199199
nb - Force nonblocking algorithm
200+
circ_qvring - Force queued circulant graph algorithm
200201
smp - Force smp algorithm
201202
scatter_recursive_doubling_allgather - Force Scatter Recursive-Doubling Allgather
202203
scatter_ring_allgather - Force Scatter Ring
203204
pipelined_tree - Force tree-based pipelined algorithm
204205
tree - Force tree-based algorithm
205-
206+
207+
- name : MPIR_CVAR_BCAST_CIRC_CHUNK_SIZE
208+
category : COLLECTIVE
209+
type : int
210+
default : 0
211+
class : none
212+
verbosity : MPI_T_VERBOSITY_USER_BASIC
213+
scope : MPI_T_SCOPE_ALL_EQ
214+
description : >-
215+
Determines message chunk size for circ_qvring (0 forces single message send)
216+
217+
- name : MPIR_CVAR_BCAST_CIRC_Q_LEN
218+
category : COLLECTIVE
219+
type : int
220+
default : 1
221+
class : none
222+
verbosity : MPI_T_VERBOSITY_USER_BASIC
223+
scope : MPI_T_SCOPE_ALL_EQ
224+
description : >-
225+
Determines how many bcast rounds can be in flight at once for circ_qvring
226+
206227
- name : MPIR_CVAR_BCAST_TREE_KVAL
207228
category : COLLECTIVE
208229
type : int

src/mpi/coll/include/csel_container.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ typedef enum {
5858
MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Bcast_intra_binomial,
5959
MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Bcast_intra_scatter_recursive_doubling_allgather,
6060
MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Bcast_intra_scatter_ring_allgather,
61+
MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Bcast_intra_circ_qvring,
6162
MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Bcast_intra_smp,
6263
MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Bcast_intra_tree,
6364
MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Bcast_intra_pipelined_tree,
@@ -325,6 +326,10 @@ typedef struct {
325326
int chunk_size;
326327
int recv_pre_posted;
327328
} intra_pipelined_tree;
329+
struct {
330+
int chunk_size;
331+
int q_len;
332+
} intra_circ_qvring;
328333
} bcast;
329334
struct {
330335
struct {

src/mpi/coll/src/csel_container.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,19 @@ static void parse_container_params(struct json_object *obj, MPII_Csel_container_
4343
}
4444
break;
4545

46+
case MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Bcast_intra_circ_qvring:
47+
{
48+
json_object_object_foreach(obj, key, val) {
49+
ckey = MPL_strdup_no_spaces(key);
50+
if (!strncmp(ckey, "chunk_size=", strlen("chunk_size=")))
51+
cnt->u.bcast.intra_circ_qvring.chunk_size = atoi(ckey + strlen("chunk_size="));
52+
else if (!strncmp(ckey, "q_len=", strlen("q_len=")))
53+
cnt->u.bcast.intra_circ_qvring.q_len = atoi(ckey + strlen("q_len="));
54+
MPL_free(ckey);
55+
}
56+
}
57+
break;
58+
4659
case MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Bcast_intra_tree:
4760
{
4861
json_object_object_foreach(obj, key, val) {
@@ -482,6 +495,8 @@ void *MPII_Create_container(struct json_object *obj)
482495
MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Bcast_intra_scatter_recursive_doubling_allgather;
483496
else if (!strcmp(ckey, "algorithm=MPIR_Bcast_intra_scatter_ring_allgather"))
484497
cnt->id = MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Bcast_intra_scatter_ring_allgather;
498+
else if (!strcmp(ckey, "algorithm=MPIR_Bcast_intra_circ_qvring"))
499+
cnt->id = MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Bcast_intra_circ_qvring;
485500
else if (!strcmp(ckey, "algorithm=MPIR_Bcast_intra_smp"))
486501
cnt->id = MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Bcast_intra_smp;
487502
else if (!strcmp(ckey, "algorithm=MPIR_Bcast_intra_tree"))

test/mpi/maint/coll_cvars.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,9 @@ algorithms:
355355
smp
356356
scatter_recursive_doubling_allgather
357357
scatter_ring_allgather
358+
circ_qvring
359+
.MPIR_CVAR_BCAST_CIRC_CHUNK_SIZE=0,1,131072,262144
360+
.MPIR_CVAR_BCAST_CIRC_Q_LEN=1,4,8
358361
tree
359362
.MPIR_CVAR_BCAST_TREE_TYPE=kary,knomial_1,knomial_2
360363
.MPIR_CVAR_BCAST_IS_NON_BLOCKING=1

0 commit comments

Comments
 (0)