Skip to content

Commit 031cc4e

Browse files
committed
Worker distribution: option to deprioritize hyperthreading
1 parent cd029d9 commit 031cc4e

File tree

3 files changed

+111
-24
lines changed

3 files changed

+111
-24
lines changed

libworkstream_df/config.h

+60-12
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,63 @@
7272
* - distribute_minimise_worker_communication
7373
* The workers are placed on processing units to minimise the communication
7474
* latency between all the workers
75+
*
76+
* - distribute_minimise_worker_communication_hyperthreading_last
77+
* The workers are placed on processing units to minimise the communication
78+
* latency between all the workers but selects a hyperthreaded processing unit
79+
* as a last resort
7580
*
7681
* - distribute_maximise_per_worker_resources
7782
* The workers are placed on processing units to maximise the resources
7883
* available to each worker (cache and memory)
7984
*/
8085

81-
#define WORKER_DISTRIBUTION_ALGORITHM distribute_minimise_worker_communication
86+
/* Selected worker placement strategy (one of the algorithms listed above). */
#define WORKER_DISTRIBUTION_ALGORITHM distribute_minimise_worker_communication_hyperthreading_last

/*
 * Task pushing
 */

/* Number of push slots per worker; 0 disables work pushing entirely
 * (see ALLOW_PUSHES below). */
#define NUM_PUSH_SLOTS 0

/*
 * When pushing work to another worker, only allow to push to workers that are
 * not too close. PUSH_MIN_MEM_LEVEL represents the level in the HWLOC
 * hierarchy when such push is allowed.
 */

#define PUSH_MIN_MEM_LEVEL 1

/*
 * Number of slots used to reorder pushed work; 0 disables push reordering
 * (see ALLOW_PUSH_REORDER below).
 * TODO(review): confirm the exact reordering semantics.
 */

#define NUM_PUSH_REORDER_SLOTS 0

/*
 * Frame-size thresholds for pushing: an absolute minimum size in bytes and a
 * relative factor — presumably frames below these bounds are not worth the
 * transfer cost of a push.  TODO(review): confirm against the push code.
 */

#define PUSH_MIN_FRAME_SIZE (64 * 1024)
#define PUSH_MIN_REL_FRAME_SIZE 1.3

/*
 * Push to numa node in the order of numa nodes (SEQ) or randomly.
 */

#define PUSH_EQUAL_SEQ
// #define PUSH_EQUAL_RANDOM

/*
 * Strategy used to select the destination of a push.
 * Exactly one PUSH_STRATEGY_* macro should be defined at a time.
 * TODO(review): document the individual strategies.
 */

#define PUSH_STRATEGY_MAX_WRITER
//#define PUSH_STRATEGY_OWNER
//#define PUSH_STRATEGY_SPLIT_OWNER
//#define PUSH_STRATEGY_SPLIT_OWNER_CHAIN
//#define PUSH_STRATEGY_SPLIT_OWNER_CHAIN_INNER_MW
//#define PUSH_STRATEGY_SPLIT_SCORE_NODES
82132

83133
/*********************** OpenStream Debug Options ***********************/
84134

@@ -165,16 +215,6 @@
165215

166216
/*********************** OpenStream Probably Broken Options (Post-HWLOC untested) ***********************/
167217

168-
#define PUSH_MIN_MEM_LEVEL 1
169-
#define PUSH_MIN_FRAME_SIZE (64 * 1024)
170-
#define PUSH_MIN_REL_FRAME_SIZE 1.3
171-
#define NUM_PUSH_SLOTS 0
172-
#define ALLOW_PUSHES (NUM_PUSH_SLOTS > 0)
173-
174-
#define NUM_PUSH_REORDER_SLOTS 0
175-
#define ALLOW_PUSH_REORDER (ALLOW_PUSHES && NUM_PUSH_REORDER_SLOTS > 0)
176-
177-
178218
//#define WS_PAPI_PROFILE
179219
//#define WS_PAPI_MULTIPLEX
180220

@@ -190,6 +230,10 @@
190230

191231
#define ALLOW_WQEVENT_SAMPLING (MAX_WQEVENT_SAMPLES > 0)
192232

233+
/* Work pushing is enabled iff at least one push slot is configured. */
#define ALLOW_PUSHES (NUM_PUSH_SLOTS > 0)

/* Push reordering additionally requires reorder slots to be configured. */
#define ALLOW_PUSH_REORDER (ALLOW_PUSHES && NUM_PUSH_REORDER_SLOTS > 0)
236+
193237
#ifndef IN_GCC
194238
#include <string.h>
195239

@@ -212,4 +256,8 @@
212256
#error "MATRIX_PROFILE defined, but WQUEUE_PROFILE != 1"
213257
#endif
214258

215-
#endif
259+
/* Work pushing relies on the work-queue profiling infrastructure.
 * Name the macro actually tested (ALLOW_PUSHES) in the diagnostic,
 * consistent with the MATRIX_PROFILE guard above. */
#if ALLOW_PUSHES && !WQUEUE_PROFILE
#error "ALLOW_PUSHES set, but WQUEUE_PROFILE != 1"
#endif
262+
263+
#endif

libworkstream_df/hwloc-support.c

+50-12
Original file line numberDiff line numberDiff line change
@@ -273,39 +273,73 @@ bool restrict_topology_to_glibc_cpuset(cpu_set_t set) {
273273
static void
274274
distrib_minimizing_latency(unsigned num_workers, unsigned wanted_workers,
275275
unsigned (*restrict latency_matrix)[num_workers],
276-
hwloc_cpuset_t sets[wanted_workers]) {
276+
hwloc_cpuset_t sets[wanted_workers],
277+
bool use_hyperthreaded_pu_last) {
278+
hwloc_const_cpuset_t topology_cpuset =
279+
hwloc_topology_get_complete_cpuset(machine_topology);
280+
277281
hwloc_cpuset_t logical_indexes = hwloc_bitmap_alloc();
282+
hwloc_cpuset_t tmp_set = hwloc_bitmap_alloc();
278283
hwloc_bitmap_set(logical_indexes, 0);
279284
// The greedy algorithm should work reasonably well
280285
for (unsigned i = 1; i < wanted_workers; ++i) {
281286
unsigned best_cost = UINT_MAX;
282287
unsigned best_candidate = 0;
283-
for (unsigned j = 0; j < num_workers; ++j) {
288+
hwloc_bitmap_zero(tmp_set);
289+
hwloc_bitmap_set(tmp_set, 0);
290+
hwloc_obj_t core = hwloc_get_next_obj_covering_cpuset_by_type(
291+
machine_topology, tmp_set, HWLOC_OBJ_CORE, NULL);
292+
hwloc_bitmap_and(tmp_set, core->cpuset, logical_indexes);
293+
int num_pu_used = hwloc_bitmap_weight(tmp_set);
294+
unsigned j;
295+
hwloc_bitmap_foreach_begin(j, topology_cpuset);
296+
{
284297
if (!hwloc_bitmap_isset(logical_indexes, j)) {
298+
hwloc_bitmap_zero(tmp_set);
299+
hwloc_bitmap_set(tmp_set, j);
300+
hwloc_obj_t core = hwloc_get_next_obj_covering_cpuset_by_type(
301+
machine_topology, tmp_set, HWLOC_OBJ_CORE, NULL);
302+
hwloc_bitmap_and(tmp_set, core->cpuset, logical_indexes);
303+
int num_pu_used_in_core = hwloc_bitmap_weight(tmp_set);
285304
unsigned one_of_the_chosen;
286305
unsigned j_cost = 0;
287306
hwloc_bitmap_foreach_begin(one_of_the_chosen, logical_indexes);
288-
j_cost += latency_matrix[j][one_of_the_chosen] +
289-
latency_matrix[one_of_the_chosen][j];
307+
{
308+
j_cost += latency_matrix[j][one_of_the_chosen] +
309+
latency_matrix[one_of_the_chosen][j];
310+
}
290311
hwloc_bitmap_foreach_end();
291-
if (j_cost < best_cost) {
292-
best_candidate = j;
293-
best_cost = j_cost;
312+
if (use_hyperthreaded_pu_last) {
313+
if (num_pu_used_in_core < num_pu_used ||
314+
(num_pu_used_in_core == num_pu_used && j_cost < best_cost)) {
315+
best_candidate = j;
316+
best_cost = j_cost;
317+
num_pu_used = num_pu_used_in_core;
318+
}
319+
} else {
320+
if (j_cost < best_cost) {
321+
best_candidate = j;
322+
best_cost = j_cost;
323+
}
294324
}
295325
}
296326
}
327+
hwloc_bitmap_foreach_end();
297328
assert(best_candidate != 0);
298329
hwloc_bitmap_set(logical_indexes, best_candidate);
299330
}
300331
unsigned one_of_the_chosen;
301332
unsigned num_chosen = 0;
302333
hwloc_bitmap_foreach_begin(one_of_the_chosen, logical_indexes);
303-
hwloc_obj_t pu =
304-
hwloc_get_obj_by_type(machine_topology, HWLOC_OBJ_PU, one_of_the_chosen);
305-
hwloc_bitmap_set(sets[num_chosen], pu->os_index);
306-
num_chosen++;
334+
{
335+
hwloc_obj_t pu = hwloc_get_obj_by_type(machine_topology, HWLOC_OBJ_PU,
336+
one_of_the_chosen);
337+
hwloc_bitmap_set(sets[num_chosen], pu->os_index);
338+
num_chosen++;
339+
}
307340
hwloc_bitmap_foreach_end();
308341
hwloc_bitmap_free(logical_indexes);
342+
hwloc_bitmap_free(tmp_set);
309343
}
310344

311345
bool distribute_worker_on_topology(
@@ -320,16 +354,20 @@ bool distribute_worker_on_topology(
320354
for (unsigned i = 0; i < num_workers; ++i) {
321355
distrib_sets[i] = hwloc_bitmap_alloc();
322356
}
357+
bool use_hyperthreaded_cores_last = false;
323358
switch (howto_distribute) {
324359
case distribute_maximise_per_worker_resources: {
325360
hwloc_obj_t topo_root = hwloc_get_root_obj(machine_topology);
326361
hwloc_distrib(machine_topology, &topo_root, 1u, distrib_sets, num_workers,
327362
INT_MAX, 0);
328363
} break;
364+
case distribute_minimise_worker_communication_hyperthreading_last:
365+
use_hyperthreaded_cores_last = true;
366+
// fallthrough
329367
case distribute_minimise_worker_communication: {
330368
unsigned nproc = num_available_processing_units();
331369
distrib_minimizing_latency(nproc, num_workers, pu_latency_distances_arr__,
332-
distrib_sets);
370+
distrib_sets, use_hyperthreaded_cores_last);
333371
} break;
334372
default:
335373
break;

libworkstream_df/hwloc-support.h

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ extern void *pu_bandwidth_distances_arr__;
2121
/* Strategies for placing worker threads on the machine topology. */
enum hwloc_wstream_worker_distribution_algorithm {
  /* Maximise the resources (cache and memory) available to each worker. */
  distribute_maximise_per_worker_resources,
  /* Minimise the communication latency between all the workers. */
  distribute_minimise_worker_communication,
  /* Minimise communication latency, but select a hyperthreaded processing
   * unit (one sharing a core with an already-used PU) only as a last
   * resort. */
  distribute_minimise_worker_communication_hyperthreading_last,
};
2526

2627
// Initializes the hwloc support by discovereing the current machine topology

0 commit comments

Comments
 (0)