@@ -273,39 +273,73 @@ bool restrict_topology_to_glibc_cpuset(cpu_set_t set) {
273
273
// Greedily selects `wanted_workers` processing units (PUs) out of the
// `num_workers` available ones so that the summed pairwise latency between
// the chosen PUs is kept small, then records each chosen PU's OS index in
// one entry of `sets`.
//
// Parameters:
//   num_workers               - number of PUs described by latency_matrix.
//   wanted_workers            - number of PUs to select; also the length of
//                               `sets`.
//   latency_matrix            - num_workers x num_workers table of pairwise
//                               latencies between PUs.
//   sets                      - output array of pre-allocated hwloc bitmaps;
//                               on return, sets[k] has the OS index of the
//                               k-th chosen PU set.
//   use_hyperthreaded_pu_last - when true, prefer candidates whose core holds
//                               the fewest already-chosen PUs (spread across
//                               cores before sharing a core through
//                               hyperthreading); ties are broken by latency
//                               cost. When false, pick purely by latency cost.
//
// Reads the file-global `machine_topology`. The PU with index 0 is always
// chosen first as the seed of the greedy selection.
static void
distrib_minimizing_latency(unsigned num_workers, unsigned wanted_workers,
                           unsigned (*restrict latency_matrix)[num_workers],
                           hwloc_cpuset_t sets[wanted_workers],
                           bool use_hyperthreaded_pu_last) {
  hwloc_const_cpuset_t topology_cpuset =
      hwloc_topology_get_complete_cpuset(machine_topology);

  // Bitmap of the PU indexes chosen so far; index 0 seeds the selection.
  hwloc_cpuset_t logical_indexes = hwloc_bitmap_alloc();
  // Scratch bitmap reused for core lookups and intersections below.
  hwloc_cpuset_t tmp_set = hwloc_bitmap_alloc();
  hwloc_bitmap_set(logical_indexes, 0);
  // The greedy algorithm should work reasonably well
  for (unsigned i = 1; i < wanted_workers; ++i) {
    unsigned best_cost = UINT_MAX;
    // 0 doubles as a "nothing found yet" sentinel: PU 0 is always already
    // chosen, so a valid candidate can never be 0 (asserted below).
    unsigned best_candidate = 0;
    // Baseline for the hyperthreading heuristic: how many already-chosen PUs
    // live on the core containing PU 0.
    hwloc_bitmap_zero(tmp_set);
    hwloc_bitmap_set(tmp_set, 0);
    hwloc_obj_t core = hwloc_get_next_obj_covering_cpuset_by_type(
        machine_topology, tmp_set, HWLOC_OBJ_CORE, NULL);
    hwloc_bitmap_and(tmp_set, core->cpuset, logical_indexes);
    // NOTE(review): `core->cpuset` is keyed by OS (physical) indexes while
    // `logical_indexes` holds the indexes used to address latency_matrix;
    // this intersection is only meaningful where the two numberings
    // coincide — confirm that assumption for this code base.
    int num_pu_used = hwloc_bitmap_weight(tmp_set);
    unsigned j;
    // NOTE(review): this iterates the bits of the complete topology cpuset
    // (OS indexes) but uses `j` both as a latency_matrix index and, at the
    // end of the function, as a logical index for hwloc_get_obj_by_type —
    // again assumes logical and OS numbering agree; verify.
    hwloc_bitmap_foreach_begin(j, topology_cpuset);
    {
      if (!hwloc_bitmap_isset(logical_indexes, j)) {
        // Count the already-chosen PUs sharing candidate j's core.
        hwloc_bitmap_zero(tmp_set);
        hwloc_bitmap_set(tmp_set, j);
        hwloc_obj_t core = hwloc_get_next_obj_covering_cpuset_by_type(
            machine_topology, tmp_set, HWLOC_OBJ_CORE, NULL);
        hwloc_bitmap_and(tmp_set, core->cpuset, logical_indexes);
        int num_pu_used_in_core = hwloc_bitmap_weight(tmp_set);
        // Cost of adding j: total round-trip latency between j and every
        // PU chosen so far (both matrix directions are summed).
        unsigned one_of_the_chosen;
        unsigned j_cost = 0;
        hwloc_bitmap_foreach_begin(one_of_the_chosen, logical_indexes);
        {
          j_cost += latency_matrix[j][one_of_the_chosen] +
                    latency_matrix[one_of_the_chosen][j];
        }
        hwloc_bitmap_foreach_end();
        if (use_hyperthreaded_pu_last) {
          // Prefer the least-populated core; fall back to latency cost on
          // ties so cores fill up evenly before hyperthread siblings are
          // used.
          if (num_pu_used_in_core < num_pu_used ||
              (num_pu_used_in_core == num_pu_used && j_cost < best_cost)) {
            best_candidate = j;
            best_cost = j_cost;
            num_pu_used = num_pu_used_in_core;
          }
        } else {
          if (j_cost < best_cost) {
            best_candidate = j;
            best_cost = j_cost;
          }
        }
      }
    }
    hwloc_bitmap_foreach_end();
    // A candidate must have been found each round; 0 means the loop above
    // never accepted one.
    assert(best_candidate != 0);
    hwloc_bitmap_set(logical_indexes, best_candidate);
  }
  // Translate each chosen index into its PU object's OS index, writing one
  // singleton bit per output set.
  unsigned one_of_the_chosen;
  unsigned num_chosen = 0;
  hwloc_bitmap_foreach_begin(one_of_the_chosen, logical_indexes);
  {
    hwloc_obj_t pu = hwloc_get_obj_by_type(machine_topology, HWLOC_OBJ_PU,
                                           one_of_the_chosen);
    hwloc_bitmap_set(sets[num_chosen], pu->os_index);
    num_chosen++;
  }
  hwloc_bitmap_foreach_end();
  hwloc_bitmap_free(logical_indexes);
  hwloc_bitmap_free(tmp_set);
}
310
344
311
345
bool distribute_worker_on_topology (
@@ -320,16 +354,20 @@ bool distribute_worker_on_topology(
320
354
for (unsigned i = 0 ; i < num_workers ; ++ i ) {
321
355
distrib_sets [i ] = hwloc_bitmap_alloc ();
322
356
}
357
+ bool use_hyperthreaded_cores_last = false;
323
358
switch (howto_distribute ) {
324
359
case distribute_maximise_per_worker_resources : {
325
360
hwloc_obj_t topo_root = hwloc_get_root_obj (machine_topology );
326
361
hwloc_distrib (machine_topology , & topo_root , 1u , distrib_sets , num_workers ,
327
362
INT_MAX , 0 );
328
363
} break ;
364
+ case distribute_minimise_worker_communication_hyperthreading_last :
365
+ use_hyperthreaded_cores_last = true;
366
+ // fallthrough
329
367
case distribute_minimise_worker_communication : {
330
368
unsigned nproc = num_available_processing_units ();
331
369
distrib_minimizing_latency (nproc , num_workers , pu_latency_distances_arr__ ,
332
- distrib_sets );
370
+ distrib_sets , use_hyperthreaded_cores_last );
333
371
} break ;
334
372
default :
335
373
break ;
0 commit comments