Skip to content

Commit 031cc4e

Browse files
committed
Worker distribution: option to deprioritize hyperthreading
1 parent cd029d9 commit 031cc4e

File tree

3 files changed

+111
-24
lines changed

3 files changed

+111
-24
lines changed

libworkstream_df/config.h

+60-12
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,63 @@
7272
* - distribute_minimise_worker_communication
7373
* The workers are placed on processing units to minimise the communication
7474
* latency between all the workers
75+
*
76+
* - distribute_minimise_worker_communication_hyperthreading_last
77+
* The workers are placed on processing units to minimise the communication
78+
* latency between all the workers but selects a hyperthreaded processing unit
79+
* as a last resort
7580
*
7681
* - distribute_maximise_per_worker_resources
7782
* The workers are placed on processing units to maximise the resources
7883
* available to each worker (cache and memory)
7984
*/
8085

81-
#define WORKER_DISTRIBUTION_ALGORITHM distribute_minimise_worker_communication
86+
/* Selected worker placement strategy (one of the algorithms listed above). */
#define WORKER_DISTRIBUTION_ALGORITHM distribute_minimise_worker_communication_hyperthreading_last

/*
 * Task pushing
 */

/* Number of push slots per worker; 0 disables work pushing entirely
 * (see ALLOW_PUSHES below). */
#define NUM_PUSH_SLOTS 0

/*
 * When pushing work to another worker, only allow to push to workers that are
 * not too close. PUSH_MIN_MEM_LEVEL represents the level in the HWLOC
 * hierarchy when such push is allowed.
 */

#define PUSH_MIN_MEM_LEVEL 1

/*
 * Number of slots used to reorder pushed work; 0 disables push reordering
 * (see ALLOW_PUSH_REORDER below).
 * TODO(review): confirm the exact reordering semantics.
 */

#define NUM_PUSH_REORDER_SLOTS 0

/*
 * Frame-size thresholds for pushing: an absolute minimum size in bytes and a
 * relative factor — presumably frames below these bounds are not worth the
 * transfer cost of a push.  TODO(review): confirm against the push code.
 */

#define PUSH_MIN_FRAME_SIZE (64 * 1024)
#define PUSH_MIN_REL_FRAME_SIZE 1.3

/*
 * Push to numa node in the order of numa nodes (SEQ) or randomly.
 */

#define PUSH_EQUAL_SEQ
// #define PUSH_EQUAL_RANDOM

/*
 * Strategy used to select the destination of a push.
 * Exactly one PUSH_STRATEGY_* macro should be defined at a time.
 * TODO(review): document the individual strategies.
 */

#define PUSH_STRATEGY_MAX_WRITER
//#define PUSH_STRATEGY_OWNER
//#define PUSH_STRATEGY_SPLIT_OWNER
//#define PUSH_STRATEGY_SPLIT_OWNER_CHAIN
//#define PUSH_STRATEGY_SPLIT_OWNER_CHAIN_INNER_MW
//#define PUSH_STRATEGY_SPLIT_SCORE_NODES
82132

83133
/*********************** OpenStream Debug Options ***********************/
84134

@@ -165,16 +215,6 @@
165215

166216
/*********************** OpenStream Probably Broken Options (Post-HWLOC untested) ***********************/
167217

168-
#define PUSH_MIN_MEM_LEVEL 1
169-
#define PUSH_MIN_FRAME_SIZE (64 * 1024)
170-
#define PUSH_MIN_REL_FRAME_SIZE 1.3
171-
#define NUM_PUSH_SLOTS 0
172-
#define ALLOW_PUSHES (NUM_PUSH_SLOTS > 0)
173-
174-
#define NUM_PUSH_REORDER_SLOTS 0
175-
#define ALLOW_PUSH_REORDER (ALLOW_PUSHES && NUM_PUSH_REORDER_SLOTS > 0)
176-
177-
178218
//#define WS_PAPI_PROFILE
179219
//#define WS_PAPI_MULTIPLEX
180220

@@ -190,6 +230,10 @@
190230

191231
#define ALLOW_WQEVENT_SAMPLING (MAX_WQEVENT_SAMPLES > 0)
192232

233+
/* Work pushing is enabled iff at least one push slot is configured. */
#define ALLOW_PUSHES (NUM_PUSH_SLOTS > 0)

/* Push reordering additionally requires reorder slots to be configured. */
#define ALLOW_PUSH_REORDER (ALLOW_PUSHES && NUM_PUSH_REORDER_SLOTS > 0)
236+
193237
#ifndef IN_GCC
194238
#include <string.h>
195239

@@ -212,4 +256,8 @@
212256
#error "MATRIX_PROFILE defined, but WQUEUE_PROFILE != 1"
213257
#endif
214258

215-
#endif
259+
/* Work pushing relies on the work-queue profiling infrastructure.
 * Name the macro actually tested (ALLOW_PUSHES) in the diagnostic,
 * consistent with the MATRIX_PROFILE guard above. */
#if ALLOW_PUSHES && !WQUEUE_PROFILE
#error "ALLOW_PUSHES set, but WQUEUE_PROFILE != 1"
#endif
262+
263+
#endif

libworkstream_df/hwloc-support.c

+50-12
Original file line numberDiff line numberDiff line change
@@ -273,39 +273,73 @@ bool restrict_topology_to_glibc_cpuset(cpu_set_t set) {
273273
static void
274274
distrib_minimizing_latency(unsigned num_workers, unsigned wanted_workers,
275275
unsigned (*restrict latency_matrix)[num_workers],
276-
hwloc_cpuset_t sets[wanted_workers]) {
276+
hwloc_cpuset_t sets[wanted_workers],
277+
bool use_hyperthreaded_pu_last) {
278+
hwloc_const_cpuset_t topology_cpuset =
279+
hwloc_topology_get_complete_cpuset(machine_topology);
280+
277281
hwloc_cpuset_t logical_indexes = hwloc_bitmap_alloc();
282+
hwloc_cpuset_t tmp_set = hwloc_bitmap_alloc();
278283
hwloc_bitmap_set(logical_indexes, 0);
279284
// The greedy algorithm should work reasonably well
280285
for (unsigned i = 1; i < wanted_workers; ++i) {
281286
unsigned best_cost = UINT_MAX;
282287
unsigned best_candidate = 0;
283-
for (unsigned j = 0; j < num_workers; ++j) {
288+
hwloc_bitmap_zero(tmp_set);
289+
hwloc_bitmap_set(tmp_set, 0);
290+
hwloc_obj_t core = hwloc_get_next_obj_covering_cpuset_by_type(
291+
machine_topology, tmp_set, HWLOC_OBJ_CORE, NULL);
292+
hwloc_bitmap_and(tmp_set, core->cpuset, logical_indexes);
293+
int num_pu_used = hwloc_bitmap_weight(tmp_set);
294+
unsigned j;
295+
hwloc_bitmap_foreach_begin(j, topology_cpuset);
296+
{
284297
if (!hwloc_bitmap_isset(logical_indexes, j)) {
298+
hwloc_bitmap_zero(tmp_set);
299+
hwloc_bitmap_set(tmp_set, j);
300+
hwloc_obj_t core = hwloc_get_next_obj_covering_cpuset_by_type(
301+
machine_topology, tmp_set, HWLOC_OBJ_CORE, NULL);
302+
hwloc_bitmap_and(tmp_set, core->cpuset, logical_indexes);
303+
int num_pu_used_in_core = hwloc_bitmap_weight(tmp_set);
285304
unsigned one_of_the_chosen;
286305
unsigned j_cost = 0;
287306
hwloc_bitmap_foreach_begin(one_of_the_chosen, logical_indexes);
288-
j_cost += latency_matrix[j][one_of_the_chosen] +
289-
latency_matrix[one_of_the_chosen][j];
307+
{
308+
j_cost += latency_matrix[j][one_of_the_chosen] +
309+
latency_matrix[one_of_the_chosen][j];
310+
}
290311
hwloc_bitmap_foreach_end();
291-
if (j_cost < best_cost) {
292-
best_candidate = j;
293-
best_cost = j_cost;
312+
if (use_hyperthreaded_pu_last) {
313+
if (num_pu_used_in_core < num_pu_used ||
314+
(num_pu_used_in_core == num_pu_used && j_cost < best_cost)) {
315+
best_candidate = j;
316+
best_cost = j_cost;
317+
num_pu_used = num_pu_used_in_core;
318+
}
319+
} else {
320+
if (j_cost < best_cost) {
321+
best_candidate = j;
322+
best_cost = j_cost;
323+
}
294324
}
295325
}
296326
}
327+
hwloc_bitmap_foreach_end();
297328
assert(best_candidate != 0);
298329
hwloc_bitmap_set(logical_indexes, best_candidate);
299330
}
300331
unsigned one_of_the_chosen;
301332
unsigned num_chosen = 0;
302333
hwloc_bitmap_foreach_begin(one_of_the_chosen, logical_indexes);
303-
hwloc_obj_t pu =
304-
hwloc_get_obj_by_type(machine_topology, HWLOC_OBJ_PU, one_of_the_chosen);
305-
hwloc_bitmap_set(sets[num_chosen], pu->os_index);
306-
num_chosen++;
334+
{
335+
hwloc_obj_t pu = hwloc_get_obj_by_type(machine_topology, HWLOC_OBJ_PU,
336+
one_of_the_chosen);
337+
hwloc_bitmap_set(sets[num_chosen], pu->os_index);
338+
num_chosen++;
339+
}
307340
hwloc_bitmap_foreach_end();
308341
hwloc_bitmap_free(logical_indexes);
342+
hwloc_bitmap_free(tmp_set);
309343
}
310344

311345
bool distribute_worker_on_topology(
@@ -320,16 +354,20 @@ bool distribute_worker_on_topology(
320354
for (unsigned i = 0; i < num_workers; ++i) {
321355
distrib_sets[i] = hwloc_bitmap_alloc();
322356
}
357+
bool use_hyperthreaded_cores_last = false;
323358
switch (howto_distribute) {
324359
case distribute_maximise_per_worker_resources: {
325360
hwloc_obj_t topo_root = hwloc_get_root_obj(machine_topology);
326361
hwloc_distrib(machine_topology, &topo_root, 1u, distrib_sets, num_workers,
327362
INT_MAX, 0);
328363
} break;
364+
case distribute_minimise_worker_communication_hyperthreading_last:
365+
use_hyperthreaded_cores_last = true;
366+
// fallthrough
329367
case distribute_minimise_worker_communication: {
330368
unsigned nproc = num_available_processing_units();
331369
distrib_minimizing_latency(nproc, num_workers, pu_latency_distances_arr__,
332-
distrib_sets);
370+
distrib_sets, use_hyperthreaded_cores_last);
333371
} break;
334372
default:
335373
break;

libworkstream_df/hwloc-support.h

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ extern void *pu_bandwidth_distances_arr__;
2121
/* Strategies for placing worker threads on the machine topology. */
enum hwloc_wstream_worker_distribution_algorithm {
  /* Maximise the resources (cache and memory) available to each worker. */
  distribute_maximise_per_worker_resources,
  /* Minimise the communication latency between all the workers. */
  distribute_minimise_worker_communication,
  /* Minimise communication latency, but select a hyperthreaded processing
   * unit (one sharing a core with an already-used PU) only as a last
   * resort. */
  distribute_minimise_worker_communication_hyperthreading_last,
};
2526

2627
// Initializes the hwloc support by discovereing the current machine topology

0 commit comments

Comments
 (0)