Get task pushing into working condition with hwloc

Syllo · Syllo · commit cd029d9d8a73 · 2020-01-30T18:16:24.000Z
Replace compile-time constants with dynamic values and allocations so
that task pushing works with OpenStream.
diff --git a/libworkstream_df/hwloc-support.c b/libworkstream_df/hwloc-support.c
@@ -483,4 +483,17 @@ void openstream_hwloc_cleanup(void) {
   cpuid_to_closest_numa_node = NULL;
   num_numa_nodes = 0;
   topology_depth = 0;
+}
+
+unsigned hwloc_mem_transfer_cost(unsigned numa_node_a, unsigned numa_node_b) {
+  // Transfer-cost estimate between two NUMA nodes. Best information source should be the inter-node bandwidth matrix
+  if (pu_bandwidth_matrix_size) {
+    return pu_bandwidth_distances[numa_node_a][numa_node_b]; // NOTE(review): a bandwidth entry is returned as a cost as-is; confirm larger values are meant to score worse
+  }
+  // Second-best information source: fall back to the latency matrix when no bandwidth data is present
+  if (pu_latency_matrix_size) {
+    return pu_latency_distances[numa_node_a][numa_node_b]; // NOTE(review): pu_* matrices are indexed by NUMA node ids without a range check; confirm index space and bounds
+  }
+  // Assume uniform transfer cost (1 for every pair of nodes) when no other data is available
+  return 1;
 }
diff --git a/libworkstream_df/hwloc-support.h b/libworkstream_df/hwloc-support.h
@@ -60,4 +60,6 @@ unsigned closest_numa_node_of_processing_unit(const hwloc_obj_t obj);
 
 void openstream_hwloc_cleanup(void);
 
+unsigned hwloc_mem_transfer_cost(unsigned numa_node_a, unsigned numa_node_b);
+
 #endif // HWLOC_SUPPORT_H_
diff --git a/libworkstream_df/profiling.c b/libworkstream_df/profiling.c
@@ -292,7 +292,7 @@ void init_wqueue_counters(wstream_df_thread_p th) {
 #if ALLOW_PUSHES
   th->steals_pushed = 0;
   th->pushes_fails = 0;
-  memset(th->pushes_mem, 0, sizeof(th->pushes_mem));
+  th->pushes_mem = calloc(topology_depth, sizeof(*th->pushes_mem));
 #endif
 
   th->reuse_addr = 0;
@@ -434,7 +434,7 @@ void dump_wqueue_counters (unsigned int num_workers, wstream_df_thread_p* wstrea
 {
 #ifdef WS_PAPI_PROFILE
 	#ifdef DUMP_NUMA_COUNTERS
-	for(int i = 0; i < MAX_NUMA_NODES; i++) {
+	for(int i = 0; i < num_numa_nodes; i++) {
 		dump_numa_counters_single(numa_node_by_id(i));
 	}
 	#endif
diff --git a/libworkstream_df/profiling.h b/libworkstream_df/profiling.h
@@ -12,10 +12,10 @@ struct wstream_df_thread;
 struct wstream_df_numa_node;
 extern unsigned wstream_num_workers;
 
-#if ALLOW_PUSHES
+#if ALLOW_PUSHES && WQUEUE_PROFILE
 #define WSTREAM_DF_THREAD_WQUEUE_PROFILE_PUSH_FIELDS \
 	unsigned long long steals_pushed; \
-	unsigned long long pushes_mem[MEM_NUM_LEVELS]; \
+	unsigned long long *pushes_mem; \
 	unsigned long long pushes_fails;
 #else
 #define WSTREAM_DF_THREAD_WQUEUE_PROFILE_PUSH_FIELDS
diff --git a/libworkstream_df/work_distribution.c b/libworkstream_df/work_distribution.c
@@ -118,7 +118,7 @@ void import_pushes(wstream_df_thread_p cthread)
   }
 }
 
-int work_push_beneficial_max_writer(wstream_df_frame_p fp, wstream_df_thread_p cthread, int num_workers, int* target_worker)
+int work_push_beneficial_max_writer(wstream_df_frame_p fp, wstream_df_thread_p cthread, int num_workers, unsigned* target_worker)
 {
   unsigned int max_worker;
   int max_data;
@@ -223,7 +223,7 @@ int work_push_beneficial_split_owner_chain(wstream_df_frame_p fp, wstream_df_thr
   unsigned int max_worker;
   int numa_node_id;
   int max_data;
-  size_t data[MAX_NUMA_NODES];
+  size_t data[num_numa_nodes];
   wstream_df_numa_node_p numa_node;
   unsigned int rand_idx;
 
@@ -247,7 +247,7 @@ int work_push_beneficial_split_owner_chain(wstream_df_frame_p fp, wstream_df_thr
   max_data = data[cthread->numa_node->id];
   numa_node_id = cthread->numa_node->id;
 
-  for(int i = 0; i < MAX_NUMA_NODES; i++) {
+  for(unsigned i = 0; i < num_numa_nodes; i++) {
     if((int)data[i] > max_data) {
       max_data = data[i];
       numa_node_id = i;
@@ -272,13 +272,13 @@ int work_push_beneficial_split_owner_chain_inner_mw(wstream_df_frame_p fp, wstre
   unsigned int max_worker;
   int numa_node_id;
   int max_data;
-  size_t data[MAX_NUMA_NODES];
+  size_t data[num_numa_nodes];
   wstream_df_numa_node_p numa_node;
   unsigned int rand_idx;
   int node_id;
 
 #if defined(PUSH_EQUAL_RANDOM)
-    size_t others[MAX_NUMA_NODES];
+    size_t others[num_numa_nodes];
     int num_others = 0;
 #endif
 
@@ -308,14 +308,14 @@ int work_push_beneficial_split_owner_chain_inner_mw(wstream_df_frame_p fp, wstre
   numa_node_id = cthread->numa_node->id;
 
 #if defined(PUSH_EQUAL_SEQ)
-  for(int i = 0; i < MAX_NUMA_NODES; i++) {
+  for(unsigned i = 0; i < num_numa_nodes; i++) {
     if((int)data[i] > max_data) {
       max_data = data[i];
       numa_node_id = i;
     }
   }
 #elif defined(PUSH_EQUAL_RANDOM)
-  for(int i = 0; i < MAX_NUMA_NODES; i++) {
+  for(unsigned i = 0; i < num_numa_nodes; i++) {
     if((int)data[i] > max_data)
       others[num_others++] = i;
 
@@ -368,9 +368,8 @@ int work_push_beneficial_split_score_nodes(wstream_df_frame_p fp, wstream_df_thr
 {
   unsigned int max_worker;
   int numa_node_id;
-  int max_data;
-  size_t data[MAX_NUMA_NODES];
-  size_t scores[MAX_NUMA_NODES];
+  size_t data[num_numa_nodes];
+  size_t scores[num_numa_nodes];
   size_t min_score;
   wstream_df_numa_node_p numa_node;
   int factor;
@@ -379,7 +378,7 @@ int work_push_beneficial_split_score_nodes(wstream_df_frame_p fp, wstream_df_thr
   int input_size = 0;
 
 #if defined(PUSH_EQUAL_RANDOM)
-    size_t others[MAX_NUMA_NODES];
+    size_t others[num_numa_nodes];
     int num_others = 0;
 #endif
 
@@ -390,8 +389,10 @@ int work_push_beneficial_split_score_nodes(wstream_df_frame_p fp, wstream_df_thr
     /* By default assume that data is going to be reused */
     if(vi->reuse_data_view)
       node_id = slab_numa_node_of(vi->reuse_data_view->data);
+#if USE_BROADCAST_TABLES
     else if(vi->broadcast_table) /* Peek view with deferred copy */
       node_id = -1;
+#endif // USE_BROADCAST_TABLES
     else
       node_id = slab_numa_node_of(vi->data);
 
@@ -407,22 +408,22 @@ int work_push_beneficial_split_score_nodes(wstream_df_frame_p fp, wstream_df_thr
   if(input_size < PUSH_MIN_FRAME_SIZE)
     return 0;
 
-  for(int target_node = 0; target_node < MAX_NUMA_NODES; target_node++)
-    for(int source_node = 0; source_node < MAX_NUMA_NODES; source_node++)
-      scores[target_node] += data[source_node] * mem_transfer_costs(target_node, source_node);
+  for(unsigned target_node = 0; target_node < num_numa_nodes; target_node++)
+    for(unsigned source_node = 0; source_node < num_numa_nodes; source_node++)
+      scores[target_node] += data[source_node] * hwloc_mem_transfer_cost(target_node, source_node);
 
   min_score = scores[cthread->numa_node->id];
   numa_node_id = cthread->numa_node->id;
 
 #if defined(PUSH_EQUAL_SEQ)
-  for(int i = 0; i < MAX_NUMA_NODES; i++) {
+  for(unsigned i = 0; i < num_numa_nodes; i++) {
     if(scores[i] < min_score) {
       min_score = scores[i];
       numa_node_id = i;
     }
   }
 #elif defined(PUSH_EQUAL_RANDOM)
-  for(int i = 0; i < MAX_NUMA_NODES; i++) {
+  for(int i = 0; i < num_numa_nodes; i++) {
     if(scores[i] == min_score)
       others[num_others++] = i;
 
@@ -466,7 +467,7 @@ int work_push_beneficial_split_score_nodes(wstream_df_frame_p fp, wstream_df_thr
  * of the worker suited best for execution in target_worker. Otherwise 0 is
  * returned.
  */
-int work_push_beneficial(wstream_df_frame_p fp, wstream_df_thread_p cthread, int num_workers, int* target_worker)
+int work_push_beneficial(wstream_df_frame_p fp, wstream_df_thread_p cthread, int num_workers, wstream_df_thread_p* wstream_df_worker_threads, int* target_worker)
 {
   int res;
   unsigned int lcl_target_worker;
@@ -496,7 +497,7 @@ int work_push_beneficial(wstream_df_frame_p fp, wstream_df_thread_p cthread, int
   if(/* Only migrate to a different worker */
      lcl_target_worker != cthread->worker_id &&
      /* Do not migrate to workers that are too close in the memory hierarchy */
-     mem_lowest_common_level(cthread->worker_id, worker_id_to_cpu(lcl_target_worker)) >= PUSH_MIN_MEM_LEVEL)
+     level_of_common_ancestor(cthread->cpu, wstream_df_worker_threads[lcl_target_worker]->cpu) >= PUSH_MIN_MEM_LEVEL)
     {
       *target_worker = lcl_target_worker;
       return 1;
@@ -517,7 +518,6 @@ int work_try_push(wstream_df_frame_p fp,
 {
   int level;
   int curr_owner;
-  int fp_size;
 
   /* Save current owner for statistics and update new owner */
   curr_owner = fp->last_owner;
@@ -526,11 +526,14 @@ int work_try_push(wstream_df_frame_p fp,
   /* We need to copy frame attributes used afterwards as the frame will
    * be under control of the target worker once it is pushed.
    */
-  fp_size = fp->size;
+
+#if ALLOW_WQEVENT_SAMPLING
+  int fp_size = fp->size;
+#endif // ALLOW_WQEVENT_SAMPLING
 
   if(fifo_pushback(&wstream_df_worker_threads[target_worker]->push_fifo, fp)) {
     /* Push was successful, update traces and statistics */
-    level = mem_lowest_common_level(cthread->worker_id, worker_id_to_cpu(target_worker));
+    level = level_of_common_ancestor(cthread->cpu, wstream_df_worker_threads[target_worker]->cpu);
     inc_wqueue_counter(&cthread->pushes_mem[level], 1);
 
     trace_push(cthread, target_worker, worker_id_to_cpu(target_worker), fp_size, fp);
diff --git a/libworkstream_df/work_distribution.h b/libworkstream_df/work_distribution.h
@@ -10,7 +10,7 @@ void reorder_pushes(wstream_df_thread_p cthread);
 
 #if ALLOW_PUSHES
 void import_pushes(wstream_df_thread_p cthread);
-int work_push_beneficial(wstream_df_frame_p fp, wstream_df_thread_p cthread, int num_workers, int* target_worker);
+int work_push_beneficial(wstream_df_frame_p fp, wstream_df_thread_p cthread, int num_workers, wstream_df_thread_p* wstream_df_worker_threads, int* target_worker);
 int work_try_push(wstream_df_frame_p fp, int target_worker, wstream_df_thread_p cthread, wstream_df_thread_p* wstream_df_worker_threads);
 #endif
 
diff --git a/libworkstream_df/wstream_df.c b/libworkstream_df/wstream_df.c
@@ -397,8 +397,9 @@ tdecrease_n (void *data, size_t n, bool is_write)
 #if ALLOW_PUSHES
       int target_worker;
       /* Check whether the frame should be pushed somewhere else */
-      int beneficial = work_push_beneficial(fp, cthread, wstream_num_workers,
-      &target_worker);
+      int beneficial =
+          work_push_beneficial(fp, cthread, wstream_num_workers,
+                               wstream_df_worker_threads, &target_worker);
 
 #ifdef PUSH_ONLY_IF_NOT_STOLEN_AND_CACHE_EMPTY
       int curr_stolen = (cthread->current_frame &&
@@ -904,11 +905,12 @@ __attribute__((__optimize__("O1"))) static void worker_thread(void) {
 
   trace_state_change(cthread, WORKER_STATE_SEEKING);
   while (true) {
-    if (cthread->yield)
+    if (cthread->yield) {
       while (true) {
         struct timespec ts = {.tv_sec = 0, .tv_nsec = 100000000};
         nanosleep(&ts, NULL);
       }
+    }
 
 #if ALLOW_PUSHES
 #if !ALLOW_PUSH_REORDER