openmp/runtime/src/kmp_dispatch_hier.h

   1 /*
   2  * kmp_dispatch_hier.h -- hierarchical scheduling methods and data structures
   3  */
   4
   5 //===----------------------------------------------------------------------===//
   6 //
   7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   8 // See https://llvm.org/LICENSE.txt for license information.
   9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #ifndef KMP_DISPATCH_HIER_H
  14 #define KMP_DISPATCH_HIER_H
  15 #include "kmp.h"
  16 #include "kmp_dispatch.h"
  17
  18 // Layer type for scheduling hierarchy
  19 enum kmp_hier_layer_e {
  20   LAYER_THREAD = -1,
  21   LAYER_L1,
  22   LAYER_L2,
  23   LAYER_L3,
  24   LAYER_NUMA,
  25   LAYER_LOOP,
  26   LAYER_LAST
  27 };
  28
  29 // Convert hierarchy type (LAYER_L1, LAYER_L2, etc.) to C-style string
  30 static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) {
  31   switch (type) {
  32   case kmp_hier_layer_e::LAYER_THREAD:
  33     return "THREAD";
  34   case kmp_hier_layer_e::LAYER_L1:
  35     return "L1";
  36   case kmp_hier_layer_e::LAYER_L2:
  37     return "L2";
  38   case kmp_hier_layer_e::LAYER_L3:
  39     return "L3";
  40   case kmp_hier_layer_e::LAYER_NUMA:
  41     return "NUMA";
  42   case kmp_hier_layer_e::LAYER_LOOP:
  43     return "WHOLE_LOOP";
  44   case kmp_hier_layer_e::LAYER_LAST:
  45     return "LAST";
  46   }
  47   KMP_ASSERT(0);
  48   // Appease compilers, should never get here
  49   return "ERROR";
  50 }
  51
  52 // Structure to store values parsed from OMP_SCHEDULE for scheduling hierarchy
  53 typedef struct kmp_hier_sched_env_t {
  54   int size;
  55   int capacity;
  56   enum sched_type *scheds;
  57   kmp_int32 *small_chunks;
  58   kmp_int64 *large_chunks;
  59   kmp_hier_layer_e *layers;
  60   // Append a level of the hierarchy
  61   void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) {
  62     if (capacity == 0) {
  63       scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) *
  64                                                  kmp_hier_layer_e::LAYER_LAST);
  65       small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) *
  66                                                  kmp_hier_layer_e::LAYER_LAST);
  67       large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) *
  68                                                  kmp_hier_layer_e::LAYER_LAST);
  69       layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) *
  70                                                   kmp_hier_layer_e::LAYER_LAST);
  71       capacity = kmp_hier_layer_e::LAYER_LAST;
  72     }
  73     int current_size = size;
  74     KMP_DEBUG_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST);
  75     scheds[current_size] = sched;
  76     layers[current_size] = layer;
  77     small_chunks[current_size] = chunk;
  78     large_chunks[current_size] = (kmp_int64)chunk;
  79     size++;
  80   }
  81   // Sort the hierarchy using selection sort, size will always be small
  82   // (less than LAYER_LAST) so it is not necessary to use an nlog(n) algorithm
  83   void sort() {
  84     if (size <= 1)
  85       return;
  86     for (int i = 0; i < size; ++i) {
  87       int switch_index = i;
  88       for (int j = i + 1; j < size; ++j) {
  89         if (layers[j] < layers[switch_index])
  90           switch_index = j;
  91       }
  92       if (switch_index != i) {
  93         kmp_hier_layer_e temp1 = layers[i];
  94         enum sched_type temp2 = scheds[i];
  95         kmp_int32 temp3 = small_chunks[i];
  96         kmp_int64 temp4 = large_chunks[i];
  97         layers[i] = layers[switch_index];
  98         scheds[i] = scheds[switch_index];
  99         small_chunks[i] = small_chunks[switch_index];
 100         large_chunks[i] = large_chunks[switch_index];
 101         layers[switch_index] = temp1;
 102         scheds[switch_index] = temp2;
 103         small_chunks[switch_index] = temp3;
 104         large_chunks[switch_index] = temp4;
 105       }
 106     }
 107   }
 108   // Free all memory
 109   void deallocate() {
 110     if (capacity > 0) {
 111       __kmp_free(scheds);
 112       __kmp_free(layers);
 113       __kmp_free(small_chunks);
 114       __kmp_free(large_chunks);
 115       scheds = NULL;
 116       layers = NULL;
 117       small_chunks = NULL;
 118       large_chunks = NULL;
 119     }
 120     size = 0;
 121     capacity = 0;
 122   }
 123 } kmp_hier_sched_env_t;
 124
 125 extern int __kmp_dispatch_hand_threading;
 126 extern kmp_hier_sched_env_t __kmp_hier_scheds;
 127
 128 // Sizes of layer arrays bounded by max number of detected L1s, L2s, etc.
 129 extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
 130 extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];
 131
 132 extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type);
 133 extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type);
 134 extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1,
 135                                         kmp_hier_layer_e t2);
 136 extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team);
 137
 138 template <typename T> struct kmp_hier_shared_bdata_t {
 139   typedef typename traits_t<T>::signed_t ST;
 140   volatile kmp_uint64 val[2];
 141   kmp_int32 status[2];
 142   T lb[2];
 143   T ub[2];
 144   ST st[2];
 145   dispatch_shared_info_template<T> sh[2];
 146   void zero() {
 147     val[0] = val[1] = 0;
 148     status[0] = status[1] = 0;
 149     lb[0] = lb[1] = 0;
 150     ub[0] = ub[1] = 0;
 151     st[0] = st[1] = 0;
 152     sh[0].u.s.iteration = sh[1].u.s.iteration = 0;
 153   }
 154   void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus,
 155                             kmp_uint64 index) {
 156     lb[1 - index] = nlb;
 157     ub[1 - index] = nub;
 158     st[1 - index] = nst;
 159     status[1 - index] = nstatus;
 160   }
 161   void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) {
 162     lb[1 - index] = nlb;
 163     ub[1 - index] = nub;
 164     st[1 - index] = nst;
 165     status[1 - index] = nstatus;
 166     sh[1 - index].u.s.iteration = 0;
 167   }
 168
 169   kmp_int32 get_next_status(kmp_uint64 index) const {
 170     return status[1 - index];
 171   }
 172   T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; }
 173   T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; }
 174   ST get_next_st(kmp_uint64 index) const { return st[1 - index]; }
 175   dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
 176     return &(sh[1 - index]);
 177   }
 178
 179   kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; }
 180   T get_curr_lb(kmp_uint64 index) const { return lb[index]; }
 181   T get_curr_ub(kmp_uint64 index) const { return ub[index]; }
 182   ST get_curr_st(kmp_uint64 index) const { return st[index]; }
 183   dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
 184     return &(sh[index]);
 185   }
 186 };
 187
 188 /*
 189  * In the barrier implementations, num_active is the number of threads that are
 190  * attached to the kmp_hier_top_unit_t structure in the scheduling hierarchy.
 191  * bdata is the shared barrier data that resides on the kmp_hier_top_unit_t
 192  * structure. tdata is the thread private data that resides on the thread
 193  * data structure.
 194  *
 195  * The reset_shared() method is used to initialize the barrier data on the
 196  * kmp_hier_top_unit_t hierarchy structure
 197  *
 198  * The reset_private() method is used to initialize the barrier data on the
 199  * thread's private dispatch buffer structure
 200  *
 201  * The barrier() method takes an id, which is that thread's id for the
 202  * kmp_hier_top_unit_t structure, and implements the barrier.  All threads wait
 203  * inside barrier() until all fellow threads who are attached to that
 204  * kmp_hier_top_unit_t structure have arrived.
 205  */
 206
 207 // Core barrier implementation
 208 // Can be used in a unit with between 2 to 8 threads
 209 template <typename T> class core_barrier_impl {
 210   static inline kmp_uint64 get_wait_val(int num_active) {
 211     kmp_uint64 wait_val = 0LL;
 212     switch (num_active) {
 213     case 2:
 214       wait_val = 0x0101LL;
 215       break;
 216     case 3:
 217       wait_val = 0x010101LL;
 218       break;
 219     case 4:
 220       wait_val = 0x01010101LL;
 221       break;
 222     case 5:
 223       wait_val = 0x0101010101LL;
 224       break;
 225     case 6:
 226       wait_val = 0x010101010101LL;
 227       break;
 228     case 7:
 229       wait_val = 0x01010101010101LL;
 230       break;
 231     case 8:
 232       wait_val = 0x0101010101010101LL;
 233       break;
 234     default:
 235       // don't use the core_barrier_impl for more than 8 threads
 236       KMP_ASSERT(0);
 237     }
 238     return wait_val;
 239   }
 240
 241 public:
 242   static void reset_private(kmp_int32 num_active,
 243                             kmp_hier_private_bdata_t *tdata);
 244   static void reset_shared(kmp_int32 num_active,
 245                            kmp_hier_shared_bdata_t<T> *bdata);
 246   static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
 247                       kmp_hier_private_bdata_t *tdata);
 248 };
 249
 250 template <typename T>
 251 void core_barrier_impl<T>::reset_private(kmp_int32 num_active,
 252                                          kmp_hier_private_bdata_t *tdata) {
 253   tdata->num_active = num_active;
 254   tdata->index = 0;
 255   tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active);
 256 }
 257 template <typename T>
 258 void core_barrier_impl<T>::reset_shared(kmp_int32 num_active,
 259                                         kmp_hier_shared_bdata_t<T> *bdata) {
 260   bdata->val[0] = bdata->val[1] = 0LL;
 261   bdata->status[0] = bdata->status[1] = 0LL;
 262 }
 263 template <typename T>
 264 void core_barrier_impl<T>::barrier(kmp_int32 id,
 265                                    kmp_hier_shared_bdata_t<T> *bdata,
 266                                    kmp_hier_private_bdata_t *tdata) {
 267   kmp_uint64 current_index = tdata->index;
 268   kmp_uint64 next_index = 1 - current_index;
 269   kmp_uint64 current_wait_value = tdata->wait_val[current_index];
 270   kmp_uint64 next_wait_value =
 271       (current_wait_value ? 0 : get_wait_val(tdata->num_active));
 272   KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu "
 273                 "next_index:%llu curr_wait:%llu next_wait:%llu\n",
 274                 __kmp_get_gtid(), current_index, next_index, current_wait_value,
 275                 next_wait_value));
 276   char v = (current_wait_value ? '\1' : '\0');
 277   (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
 278   __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
 279                          __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
 280   tdata->wait_val[current_index] = next_wait_value;
 281   tdata->index = next_index;
 282 }
 283
 284 // Counter barrier implementation
 285 // Can be used in a unit with arbitrary number of active threads
 286 template <typename T> class counter_barrier_impl {
 287 public:
 288   static void reset_private(kmp_int32 num_active,
 289                             kmp_hier_private_bdata_t *tdata);
 290   static void reset_shared(kmp_int32 num_active,
 291                            kmp_hier_shared_bdata_t<T> *bdata);
 292   static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
 293                       kmp_hier_private_bdata_t *tdata);
 294 };
 295
 296 template <typename T>
 297 void counter_barrier_impl<T>::reset_private(kmp_int32 num_active,
 298                                             kmp_hier_private_bdata_t *tdata) {
 299   tdata->num_active = num_active;
 300   tdata->index = 0;
 301   tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active;
 302 }
 303 template <typename T>
 304 void counter_barrier_impl<T>::reset_shared(kmp_int32 num_active,
 305                                            kmp_hier_shared_bdata_t<T> *bdata) {
 306   bdata->val[0] = bdata->val[1] = 0LL;
 307   bdata->status[0] = bdata->status[1] = 0LL;
 308 }
 309 template <typename T>
 310 void counter_barrier_impl<T>::barrier(kmp_int32 id,
 311                                       kmp_hier_shared_bdata_t<T> *bdata,
 312                                       kmp_hier_private_bdata_t *tdata) {
 313   volatile kmp_int64 *val;
 314   kmp_uint64 current_index = tdata->index;
 315   kmp_uint64 next_index = 1 - current_index;
 316   kmp_uint64 current_wait_value = tdata->wait_val[current_index];
 317   kmp_uint64 next_wait_value = current_wait_value + tdata->num_active;
 318
 319   KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu "
 320                 "next_index:%llu curr_wait:%llu next_wait:%llu\n",
 321                 __kmp_get_gtid(), current_index, next_index, current_wait_value,
 322                 next_wait_value));
 323   val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index]));
 324   KMP_TEST_THEN_INC64(val);
 325   __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
 326                          __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
 327   tdata->wait_val[current_index] = next_wait_value;
 328   tdata->index = next_index;
 329 }
 330
 331 // Data associated with topology unit within a layer
 332 // For example, one kmp_hier_top_unit_t corresponds to one L1 cache
 333 template <typename T> struct kmp_hier_top_unit_t {
 334   typedef typename traits_t<T>::signed_t ST;
 335   typedef typename traits_t<T>::unsigned_t UT;
 336   kmp_int32 active; // number of topology units that communicate with this unit
 337   // chunk information (lower/upper bound, stride, etc.)
 338   dispatch_private_info_template<T> hier_pr;
 339   kmp_hier_top_unit_t<T> *hier_parent; // pointer to parent unit
 340   kmp_hier_shared_bdata_t<T> hier_barrier; // shared barrier data for this unit
 341
 342   kmp_int32 get_hier_id() const { return hier_pr.hier_id; }
 343   void reset_shared_barrier() {
 344     KMP_DEBUG_ASSERT(active > 0);
 345     if (active == 1)
 346       return;
 347     hier_barrier.zero();
 348     if (active >= 2 && active <= 8) {
 349       core_barrier_impl<T>::reset_shared(active, &hier_barrier);
 350     } else {
 351       counter_barrier_impl<T>::reset_shared(active, &hier_barrier);
 352     }
 353   }
 354   void reset_private_barrier(kmp_hier_private_bdata_t *tdata) {
 355     KMP_DEBUG_ASSERT(tdata);
 356     KMP_DEBUG_ASSERT(active > 0);
 357     if (active == 1)
 358       return;
 359     if (active >= 2 && active <= 8) {
 360       core_barrier_impl<T>::reset_private(active, tdata);
 361     } else {
 362       counter_barrier_impl<T>::reset_private(active, tdata);
 363     }
 364   }
 365   void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) {
 366     KMP_DEBUG_ASSERT(tdata);
 367     KMP_DEBUG_ASSERT(active > 0);
 368     KMP_DEBUG_ASSERT(id >= 0 && id < active);
 369     if (active == 1) {
 370       tdata->index = 1 - tdata->index;
 371       return;
 372     }
 373     if (active >= 2 && active <= 8) {
 374       core_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
 375     } else {
 376       counter_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
 377     }
 378   }
 379
 380   kmp_int32 get_next_status(kmp_uint64 index) const {
 381     return hier_barrier.get_next_status(index);
 382   }
 383   T get_next_lb(kmp_uint64 index) const {
 384     return hier_barrier.get_next_lb(index);
 385   }
 386   T get_next_ub(kmp_uint64 index) const {
 387     return hier_barrier.get_next_ub(index);
 388   }
 389   ST get_next_st(kmp_uint64 index) const {
 390     return hier_barrier.get_next_st(index);
 391   }
 392   dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
 393     return hier_barrier.get_next_sh(index);
 394   }
 395
 396   kmp_int32 get_curr_status(kmp_uint64 index) const {
 397     return hier_barrier.get_curr_status(index);
 398   }
 399   T get_curr_lb(kmp_uint64 index) const {
 400     return hier_barrier.get_curr_lb(index);
 401   }
 402   T get_curr_ub(kmp_uint64 index) const {
 403     return hier_barrier.get_curr_ub(index);
 404   }
 405   ST get_curr_st(kmp_uint64 index) const {
 406     return hier_barrier.get_curr_st(index);
 407   }
 408   dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
 409     return hier_barrier.get_curr_sh(index);
 410   }
 411
 412   void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status,
 413                             kmp_uint64 index) {
 414     hier_barrier.set_next_hand_thread(lb, ub, st, status, index);
 415   }
 416   void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) {
 417     hier_barrier.set_next(lb, ub, st, status, index);
 418   }
 419   dispatch_private_info_template<T> *get_my_pr() { return &hier_pr; }
 420   kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
 421   dispatch_private_info_template<T> *get_parent_pr() {
 422     return &(hier_parent->hier_pr);
 423   }
 424
 425   kmp_int32 is_active() const { return active; }
 426   kmp_int32 get_num_active() const { return active; }
 427 #ifdef KMP_DEBUG
 428   void print() {
 429     KD_TRACE(
 430         10,
 431         ("    kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n",
 432          active, &hier_pr, hier_pr.u.p.lb, hier_pr.u.p.ub, hier_pr.u.p.st,
 433          hier_pr.u.p.tc));
 434   }
 435 #endif
 436 };
 437
 438 // Information regarding a single layer within the scheduling hierarchy
 439 template <typename T> struct kmp_hier_layer_info_t {
 440   int num_active; // number of threads active in this level
 441   kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc.
 442   enum sched_type sched; // static, dynamic, guided, etc.
 443   typename traits_t<T>::signed_t chunk; // chunk size associated with schedule
 444   int length; // length of the kmp_hier_top_unit_t array
 445
 446 #ifdef KMP_DEBUG
 447   // Print this layer's information
 448   void print() {
 449     const char *t = __kmp_get_hier_str(type);
 450     KD_TRACE(
 451         10,
 452         ("    kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d "
 453          "length:%d\n",
 454          num_active, t, sched, chunk, length));
 455   }
 456 #endif
 457 };
 458
 459 /*
 460  * Structure to implement entire hierarchy
 461  *
 462  * The hierarchy is kept as an array of arrays to represent the different
 463  * layers.  Layer 0 is the lowest layer to layer num_layers - 1 which is the
 464  * highest layer.
 465  * Example:
 466  * [ 2 ] -> [ L3 | L3 ]
 467  * [ 1 ] -> [ L2 | L2 | L2 | L2 ]
 468  * [ 0 ] -> [ L1 | L1 | L1 | L1 | L1 | L1 | L1 | L1 ]
 469  * There is also an array of layer_info_t which has information regarding
 470  * each layer
 471  */
 472 template <typename T> struct kmp_hier_t {
 473 public:
 474   typedef typename traits_t<T>::unsigned_t UT;
 475   typedef typename traits_t<T>::signed_t ST;
 476
 477 private:
 478   int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t<T> *current,
 479                    kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st,
 480                    kmp_int32 previous_id, int hier_level) {
 481     int status;
 482     kmp_info_t *th = __kmp_threads[gtid];
 483     auto parent = current->get_parent();
 484     bool last_layer = (hier_level == get_num_layers() - 1);
 485     KMP_DEBUG_ASSERT(th);
 486     kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]);
 487     KMP_DEBUG_ASSERT(current);
 488     KMP_DEBUG_ASSERT(hier_level >= 0);
 489     KMP_DEBUG_ASSERT(hier_level < get_num_layers());
 490     KMP_DEBUG_ASSERT(tdata);
 491     KMP_DEBUG_ASSERT(parent || last_layer);
 492
 493     KD_TRACE(
 494         1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level));
 495
 496     T hier_id = (T)current->get_hier_id();
 497     // Attempt to grab next iteration range for this level
 498     if (previous_id == 0) {
 499       KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is primary of unit\n",
 500                    gtid, hier_level));
 501       kmp_int32 contains_last;
 502       T my_lb, my_ub;
 503       ST my_st;
 504       T nproc;
 505       dispatch_shared_info_template<T> volatile *my_sh;
 506       dispatch_private_info_template<T> *my_pr;
 507       if (last_layer) {
 508         // last layer below the very top uses the single shared buffer
 509         // from the team struct.
 510         KD_TRACE(10,
 511                  ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n",
 512                   gtid, hier_level));
 513         my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
 514             th->th.th_dispatch->th_dispatch_sh_current);
 515         nproc = (T)get_top_level_nproc();
 516       } else {
 517         // middle layers use the shared buffer inside the kmp_hier_top_unit_t
 518         // structure
 519         KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n",
 520                       gtid, hier_level));
 521         my_sh =
 522             parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index);
 523         nproc = (T)parent->get_num_active();
 524       }
 525       my_pr = current->get_my_pr();
 526       KMP_DEBUG_ASSERT(my_sh);
 527       KMP_DEBUG_ASSERT(my_pr);
 528       enum sched_type schedule = get_sched(hier_level);
 529       ST chunk = (ST)get_chunk(hier_level);
 530       status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh,
 531                                                 &contains_last, &my_lb, &my_ub,
 532                                                 &my_st, nproc, hier_id);
 533       KD_TRACE(
 534           10,
 535           ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n",
 536            gtid, hier_level, status));
 537       // When no iterations are found (status == 0) and this is not the last
 538       // layer, attempt to go up the hierarchy for more iterations
 539       if (status == 0 && !last_layer) {
 540         kmp_int32 hid;
 541         __kmp_type_convert(hier_id, &hid);
 542         status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub,
 543                               &my_st, hid, hier_level + 1);
 544         KD_TRACE(
 545             10,
 546             ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n",
 547              gtid, hier_level, status));
 548         if (status == 1) {
 549           kmp_hier_private_bdata_t *upper_tdata =
 550               &(th->th.th_hier_bar_data[hier_level + 1]);
 551           my_sh = parent->get_curr_sh(upper_tdata->index);
 552           KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n",
 553                         gtid, hier_level));
 554           __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule,
 555                                         parent->get_curr_lb(upper_tdata->index),
 556                                         parent->get_curr_ub(upper_tdata->index),
 557                                         parent->get_curr_st(upper_tdata->index),
 558 #if USE_ITT_BUILD
 559                                         NULL,
 560 #endif
 561                                         chunk, nproc, hier_id);
 562           status = __kmp_dispatch_next_algorithm<T>(
 563               gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc,
 564               hier_id);
 565           if (!status) {
 566             KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 "
 567                           "setting to 2!\n",
 568                           gtid, hier_level));
 569             status = 2;
 570           }
 571         }
 572       }
 573       current->set_next(my_lb, my_ub, my_st, status, tdata->index);
 574       // Propagate whether a unit holds the actual global last iteration
 575       // The contains_last attribute is sent downwards from the top to the
 576       // bottom of the hierarchy via the contains_last flag inside the
 577       // private dispatch buffers in the hierarchy's middle layers
 578       if (contains_last) {
 579         // If the next_algorithm() method returns 1 for p_last and it is the
 580         // last layer or our parent contains the last serial chunk, then the
 581         // chunk must contain the last serial iteration.
 582         if (last_layer || parent->hier_pr.flags.contains_last) {
 583           KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr "
 584                         "to contain last.\n",
 585                         gtid, hier_level));
 586           current->hier_pr.flags.contains_last = contains_last;
 587         }
 588         if (!current->hier_pr.flags.contains_last)
 589           contains_last = FALSE;
 590       }
 591       if (p_last)
 592         *p_last = contains_last;
 593     } // if primary thread of this unit
 594     if (hier_level > 0 || !__kmp_dispatch_hand_threading) {
 595       KD_TRACE(10,
 596                ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
 597                 gtid, hier_level));
 598       current->barrier(previous_id, tdata);
 599       KD_TRACE(10,
 600                ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n",
 601                 gtid, hier_level, current->get_curr_status(tdata->index)));
 602     } else {
 603       KMP_DEBUG_ASSERT(previous_id == 0);
 604       return status;
 605     }
 606     return current->get_curr_status(tdata->index);
 607   }
 608
 609 public:
 610   int top_level_nproc;
 611   int num_layers;
 612   bool valid;
 613   int type_size;
 614   kmp_hier_layer_info_t<T> *info;
 615   kmp_hier_top_unit_t<T> **layers;
 616   // Deallocate all memory from this hierarchy
 617   void deallocate() {
 618     for (int i = 0; i < num_layers; ++i)
 619       if (layers[i] != NULL) {
 620         __kmp_free(layers[i]);
 621       }
 622     if (layers != NULL) {
 623       __kmp_free(layers);
 624       layers = NULL;
 625     }
 626     if (info != NULL) {
 627       __kmp_free(info);
 628       info = NULL;
 629     }
 630     num_layers = 0;
 631     valid = false;
 632   }
 633   // Returns true if reallocation is needed else false
 634   bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers,
 635                           const enum sched_type *new_scheds,
 636                           const ST *new_chunks) const {
 637     if (!valid || layers == NULL || info == NULL ||
 638         traits_t<T>::type_size != type_size || n != num_layers)
 639       return true;
 640     for (int i = 0; i < n; ++i) {
 641       if (info[i].type != new_layers[i])
 642         return true;
 643       if (info[i].sched != new_scheds[i])
 644         return true;
 645       if (info[i].chunk != new_chunks[i])
 646         return true;
 647     }
 648     return false;
 649   }
 650   // A single thread should call this function while the other threads wait
 651   // create a new scheduling hierarchy consisting of new_layers, new_scheds
 652   // and new_chunks.  These should come pre-sorted according to
 653   // kmp_hier_layer_e value.  This function will try to avoid reallocation
 654   // if it can
 655   void allocate_hier(int n, const kmp_hier_layer_e *new_layers,
 656                      const enum sched_type *new_scheds, const ST *new_chunks) {
 657     top_level_nproc = 0;
 658     if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) {
 659       KD_TRACE(
 660           10,
 661           ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n"));
 662       for (int i = 0; i < n; ++i) {
 663         info[i].num_active = 0;
 664         for (int j = 0; j < get_length(i); ++j)
 665           layers[i][j].active = 0;
 666       }
 667       return;
 668     }
 669     KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n"));
 670     deallocate();
 671     type_size = traits_t<T>::type_size;
 672     num_layers = n;
 673     info = (kmp_hier_layer_info_t<T> *)__kmp_allocate(
 674         sizeof(kmp_hier_layer_info_t<T>) * n);
 675     layers = (kmp_hier_top_unit_t<T> **)__kmp_allocate(
 676         sizeof(kmp_hier_top_unit_t<T> *) * n);
 677     for (int i = 0; i < n; ++i) {
 678       int max = 0;
 679       kmp_hier_layer_e layer = new_layers[i];
 680       info[i].num_active = 0;
 681       info[i].type = layer;
 682       info[i].sched = new_scheds[i];
 683       info[i].chunk = new_chunks[i];
 684       max = __kmp_hier_max_units[layer + 1];
 685       if (max == 0) {
 686         valid = false;
 687         KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer));
 688         deallocate();
 689         return;
 690       }
 691       info[i].length = max;
 692       layers[i] = (kmp_hier_top_unit_t<T> *)__kmp_allocate(
 693           sizeof(kmp_hier_top_unit_t<T>) * max);
 694       for (int j = 0; j < max; ++j) {
 695         layers[i][j].active = 0;
 696         layers[i][j].hier_pr.flags.use_hier = TRUE;
 697       }
 698     }
 699     valid = true;
 700   }
 701   // loc - source file location
 702   // gtid - global thread identifier
 703   // pr - this thread's private dispatch buffer (corresponding with gtid)
 704   // p_last (return value) - pointer to flag indicating this set of iterations
 705   // contains last
 706   //          iteration
 707   // p_lb (return value) - lower bound for this chunk of iterations
 708   // p_ub (return value) - upper bound for this chunk of iterations
 709   // p_st (return value) - stride for this chunk of iterations
 710   //
 711   // Returns 1 if there are more iterations to perform, 0 otherwise
 712   int next(ident_t *loc, int gtid, dispatch_private_info_template<T> *pr,
 713            kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) {
 714     int status;
 715     kmp_int32 contains_last = 0;
 716     kmp_info_t *th = __kmp_threads[gtid];
 717     kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]);
 718     auto parent = pr->get_parent();
 719     KMP_DEBUG_ASSERT(parent);
 720     KMP_DEBUG_ASSERT(th);
 721     KMP_DEBUG_ASSERT(tdata);
 722     KMP_DEBUG_ASSERT(parent);
 723     T nproc = (T)parent->get_num_active();
 724     T unit_id = (T)pr->get_hier_id();
 725     KD_TRACE(
 726         10,
 727         ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n",
 728          gtid, nproc, unit_id));
 729     // Handthreading implementation
 730     // Each iteration is performed by all threads on last unit (typically
 731     // cores/tiles)
 732     // e.g., threads 0,1,2,3 all execute iteration 0
 733     //       threads 0,1,2,3 all execute iteration 1
 734     //       threads 4,5,6,7 all execute iteration 2
 735     //       threads 4,5,6,7 all execute iteration 3
 736     //       ... etc.
 737     if (__kmp_dispatch_hand_threading) {
 738       KD_TRACE(10,
 739                ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n",
 740                 gtid));
 741       if (unit_id == 0) {
 742         // For hand threading, the sh buffer on the lowest level is only ever
 743         // modified and read by the primary thread on that level.  Because of
 744         // this, we can always use the first sh buffer.
 745         auto sh = &(parent->hier_barrier.sh[0]);
 746         KMP_DEBUG_ASSERT(sh);
 747         status = __kmp_dispatch_next_algorithm<T>(
 748             gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
 749         if (!status) {
 750           bool done = false;
 751           while (!done) {
 752             done = true;
 753             kmp_int32 uid;
 754             __kmp_type_convert(unit_id, &uid);
 755             status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
 756                                   p_st, uid, 0);
 757             if (status == 1) {
 758               __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
 759                                             parent->get_next_lb(tdata->index),
 760                                             parent->get_next_ub(tdata->index),
 761                                             parent->get_next_st(tdata->index),
 762 #if USE_ITT_BUILD
 763                                             NULL,
 764 #endif
 765                                             pr->u.p.parm1, nproc, unit_id);
 766               sh->u.s.iteration = 0;
 767               status = __kmp_dispatch_next_algorithm<T>(
 768                   gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc,
 769                   unit_id);
 770               if (!status) {
 771                 KD_TRACE(10,
 772                          ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
 773                           "after next_pr_sh()"
 774                           "trying again.\n",
 775                           gtid));
 776                 done = false;
 777               }
 778             } else if (status == 2) {
 779               KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
 780                             "trying again.\n",
 781                             gtid));
 782               done = false;
 783             }
 784           }
 785         }
 786         parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
 787       } // if primary thread of lowest unit level
 788       parent->barrier(pr->get_hier_id(), tdata);
 789       if (unit_id != 0) {
 790         *p_lb = parent->get_curr_lb(tdata->index);
 791         *p_ub = parent->get_curr_ub(tdata->index);
 792         *p_st = parent->get_curr_st(tdata->index);
 793         status = parent->get_curr_status(tdata->index);
 794       }
 795     } else {
 796       // Normal implementation
 797       // Each thread grabs an iteration chunk and executes it (no cooperation)
 798       auto sh = parent->get_curr_sh(tdata->index);
 799       KMP_DEBUG_ASSERT(sh);
 800       status = __kmp_dispatch_next_algorithm<T>(
 801           gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
 802       KD_TRACE(10,
 803                ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d "
 804                 "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n",
 805                 gtid, status, contains_last, *p_lb, *p_ub, *p_st));
 806       if (!status) {
 807         bool done = false;
 808         while (!done) {
 809           done = true;
 810           kmp_int32 uid;
 811           __kmp_type_convert(unit_id, &uid);
 812           status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
 813                                 p_st, uid, 0);
 814           if (status == 1) {
 815             sh = parent->get_curr_sh(tdata->index);
 816             __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
 817                                           parent->get_curr_lb(tdata->index),
 818                                           parent->get_curr_ub(tdata->index),
 819                                           parent->get_curr_st(tdata->index),
 820 #if USE_ITT_BUILD
 821                                           NULL,
 822 #endif
 823                                           pr->u.p.parm1, nproc, unit_id);
 824             status = __kmp_dispatch_next_algorithm<T>(
 825                 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
 826             if (!status) {
 827               KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
 828                             "after next_pr_sh()"
 829                             "trying again.\n",
 830                             gtid));
 831               done = false;
 832             }
 833           } else if (status == 2) {
 834             KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
 835                           "trying again.\n",
 836                           gtid));
 837             done = false;
 838           }
 839         }
 840       }
 841     }
 842     if (contains_last && !parent->hier_pr.flags.contains_last) {
 843       KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting "
 844                     "contains_last to FALSE\n",
 845                     gtid));
 846       contains_last = FALSE;
 847     }
 848     if (p_last)
 849       *p_last = contains_last;
 850     KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid,
 851                   status));
 852     return status;
 853   }
 854   // These functions probe the layer info structure
 855   // Returns the type of topology unit given level
 856   kmp_hier_layer_e get_type(int level) const {
 857     KMP_DEBUG_ASSERT(level >= 0);
 858     KMP_DEBUG_ASSERT(level < num_layers);
 859     return info[level].type;
 860   }
 861   // Returns the schedule type at given level
 862   enum sched_type get_sched(int level) const {
 863     KMP_DEBUG_ASSERT(level >= 0);
 864     KMP_DEBUG_ASSERT(level < num_layers);
 865     return info[level].sched;
 866   }
 867   // Returns the chunk size at given level
 868   ST get_chunk(int level) const {
 869     KMP_DEBUG_ASSERT(level >= 0);
 870     KMP_DEBUG_ASSERT(level < num_layers);
 871     return info[level].chunk;
 872   }
 873   // Returns the number of active threads at given level
 874   int get_num_active(int level) const {
 875     KMP_DEBUG_ASSERT(level >= 0);
 876     KMP_DEBUG_ASSERT(level < num_layers);
 877     return info[level].num_active;
 878   }
 879   // Returns the length of topology unit array at given level
 880   int get_length(int level) const {
 881     KMP_DEBUG_ASSERT(level >= 0);
 882     KMP_DEBUG_ASSERT(level < num_layers);
 883     return info[level].length;
 884   }
 885   // Returns the topology unit given the level and index
 886   kmp_hier_top_unit_t<T> *get_unit(int level, int index) {
 887     KMP_DEBUG_ASSERT(level >= 0);
 888     KMP_DEBUG_ASSERT(level < num_layers);
 889     KMP_DEBUG_ASSERT(index >= 0);
 890     KMP_DEBUG_ASSERT(index < get_length(level));
 891     return &(layers[level][index]);
 892   }
 893   // Returns the number of layers in the hierarchy
 894   int get_num_layers() const { return num_layers; }
 895   // Returns the number of threads in the top layer
 896   // This is necessary because we don't store a topology unit as
 897   // the very top level and the scheduling algorithms need this information
 898   int get_top_level_nproc() const { return top_level_nproc; }
 899   // Return whether this hierarchy is valid or not
 900   bool is_valid() const { return valid; }
 901 #ifdef KMP_DEBUG
 902   // Print the hierarchy
 903   void print() {
 904     KD_TRACE(10, ("kmp_hier_t:\n"));
 905     for (int i = num_layers - 1; i >= 0; --i) {
 906       KD_TRACE(10, ("Info[%d] = ", i));
 907       info[i].print();
 908     }
 909     for (int i = num_layers - 1; i >= 0; --i) {
 910       KD_TRACE(10, ("Layer[%d] =\n", i));
 911       for (int j = 0; j < info[i].length; ++j) {
 912         layers[i][j].print();
 913       }
 914     }
 915   }
 916 #endif
 917 };
 918
 919 template <typename T>
 920 void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
 921                                    kmp_hier_layer_e *new_layers,
 922                                    enum sched_type *new_scheds,
 923                                    typename traits_t<T>::signed_t *new_chunks,
 924                                    T lb, T ub,
 925                                    typename traits_t<T>::signed_t st) {
 926   int tid, gtid, num_hw_threads, num_threads_per_layer1, active;
 927   unsigned int my_buffer_index;
 928   kmp_info_t *th;
 929   kmp_team_t *team;
 930   dispatch_private_info_template<T> *pr;
 931   dispatch_shared_info_template<T> volatile *sh;
 932   gtid = __kmp_entry_gtid();
 933   tid = __kmp_tid_from_gtid(gtid);
 934 #ifdef KMP_DEBUG
 935   KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n",
 936                 gtid, n));
 937   for (int i = 0; i < n; ++i) {
 938     const char *layer = __kmp_get_hier_str(new_layers[i]);
 939     KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, "
 940                   "new_scheds[%d] = %d, new_chunks[%d] = %u\n",
 941                   gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i]));
 942   }
 943 #endif // KMP_DEBUG
 944   KMP_DEBUG_ASSERT(n > 0);
 945   KMP_DEBUG_ASSERT(new_layers);
 946   KMP_DEBUG_ASSERT(new_scheds);
 947   KMP_DEBUG_ASSERT(new_chunks);
 948   if (!TCR_4(__kmp_init_parallel))
 949     __kmp_parallel_initialize();
 950   __kmp_resume_if_soft_paused();
 951
 952   th = __kmp_threads[gtid];
 953   team = th->th.th_team;
 954   active = !team->t.t_serialized;
 955   th->th.th_ident = loc;
 956   num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
 957   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
 958                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
 959   my_buffer_index = th->th.th_dispatch->th_disp_index;
 960   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
 961       &th->th.th_dispatch
 962            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
 963   sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
 964       &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
 965   if (!active) {
 966     KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
 967                   "Using normal dispatch functions.\n",
 968                   gtid));
 969     KMP_DEBUG_ASSERT(pr);
 970     pr->flags.use_hier = FALSE;
 971     pr->flags.contains_last = FALSE;
 972     return;
 973   }
 974   KMP_DEBUG_ASSERT(pr);
 975   KMP_DEBUG_ASSERT(sh);
 976   pr->flags.use_hier = TRUE;
 977   pr->u.p.tc = 0;
 978   // Have primary thread allocate the hierarchy
 979   if (__kmp_tid_from_gtid(gtid) == 0) {
 980     KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
 981                   "hierarchy\n",
 982                   gtid, pr, sh));
 983     if (sh->hier == NULL) {
 984       sh->hier = (kmp_hier_t<T> *)__kmp_allocate(sizeof(kmp_hier_t<T>));
 985     }
 986     sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks);
 987     sh->u.s.iteration = 0;
 988   }
 989   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
 990   // Check to make sure the hierarchy is valid
 991   kmp_hier_t<T> *hier = sh->hier;
 992   if (!sh->hier->is_valid()) {
 993     pr->flags.use_hier = FALSE;
 994     return;
 995   }
 996   // Have threads allocate their thread-private barrier data if it hasn't
 997   // already been allocated
 998   if (th->th.th_hier_bar_data == NULL) {
 999     th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate(
1000         sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST);
1001   }
1002   // Have threads "register" themselves by modifying the active count for each
1003   // level they are involved in. The active count will act as nthreads for that
1004   // level regarding the scheduling algorithms
1005   for (int i = 0; i < n; ++i) {
1006     int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
1007     kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
1008     // Setup the thread's private dispatch buffer's hierarchy pointers
1009     if (i == 0)
1010       pr->hier_parent = my_unit;
1011     // If this unit is already active, then increment active count and wait
1012     if (my_unit->is_active()) {
1013       KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
1014                     "is already active (%d)\n",
1015                     gtid, my_unit, my_unit->active));
1016       KMP_TEST_THEN_INC32(&(my_unit->active));
1017       break;
1018     }
1019     // Flag that this unit is active
1020     if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) {
1021       // Do not setup parent pointer for top level unit since it has no parent
1022       if (i < n - 1) {
1023         // Setup middle layer pointers to parents
1024         my_unit->get_my_pr()->hier_id =
1025             index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
1026                                                  hier->get_type(i + 1));
1027         int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1));
1028         my_unit->hier_parent = hier->get_unit(i + 1, parent_index);
1029       } else {
1030         // Setup top layer information (no parent pointers are set)
1031         my_unit->get_my_pr()->hier_id =
1032             index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
1033                                                  kmp_hier_layer_e::LAYER_LOOP);
1034         KMP_TEST_THEN_INC32(&(hier->top_level_nproc));
1035         my_unit->hier_parent = nullptr;
1036       }
1037       // Set trip count to 0 so that next() operation will initially climb up
1038       // the hierarchy to get more iterations (early exit in next() for tc == 0)
1039       my_unit->get_my_pr()->u.p.tc = 0;
1040       // Increment this layer's number of active units
1041       KMP_TEST_THEN_INC32(&(hier->info[i].num_active));
1042       KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
1043                     "incrementing num_active\n",
1044                     gtid, my_unit));
1045     } else {
1046       KMP_TEST_THEN_INC32(&(my_unit->active));
1047       break;
1048     }
1049   }
1050   // Set this thread's id
1051   num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2(
1052       kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0));
1053   pr->hier_id = tid % num_threads_per_layer1;
1054   // For oversubscribed threads, increment their index within the lowest unit
1055   // This is done to prevent having two or more threads with id 0, id 1, etc.
1056   if (tid >= num_hw_threads)
1057     pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1);
1058   KD_TRACE(
1059       10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n",
1060            gtid, pr->hier_id));
1061
1062   pr->flags.contains_last = FALSE;
1063   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1064
1065   // Now that the number of active threads at each level is determined,
1066   // the barrier data for each unit can be initialized and the last layer's
1067   // loop information can be initialized.
1068   int prev_id = pr->get_hier_id();
1069   for (int i = 0; i < n; ++i) {
1070     if (prev_id != 0)
1071       break;
1072     int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
1073     kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
1074     // Only primary threads of this unit within the hierarchy do initialization
1075     KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
1076                   gtid, i));
1077     my_unit->reset_shared_barrier();
1078     my_unit->hier_pr.flags.contains_last = FALSE;
1079     // Last layer, initialize the private buffers with entire loop information
1080     // Now the next next_algorithm() call will get the first chunk of
1081     // iterations properly
1082     if (i == n - 1) {
1083       __kmp_dispatch_init_algorithm<T>(
1084           loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st,
1085 #if USE_ITT_BUILD
1086           NULL,
1087 #endif
1088           hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id());
1089     }
1090     prev_id = my_unit->get_hier_id();
1091   }
1092   // Initialize each layer of the thread's private barrier data
1093   kmp_hier_top_unit_t<T> *unit = pr->hier_parent;
1094   for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) {
1095     kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]);
1096     unit->reset_private_barrier(tdata);
1097   }
1098   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1099
1100 #ifdef KMP_DEBUG
1101   if (__kmp_tid_from_gtid(gtid) == 0) {
1102     for (int i = 0; i < n; ++i) {
1103       KD_TRACE(10,
1104                ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n",
1105                 gtid, i, hier->get_num_active(i)));
1106     }
1107     hier->print();
1108   }
1109   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1110 #endif // KMP_DEBUG
1111 }
1112 #endif