openmp/runtime/src/kmp_runtime.cpp

   1 /*
   2  * kmp_runtime.cpp -- KPTS runtime support library
   3  */
   4
   5 //===----------------------------------------------------------------------===//
   6 //
   7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   8 // See https://llvm.org/LICENSE.txt for license information.
   9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #include "kmp.h"
  14 #include "kmp_affinity.h"
  15 #include "kmp_atomic.h"
  16 #include "kmp_environment.h"
  17 #include "kmp_error.h"
  18 #include "kmp_i18n.h"
  19 #include "kmp_io.h"
  20 #include "kmp_itt.h"
  21 #include "kmp_settings.h"
  22 #include "kmp_stats.h"
  23 #include "kmp_str.h"
  24 #include "kmp_wait_release.h"
  25 #include "kmp_wrapper_getpid.h"
  26 #include "kmp_dispatch.h"
  27 #if KMP_USE_HIER_SCHED
  28 #include "kmp_dispatch_hier.h"
  29 #endif
  30
  31 #if OMPT_SUPPORT
  32 #include "ompt-specific.h"
  33 #endif
  34 #if OMPD_SUPPORT
  35 #include "ompd-specific.h"
  36 #endif
  37
  38 #if OMP_PROFILING_SUPPORT
  39 #include "llvm/Support/TimeProfiler.h"
  40 static char *ProfileTraceFile = nullptr;
  41 #endif
  42
  43 /* these are temporary issues to be dealt with */
  44 #define KMP_USE_PRCTL 0
  45
  46 #if KMP_OS_WINDOWS
  47 #include <process.h>
  48 #endif
  49
  50 #if KMP_OS_WINDOWS
  51 // windows does not need include files as it doesn't use shared memory
  52 #else
  53 #include <sys/mman.h>
  54 #include <sys/stat.h>
  55 #include <fcntl.h>
  56 #define SHM_SIZE 1024
  57 #endif
  58
  59 #if defined(KMP_GOMP_COMPAT)
  60 char const __kmp_version_alt_comp[] =
  61     KMP_VERSION_PREFIX "alternative compiler support: yes";
  62 #endif /* defined(KMP_GOMP_COMPAT) */
  63
  64 char const __kmp_version_omp_api[] =
  65     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
  66
  67 #ifdef KMP_DEBUG
  68 char const __kmp_version_lock[] =
  69     KMP_VERSION_PREFIX "lock type: run time selectable";
  70 #endif /* KMP_DEBUG */
  71
  72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
  73
  74 /* ------------------------------------------------------------------------ */
  75
  76 #if KMP_USE_MONITOR
  77 kmp_info_t __kmp_monitor;
  78 #endif
  79
  80 /* Forward declarations */
  81
  82 void __kmp_cleanup(void);
  83
  84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
  85                                   int gtid);
  86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
  87                                   kmp_internal_control_t *new_icvs,
  88                                   ident_t *loc);
  89 #if KMP_AFFINITY_SUPPORTED
  90 static void __kmp_partition_places(kmp_team_t *team,
  91                                    int update_master_only = 0);
  92 #endif
  93 static void __kmp_do_serial_initialize(void);
  94 void __kmp_fork_barrier(int gtid, int tid);
  95 void __kmp_join_barrier(int gtid);
  96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
  97                           kmp_internal_control_t *new_icvs, ident_t *loc);
  98
  99 #ifdef USE_LOAD_BALANCE
 100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
 101 #endif
 102
 103 static int __kmp_expand_threads(int nNeed);
 104 #if KMP_OS_WINDOWS
 105 static int __kmp_unregister_root_other_thread(int gtid);
 106 #endif
 107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
 108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
 109
 110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
 111                                int new_nthreads);
 112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
 113
 114 /* Calculate the identifier of the current thread */
 115 /* fast (and somewhat portable) way to get unique identifier of executing
 116    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
 117 int __kmp_get_global_thread_id() {
 118   int i;
 119   kmp_info_t **other_threads;
 120   size_t stack_data;
 121   char *stack_addr;
 122   size_t stack_size;
 123   char *stack_base;
 124
 125   KA_TRACE(
 126       1000,
 127       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
 128        __kmp_nth, __kmp_all_nth));
 129
 130   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
 131      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
 132      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
 133      __kmp_init_gtid for this to work. */
 134
 135   if (!TCR_4(__kmp_init_gtid))
 136     return KMP_GTID_DNE;
 137
 138 #ifdef KMP_TDATA_GTID
 139   if (TCR_4(__kmp_gtid_mode) >= 3) {
 140     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
 141     return __kmp_gtid;
 142   }
 143 #endif
 144   if (TCR_4(__kmp_gtid_mode) >= 2) {
 145     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
 146     return __kmp_gtid_get_specific();
 147   }
 148   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
 149
 150   stack_addr = (char *)&stack_data;
 151   other_threads = __kmp_threads;
 152
 153   /* ATT: The code below is a source of potential bugs due to unsynchronized
 154      access to __kmp_threads array. For example:
 155      1. Current thread loads other_threads[i] to thr and checks it, it is
 156         non-NULL.
 157      2. Current thread is suspended by OS.
 158      3. Another thread unregisters and finishes (debug versions of free()
 159         may fill memory with something like 0xEF).
 160      4. Current thread is resumed.
 161      5. Current thread reads junk from *thr.
 162      TODO: Fix it.  --ln  */
 163
 164   for (i = 0; i < __kmp_threads_capacity; i++) {
 165
 166     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
 167     if (!thr)
 168       continue;
 169
 170     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
 171     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
 172
 173     /* stack grows down -- search through all of the active threads */
 174
 175     if (stack_addr <= stack_base) {
 176       size_t stack_diff = stack_base - stack_addr;
 177
 178       if (stack_diff <= stack_size) {
 179         /* The only way we can be closer than the allocated */
 180         /* stack size is if we are running on this thread. */
 181         // __kmp_gtid_get_specific can return negative value because this
 182         // function can be called by thread destructor. However, before the
 183         // thread destructor is called, the value of the corresponding
 184         // thread-specific data will be reset to NULL.
 185         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
 186                          __kmp_gtid_get_specific() == i);
 187         return i;
 188       }
 189     }
 190   }
 191
 192   /* get specific to try and determine our gtid */
 193   KA_TRACE(1000,
 194            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
 195             "thread, using TLS\n"));
 196   i = __kmp_gtid_get_specific();
 197
 198   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
 199
 200   /* if we havn't been assigned a gtid, then return code */
 201   if (i < 0)
 202     return i;
 203
 204   // other_threads[i] can be nullptr at this point because the corresponding
 205   // thread could have already been destructed. It can happen when this function
 206   // is called in end library routine.
 207   if (!TCR_SYNC_PTR(other_threads[i]))
 208     return i;
 209
 210   /* dynamically updated stack window for uber threads to avoid get_specific
 211      call */
 212   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
 213     KMP_FATAL(StackOverflow, i);
 214   }
 215
 216   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
 217   if (stack_addr > stack_base) {
 218     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
 219     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
 220             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
 221                 stack_base);
 222   } else {
 223     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
 224             stack_base - stack_addr);
 225   }
 226
 227   /* Reprint stack bounds for ubermaster since they have been refined */
 228   if (__kmp_storage_map) {
 229     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
 230     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
 231     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
 232                                  other_threads[i]->th.th_info.ds.ds_stacksize,
 233                                  "th_%d stack (refinement)", i);
 234   }
 235   return i;
 236 }
 237
 238 int __kmp_get_global_thread_id_reg() {
 239   int gtid;
 240
 241   if (!__kmp_init_serial) {
 242     gtid = KMP_GTID_DNE;
 243   } else
 244 #ifdef KMP_TDATA_GTID
 245       if (TCR_4(__kmp_gtid_mode) >= 3) {
 246     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
 247     gtid = __kmp_gtid;
 248   } else
 249 #endif
 250       if (TCR_4(__kmp_gtid_mode) >= 2) {
 251     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
 252     gtid = __kmp_gtid_get_specific();
 253   } else {
 254     KA_TRACE(1000,
 255              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
 256     gtid = __kmp_get_global_thread_id();
 257   }
 258
 259   /* we must be a new uber master sibling thread */
 260   if (gtid == KMP_GTID_DNE) {
 261     KA_TRACE(10,
 262              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
 263               "Registering a new gtid.\n"));
 264     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
 265     if (!__kmp_init_serial) {
 266       __kmp_do_serial_initialize();
 267       gtid = __kmp_gtid_get_specific();
 268     } else {
 269       gtid = __kmp_register_root(FALSE);
 270     }
 271     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
 272     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
 273   }
 274
 275   KMP_DEBUG_ASSERT(gtid >= 0);
 276
 277   return gtid;
 278 }
 279
 280 /* caller must hold forkjoin_lock */
 281 void __kmp_check_stack_overlap(kmp_info_t *th) {
 282   int f;
 283   char *stack_beg = NULL;
 284   char *stack_end = NULL;
 285   int gtid;
 286
 287   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
 288   if (__kmp_storage_map) {
 289     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
 290     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
 291
 292     gtid = __kmp_gtid_from_thread(th);
 293
 294     if (gtid == KMP_GTID_MONITOR) {
 295       __kmp_print_storage_map_gtid(
 296           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
 297           "th_%s stack (%s)", "mon",
 298           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
 299     } else {
 300       __kmp_print_storage_map_gtid(
 301           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
 302           "th_%d stack (%s)", gtid,
 303           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
 304     }
 305   }
 306
 307   /* No point in checking ubermaster threads since they use refinement and
 308    * cannot overlap */
 309   gtid = __kmp_gtid_from_thread(th);
 310   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
 311     KA_TRACE(10,
 312              ("__kmp_check_stack_overlap: performing extensive checking\n"));
 313     if (stack_beg == NULL) {
 314       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
 315       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
 316     }
 317
 318     for (f = 0; f < __kmp_threads_capacity; f++) {
 319       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
 320
 321       if (f_th && f_th != th) {
 322         char *other_stack_end =
 323             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
 324         char *other_stack_beg =
 325             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
 326         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
 327             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
 328
 329           /* Print the other stack values before the abort */
 330           if (__kmp_storage_map)
 331             __kmp_print_storage_map_gtid(
 332                 -1, other_stack_beg, other_stack_end,
 333                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
 334                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
 335
 336           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
 337                       __kmp_msg_null);
 338         }
 339       }
 340     }
 341   }
 342   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
 343 }
 344
 345 /* ------------------------------------------------------------------------ */
 346
 347 void __kmp_infinite_loop(void) {
 348   static int done = FALSE;
 349
 350   while (!done) {
 351     KMP_YIELD(TRUE);
 352   }
 353 }
 354
 355 #define MAX_MESSAGE 512
 356
 357 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
 358                                   char const *format, ...) {
 359   char buffer[MAX_MESSAGE];
 360   va_list ap;
 361
 362   va_start(ap, format);
 363   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
 364                p2, (unsigned long)size, format);
 365   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
 366   __kmp_vprintf(kmp_err, buffer, ap);
 367 #if KMP_PRINT_DATA_PLACEMENT
 368   int node;
 369   if (gtid >= 0) {
 370     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
 371       if (__kmp_storage_map_verbose) {
 372         node = __kmp_get_host_node(p1);
 373         if (node < 0) /* doesn't work, so don't try this next time */
 374           __kmp_storage_map_verbose = FALSE;
 375         else {
 376           char *last;
 377           int lastNode;
 378           int localProc = __kmp_get_cpu_from_gtid(gtid);
 379
 380           const int page_size = KMP_GET_PAGE_SIZE();
 381
 382           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
 383           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
 384           if (localProc >= 0)
 385             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
 386                                  localProc >> 1);
 387           else
 388             __kmp_printf_no_lock("  GTID %d\n", gtid);
 389 #if KMP_USE_PRCTL
 390           /* The more elaborate format is disabled for now because of the prctl
 391            * hanging bug. */
 392           do {
 393             last = p1;
 394             lastNode = node;
 395             /* This loop collates adjacent pages with the same host node. */
 396             do {
 397               (char *)p1 += page_size;
 398             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
 399             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
 400                                  lastNode);
 401           } while (p1 <= p2);
 402 #else
 403           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
 404                                (char *)p1 + (page_size - 1),
 405                                __kmp_get_host_node(p1));
 406           if (p1 < p2) {
 407             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
 408                                  (char *)p2 + (page_size - 1),
 409                                  __kmp_get_host_node(p2));
 410           }
 411 #endif
 412         }
 413       }
 414     } else
 415       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
 416   }
 417 #endif /* KMP_PRINT_DATA_PLACEMENT */
 418   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
 419
 420   va_end(ap);
 421 }
 422
 423 void __kmp_warn(char const *format, ...) {
 424   char buffer[MAX_MESSAGE];
 425   va_list ap;
 426
 427   if (__kmp_generate_warnings == kmp_warnings_off) {
 428     return;
 429   }
 430
 431   va_start(ap, format);
 432
 433   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
 434   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
 435   __kmp_vprintf(kmp_err, buffer, ap);
 436   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
 437
 438   va_end(ap);
 439 }
 440
 441 void __kmp_abort_process() {
 442   // Later threads may stall here, but that's ok because abort() will kill them.
 443   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
 444
 445   if (__kmp_debug_buf) {
 446     __kmp_dump_debug_buffer();
 447   }
 448
 449   if (KMP_OS_WINDOWS) {
 450     // Let other threads know of abnormal termination and prevent deadlock
 451     // if abort happened during library initialization or shutdown
 452     __kmp_global.g.g_abort = SIGABRT;
 453
 454     /* On Windows* OS by default abort() causes pop-up error box, which stalls
 455        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
 456        boxes. _set_abort_behavior() works well, but this function is not
 457        available in VS7 (this is not problem for DLL, but it is a problem for
 458        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
 459        help, at least in some versions of MS C RTL.
 460
 461        It seems following sequence is the only way to simulate abort() and
 462        avoid pop-up error box. */
 463     raise(SIGABRT);
 464     _exit(3); // Just in case, if signal ignored, exit anyway.
 465   } else {
 466     __kmp_unregister_library();
 467     abort();
 468   }
 469
 470   __kmp_infinite_loop();
 471   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
 472
 473 } // __kmp_abort_process
 474
 475 void __kmp_abort_thread(void) {
 476   // TODO: Eliminate g_abort global variable and this function.
 477   // In case of abort just call abort(), it will kill all the threads.
 478   __kmp_infinite_loop();
 479 } // __kmp_abort_thread
 480
 481 /* Print out the storage map for the major kmp_info_t thread data structures
 482    that are allocated together. */
 483
 484 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
 485   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
 486                                gtid);
 487
 488   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
 489                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
 490
 491   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
 492                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
 493
 494   __kmp_print_storage_map_gtid(
 495       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
 496       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
 497
 498   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
 499                                &thr->th.th_bar[bs_plain_barrier + 1],
 500                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
 501                                gtid);
 502
 503   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
 504                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
 505                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
 506                                gtid);
 507
 508 #if KMP_FAST_REDUCTION_BARRIER
 509   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
 510                                &thr->th.th_bar[bs_reduction_barrier + 1],
 511                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
 512                                gtid);
 513 #endif // KMP_FAST_REDUCTION_BARRIER
 514 }
 515
 516 /* Print out the storage map for the major kmp_team_t team data structures
 517    that are allocated together. */
 518
 519 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
 520                                          int team_id, int num_thr) {
 521   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
 522   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
 523                                header, team_id);
 524
 525   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
 526                                &team->t.t_bar[bs_last_barrier],
 527                                sizeof(kmp_balign_team_t) * bs_last_barrier,
 528                                "%s_%d.t_bar", header, team_id);
 529
 530   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
 531                                &team->t.t_bar[bs_plain_barrier + 1],
 532                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
 533                                header, team_id);
 534
 535   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
 536                                &team->t.t_bar[bs_forkjoin_barrier + 1],
 537                                sizeof(kmp_balign_team_t),
 538                                "%s_%d.t_bar[forkjoin]", header, team_id);
 539
 540 #if KMP_FAST_REDUCTION_BARRIER
 541   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
 542                                &team->t.t_bar[bs_reduction_barrier + 1],
 543                                sizeof(kmp_balign_team_t),
 544                                "%s_%d.t_bar[reduction]", header, team_id);
 545 #endif // KMP_FAST_REDUCTION_BARRIER
 546
 547   __kmp_print_storage_map_gtid(
 548       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
 549       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
 550
 551   __kmp_print_storage_map_gtid(
 552       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
 553       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
 554
 555   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
 556                                &team->t.t_disp_buffer[num_disp_buff],
 557                                sizeof(dispatch_shared_info_t) * num_disp_buff,
 558                                "%s_%d.t_disp_buffer", header, team_id);
 559 }
 560
 561 static void __kmp_init_allocator() {
 562   __kmp_init_memkind();
 563   __kmp_init_target_mem();
 564 }
 565 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
 566
 567 /* ------------------------------------------------------------------------ */
 568
 569 #if ENABLE_LIBOMPTARGET
 570 static void __kmp_init_omptarget() {
 571   __kmp_init_target_task();
 572 }
 573 #endif
 574
 575 /* ------------------------------------------------------------------------ */
 576
 577 #if KMP_DYNAMIC_LIB
 578 #if KMP_OS_WINDOWS
 579
 580 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
 581   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
 582
 583   switch (fdwReason) {
 584
 585   case DLL_PROCESS_ATTACH:
 586     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
 587
 588     return TRUE;
 589
 590   case DLL_PROCESS_DETACH:
 591     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
 592
 593     // According to Windows* documentation for DllMain entry point:
 594     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
 595     //   lpReserved == NULL when FreeLibrary() is called,
 596     //   lpReserved != NULL when the process is terminated.
 597     // When FreeLibrary() is called, worker threads remain alive. So the
 598     // runtime's state is consistent and executing proper shutdown is OK.
 599     // When the process is terminated, worker threads have exited or been
 600     // forcefully terminated by the OS and only the shutdown thread remains.
 601     // This can leave the runtime in an inconsistent state.
 602     // Hence, only attempt proper cleanup when FreeLibrary() is called.
 603     // Otherwise, rely on OS to reclaim resources.
 604     if (lpReserved == NULL)
 605       __kmp_internal_end_library(__kmp_gtid_get_specific());
 606
 607     return TRUE;
 608
 609   case DLL_THREAD_ATTACH:
 610     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
 611
 612     /* if we want to register new siblings all the time here call
 613      * __kmp_get_gtid(); */
 614     return TRUE;
 615
 616   case DLL_THREAD_DETACH:
 617     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
 618
 619     __kmp_internal_end_thread(__kmp_gtid_get_specific());
 620     return TRUE;
 621   }
 622
 623   return TRUE;
 624 }
 625
 626 #endif /* KMP_OS_WINDOWS */
 627 #endif /* KMP_DYNAMIC_LIB */
 628
 629 /* __kmp_parallel_deo -- Wait until it's our turn. */
 630 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
 631   int gtid = *gtid_ref;
 632 #ifdef BUILD_PARALLEL_ORDERED
 633   kmp_team_t *team = __kmp_team_from_gtid(gtid);
 634 #endif /* BUILD_PARALLEL_ORDERED */
 635
 636   if (__kmp_env_consistency_check) {
 637     if (__kmp_threads[gtid]->th.th_root->r.r_active)
 638 #if KMP_USE_DYNAMIC_LOCK
 639       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
 640 #else
 641       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
 642 #endif
 643   }
 644 #ifdef BUILD_PARALLEL_ORDERED
 645   if (!team->t.t_serialized) {
 646     KMP_MB();
 647     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
 648              NULL);
 649     KMP_MB();
 650   }
 651 #endif /* BUILD_PARALLEL_ORDERED */
 652 }
 653
 654 /* __kmp_parallel_dxo -- Signal the next task. */
 655 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
 656   int gtid = *gtid_ref;
 657 #ifdef BUILD_PARALLEL_ORDERED
 658   int tid = __kmp_tid_from_gtid(gtid);
 659   kmp_team_t *team = __kmp_team_from_gtid(gtid);
 660 #endif /* BUILD_PARALLEL_ORDERED */
 661
 662   if (__kmp_env_consistency_check) {
 663     if (__kmp_threads[gtid]->th.th_root->r.r_active)
 664       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
 665   }
 666 #ifdef BUILD_PARALLEL_ORDERED
 667   if (!team->t.t_serialized) {
 668     KMP_MB(); /* Flush all pending memory write invalidates.  */
 669
 670     /* use the tid of the next thread in this team */
 671     /* TODO replace with general release procedure */
 672     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
 673
 674     KMP_MB(); /* Flush all pending memory write invalidates.  */
 675   }
 676 #endif /* BUILD_PARALLEL_ORDERED */
 677 }
 678
 679 /* ------------------------------------------------------------------------ */
 680 /* The BARRIER for a SINGLE process section is always explicit   */
 681
 682 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
 683   int status;
 684   kmp_info_t *th;
 685   kmp_team_t *team;
 686
 687   if (!TCR_4(__kmp_init_parallel))
 688     __kmp_parallel_initialize();
 689   __kmp_resume_if_soft_paused();
 690
 691   th = __kmp_threads[gtid];
 692   team = th->th.th_team;
 693   status = 0;
 694
 695   th->th.th_ident = id_ref;
 696
 697   if (team->t.t_serialized) {
 698     status = 1;
 699   } else {
 700     kmp_int32 old_this = th->th.th_local.this_construct;
 701
 702     ++th->th.th_local.this_construct;
 703     /* try to set team count to thread count--success means thread got the
 704        single block */
 705     /* TODO: Should this be acquire or release? */
 706     if (team->t.t_construct == old_this) {
 707       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
 708                                               th->th.th_local.this_construct);
 709     }
 710 #if USE_ITT_BUILD
 711     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
 712         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
 713         team->t.t_active_level == 1) {
 714       // Only report metadata by primary thread of active team at level 1
 715       __kmp_itt_metadata_single(id_ref);
 716     }
 717 #endif /* USE_ITT_BUILD */
 718   }
 719
 720   if (__kmp_env_consistency_check) {
 721     if (status && push_ws) {
 722       __kmp_push_workshare(gtid, ct_psingle, id_ref);
 723     } else {
 724       __kmp_check_workshare(gtid, ct_psingle, id_ref);
 725     }
 726   }
 727 #if USE_ITT_BUILD
 728   if (status) {
 729     __kmp_itt_single_start(gtid);
 730   }
 731 #endif /* USE_ITT_BUILD */
 732   return status;
 733 }
 734
 735 void __kmp_exit_single(int gtid) {
 736 #if USE_ITT_BUILD
 737   __kmp_itt_single_end(gtid);
 738 #endif /* USE_ITT_BUILD */
 739   if (__kmp_env_consistency_check)
 740     __kmp_pop_workshare(gtid, ct_psingle, NULL);
 741 }
 742
 743 /* determine if we can go parallel or must use a serialized parallel region and
 744  * how many threads we can use
 745  * set_nproc is the number of threads requested for the team
 746  * returns 0 if we should serialize or only use one thread,
 747  * otherwise the number of threads to use
 748  * The forkjoin lock is held by the caller. */
 749 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
 750                                  int master_tid, int set_nthreads,
 751                                  int enter_teams) {
 752   int capacity;
 753   int new_nthreads;
 754   KMP_DEBUG_ASSERT(__kmp_init_serial);
 755   KMP_DEBUG_ASSERT(root && parent_team);
 756   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
 757
 758   // If dyn-var is set, dynamically adjust the number of desired threads,
 759   // according to the method specified by dynamic_mode.
 760   new_nthreads = set_nthreads;
 761   if (!get__dynamic_2(parent_team, master_tid)) {
 762     ;
 763   }
 764 #ifdef USE_LOAD_BALANCE
 765   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
 766     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
 767     if (new_nthreads == 1) {
 768       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
 769                     "reservation to 1 thread\n",
 770                     master_tid));
 771       return 1;
 772     }
 773     if (new_nthreads < set_nthreads) {
 774       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
 775                     "reservation to %d threads\n",
 776                     master_tid, new_nthreads));
 777     }
 778   }
 779 #endif /* USE_LOAD_BALANCE */
 780   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
 781     new_nthreads = __kmp_avail_proc - __kmp_nth +
 782                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
 783     if (new_nthreads <= 1) {
 784       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
 785                     "reservation to 1 thread\n",
 786                     master_tid));
 787       return 1;
 788     }
 789     if (new_nthreads < set_nthreads) {
 790       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
 791                     "reservation to %d threads\n",
 792                     master_tid, new_nthreads));
 793     } else {
 794       new_nthreads = set_nthreads;
 795     }
 796   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
 797     if (set_nthreads > 2) {
 798       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
 799       new_nthreads = (new_nthreads % set_nthreads) + 1;
 800       if (new_nthreads == 1) {
 801         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
 802                       "reservation to 1 thread\n",
 803                       master_tid));
 804         return 1;
 805       }
 806       if (new_nthreads < set_nthreads) {
 807         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
 808                       "reservation to %d threads\n",
 809                       master_tid, new_nthreads));
 810       }
 811     }
 812   } else {
 813     KMP_ASSERT(0);
 814   }
 815
 816   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
 817   if (__kmp_nth + new_nthreads -
 818           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
 819       __kmp_max_nth) {
 820     int tl_nthreads = __kmp_max_nth - __kmp_nth +
 821                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
 822     if (tl_nthreads <= 0) {
 823       tl_nthreads = 1;
 824     }
 825
 826     // If dyn-var is false, emit a 1-time warning.
 827     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
 828       __kmp_reserve_warn = 1;
 829       __kmp_msg(kmp_ms_warning,
 830                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
 831                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
 832     }
 833     if (tl_nthreads == 1) {
 834       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
 835                     "reduced reservation to 1 thread\n",
 836                     master_tid));
 837       return 1;
 838     }
 839     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
 840                   "reservation to %d threads\n",
 841                   master_tid, tl_nthreads));
 842     new_nthreads = tl_nthreads;
 843   }
 844
 845   // Respect OMP_THREAD_LIMIT
 846   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
 847   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
 848   if (cg_nthreads + new_nthreads -
 849           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
 850       max_cg_threads) {
 851     int tl_nthreads = max_cg_threads - cg_nthreads +
 852                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
 853     if (tl_nthreads <= 0) {
 854       tl_nthreads = 1;
 855     }
 856
 857     // If dyn-var is false, emit a 1-time warning.
 858     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
 859       __kmp_reserve_warn = 1;
 860       __kmp_msg(kmp_ms_warning,
 861                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
 862                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
 863     }
 864     if (tl_nthreads == 1) {
 865       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
 866                     "reduced reservation to 1 thread\n",
 867                     master_tid));
 868       return 1;
 869     }
 870     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
 871                   "reservation to %d threads\n",
 872                   master_tid, tl_nthreads));
 873     new_nthreads = tl_nthreads;
 874   }
 875
 876   // Check if the threads array is large enough, or needs expanding.
 877   // See comment in __kmp_register_root() about the adjustment if
 878   // __kmp_threads[0] == NULL.
 879   capacity = __kmp_threads_capacity;
 880   if (TCR_PTR(__kmp_threads[0]) == NULL) {
 881     --capacity;
 882   }
 883   // If it is not for initializing the hidden helper team, we need to take
 884   // __kmp_hidden_helper_threads_num out of the capacity because it is included
 885   // in __kmp_threads_capacity.
 886   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
 887     capacity -= __kmp_hidden_helper_threads_num;
 888   }
 889   if (__kmp_nth + new_nthreads -
 890           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
 891       capacity) {
 892     // Expand the threads array.
 893     int slotsRequired = __kmp_nth + new_nthreads -
 894                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
 895                         capacity;
 896     int slotsAdded = __kmp_expand_threads(slotsRequired);
 897     if (slotsAdded < slotsRequired) {
 898       // The threads array was not expanded enough.
 899       new_nthreads -= (slotsRequired - slotsAdded);
 900       KMP_ASSERT(new_nthreads >= 1);
 901
 902       // If dyn-var is false, emit a 1-time warning.
 903       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
 904         __kmp_reserve_warn = 1;
 905         if (__kmp_tp_cached) {
 906           __kmp_msg(kmp_ms_warning,
 907                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
 908                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
 909                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
 910         } else {
 911           __kmp_msg(kmp_ms_warning,
 912                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
 913                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
 914         }
 915       }
 916     }
 917   }
 918
 919 #ifdef KMP_DEBUG
 920   if (new_nthreads == 1) {
 921     KC_TRACE(10,
 922              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
 923               "dead roots and rechecking; requested %d threads\n",
 924               __kmp_get_gtid(), set_nthreads));
 925   } else {
 926     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
 927                   " %d threads\n",
 928                   __kmp_get_gtid(), new_nthreads, set_nthreads));
 929   }
 930 #endif // KMP_DEBUG
 931   return new_nthreads;
 932 }
 933
 934 /* Allocate threads from the thread pool and assign them to the new team. We are
 935    assured that there are enough threads available, because we checked on that
 936    earlier within critical section forkjoin */
 937 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
 938                                     kmp_info_t *master_th, int master_gtid,
 939                                     int fork_teams_workers) {
 940   int i;
 941   int use_hot_team;
 942
 943   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
 944   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
 945   KMP_MB();
 946
 947   /* first, let's setup the primary thread */
 948   master_th->th.th_info.ds.ds_tid = 0;
 949   master_th->th.th_team = team;
 950   master_th->th.th_team_nproc = team->t.t_nproc;
 951   master_th->th.th_team_master = master_th;
 952   master_th->th.th_team_serialized = FALSE;
 953   master_th->th.th_dispatch = &team->t.t_dispatch[0];
 954
 955 /* make sure we are not the optimized hot team */
 956 #if KMP_NESTED_HOT_TEAMS
 957   use_hot_team = 0;
 958   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
 959   if (hot_teams) { // hot teams array is not allocated if
 960     // KMP_HOT_TEAMS_MAX_LEVEL=0
 961     int level = team->t.t_active_level - 1; // index in array of hot teams
 962     if (master_th->th.th_teams_microtask) { // are we inside the teams?
 963       if (master_th->th.th_teams_size.nteams > 1) {
 964         ++level; // level was not increased in teams construct for
 965         // team_of_masters
 966       }
 967       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
 968           master_th->th.th_teams_level == team->t.t_level) {
 969         ++level; // level was not increased in teams construct for
 970         // team_of_workers before the parallel
 971       } // team->t.t_level will be increased inside parallel
 972     }
 973     if (level < __kmp_hot_teams_max_level) {
 974       if (hot_teams[level].hot_team) {
 975         // hot team has already been allocated for given level
 976         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
 977         use_hot_team = 1; // the team is ready to use
 978       } else {
 979         use_hot_team = 0; // AC: threads are not allocated yet
 980         hot_teams[level].hot_team = team; // remember new hot team
 981         hot_teams[level].hot_team_nth = team->t.t_nproc;
 982       }
 983     } else {
 984       use_hot_team = 0;
 985     }
 986   }
 987 #else
 988   use_hot_team = team == root->r.r_hot_team;
 989 #endif
 990   if (!use_hot_team) {
 991
 992     /* install the primary thread */
 993     team->t.t_threads[0] = master_th;
 994     __kmp_initialize_info(master_th, team, 0, master_gtid);
 995
 996     /* now, install the worker threads */
 997     for (i = 1; i < team->t.t_nproc; i++) {
 998
 999       /* fork or reallocate a new thread and install it in team */
1000       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1001       team->t.t_threads[i] = thr;
1002       KMP_DEBUG_ASSERT(thr);
1003       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1004       /* align team and thread arrived states */
1005       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1006                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1007                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1008                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1009                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1010                     team->t.t_bar[bs_plain_barrier].b_arrived));
1011       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1012       thr->th.th_teams_level = master_th->th.th_teams_level;
1013       thr->th.th_teams_size = master_th->th.th_teams_size;
1014       { // Initialize threads' barrier data.
1015         int b;
1016         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1017         for (b = 0; b < bs_last_barrier; ++b) {
1018           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1019           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1020 #if USE_DEBUGGER
1021           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1022 #endif
1023         }
1024       }
1025     }
1026
1027 #if KMP_AFFINITY_SUPPORTED
1028     // Do not partition the places list for teams construct workers who
1029     // haven't actually been forked to do real work yet. This partitioning
1030     // will take place in the parallel region nested within the teams construct.
1031     if (!fork_teams_workers) {
1032       __kmp_partition_places(team);
1033     }
1034 #endif
1035
1036     if (team->t.t_nproc > 1 &&
1037         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1038       team->t.b->update_num_threads(team->t.t_nproc);
1039       __kmp_add_threads_to_team(team, team->t.t_nproc);
1040     }
1041   }
1042
1043   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1044     for (i = 0; i < team->t.t_nproc; i++) {
1045       kmp_info_t *thr = team->t.t_threads[i];
1046       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1047           thr->th.th_prev_level != team->t.t_level) {
1048         team->t.t_display_affinity = 1;
1049         break;
1050       }
1051     }
1052   }
1053
1054   KMP_MB();
1055 }
1056
1057 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1058 // Propagate any changes to the floating point control registers out to the team
1059 // We try to avoid unnecessary writes to the relevant cache line in the team
1060 // structure, so we don't make changes unless they are needed.
1061 inline static void propagateFPControl(kmp_team_t *team) {
1062   if (__kmp_inherit_fp_control) {
1063     kmp_int16 x87_fpu_control_word;
1064     kmp_uint32 mxcsr;
1065
1066     // Get primary thread's values of FPU control flags (both X87 and vector)
1067     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1068     __kmp_store_mxcsr(&mxcsr);
1069     mxcsr &= KMP_X86_MXCSR_MASK;
1070
1071     // There is no point looking at t_fp_control_saved here.
1072     // If it is TRUE, we still have to update the values if they are different
1073     // from those we now have. If it is FALSE we didn't save anything yet, but
1074     // our objective is the same. We have to ensure that the values in the team
1075     // are the same as those we have.
1076     // So, this code achieves what we need whether or not t_fp_control_saved is
1077     // true. By checking whether the value needs updating we avoid unnecessary
1078     // writes that would put the cache-line into a written state, causing all
1079     // threads in the team to have to read it again.
1080     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1081     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1082     // Although we don't use this value, other code in the runtime wants to know
1083     // whether it should restore them. So we must ensure it is correct.
1084     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1085   } else {
1086     // Similarly here. Don't write to this cache-line in the team structure
1087     // unless we have to.
1088     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1089   }
1090 }
1091
1092 // Do the opposite, setting the hardware registers to the updated values from
1093 // the team.
1094 inline static void updateHWFPControl(kmp_team_t *team) {
1095   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1096     // Only reset the fp control regs if they have been changed in the team.
1097     // the parallel region that we are exiting.
1098     kmp_int16 x87_fpu_control_word;
1099     kmp_uint32 mxcsr;
1100     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1101     __kmp_store_mxcsr(&mxcsr);
1102     mxcsr &= KMP_X86_MXCSR_MASK;
1103
1104     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1105       __kmp_clear_x87_fpu_status_word();
1106       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1107     }
1108
1109     if (team->t.t_mxcsr != mxcsr) {
1110       __kmp_load_mxcsr(&team->t.t_mxcsr);
1111     }
1112   }
1113 }
1114 #else
1115 #define propagateFPControl(x) ((void)0)
1116 #define updateHWFPControl(x) ((void)0)
1117 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1118
1119 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1120                                      int realloc); // forward declaration
1121
1122 /* Run a parallel region that has been serialized, so runs only in a team of the
1123    single primary thread. */
1124 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1125   kmp_info_t *this_thr;
1126   kmp_team_t *serial_team;
1127
1128   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1129
1130   /* Skip all this code for autopar serialized loops since it results in
1131      unacceptable overhead */
1132   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1133     return;
1134
1135   if (!TCR_4(__kmp_init_parallel))
1136     __kmp_parallel_initialize();
1137   __kmp_resume_if_soft_paused();
1138
1139   this_thr = __kmp_threads[global_tid];
1140   serial_team = this_thr->th.th_serial_team;
1141
1142   /* utilize the serialized team held by this thread */
1143   KMP_DEBUG_ASSERT(serial_team);
1144   KMP_MB();
1145
1146   if (__kmp_tasking_mode != tskm_immediate_exec) {
1147     KMP_DEBUG_ASSERT(
1148         this_thr->th.th_task_team ==
1149         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1150     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1151                      NULL);
1152     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1153                   "team %p, new task_team = NULL\n",
1154                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1155     this_thr->th.th_task_team = NULL;
1156   }
1157
1158   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1159   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1160     proc_bind = proc_bind_false;
1161   } else if (proc_bind == proc_bind_default) {
1162     // No proc_bind clause was specified, so use the current value
1163     // of proc-bind-var for this parallel region.
1164     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1165   }
1166   // Reset for next parallel region
1167   this_thr->th.th_set_proc_bind = proc_bind_default;
1168
1169   // Reset num_threads for next parallel region
1170   this_thr->th.th_set_nproc = 0;
1171
1172 #if OMPT_SUPPORT
1173   ompt_data_t ompt_parallel_data = ompt_data_none;
1174   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1175   if (ompt_enabled.enabled &&
1176       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1177
1178     ompt_task_info_t *parent_task_info;
1179     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1180
1181     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1182     if (ompt_enabled.ompt_callback_parallel_begin) {
1183       int team_size = 1;
1184
1185       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1186           &(parent_task_info->task_data), &(parent_task_info->frame),
1187           &ompt_parallel_data, team_size,
1188           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1189     }
1190   }
1191 #endif // OMPT_SUPPORT
1192
1193   if (this_thr->th.th_team != serial_team) {
1194     // Nested level will be an index in the nested nthreads array
1195     int level = this_thr->th.th_team->t.t_level;
1196
1197     if (serial_team->t.t_serialized) {
1198       /* this serial team was already used
1199          TODO increase performance by making this locks more specific */
1200       kmp_team_t *new_team;
1201
1202       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1203
1204       new_team =
1205           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1206 #if OMPT_SUPPORT
1207                               ompt_parallel_data,
1208 #endif
1209                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1210                               0 USE_NESTED_HOT_ARG(NULL));
1211       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1212       KMP_ASSERT(new_team);
1213
1214       /* setup new serialized team and install it */
1215       new_team->t.t_threads[0] = this_thr;
1216       new_team->t.t_parent = this_thr->th.th_team;
1217       serial_team = new_team;
1218       this_thr->th.th_serial_team = serial_team;
1219
1220       KF_TRACE(
1221           10,
1222           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1223            global_tid, serial_team));
1224
1225       /* TODO the above breaks the requirement that if we run out of resources,
1226          then we can still guarantee that serialized teams are ok, since we may
1227          need to allocate a new one */
1228     } else {
1229       KF_TRACE(
1230           10,
1231           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1232            global_tid, serial_team));
1233     }
1234
1235     /* we have to initialize this serial team */
1236     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1237     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1238     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1239     serial_team->t.t_ident = loc;
1240     serial_team->t.t_serialized = 1;
1241     serial_team->t.t_nproc = 1;
1242     serial_team->t.t_parent = this_thr->th.th_team;
1243     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1244     this_thr->th.th_team = serial_team;
1245     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1246
1247     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1248                   this_thr->th.th_current_task));
1249     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1250     this_thr->th.th_current_task->td_flags.executing = 0;
1251
1252     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1253
1254     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1255        implicit task for each serialized task represented by
1256        team->t.t_serialized? */
1257     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1258               &this_thr->th.th_current_task->td_parent->td_icvs);
1259
1260     // Thread value exists in the nested nthreads array for the next nested
1261     // level
1262     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1263       this_thr->th.th_current_task->td_icvs.nproc =
1264           __kmp_nested_nth.nth[level + 1];
1265     }
1266
1267     if (__kmp_nested_proc_bind.used &&
1268         (level + 1 < __kmp_nested_proc_bind.used)) {
1269       this_thr->th.th_current_task->td_icvs.proc_bind =
1270           __kmp_nested_proc_bind.bind_types[level + 1];
1271     }
1272
1273 #if USE_DEBUGGER
1274     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1275 #endif
1276     this_thr->th.th_info.ds.ds_tid = 0;
1277
1278     /* set thread cache values */
1279     this_thr->th.th_team_nproc = 1;
1280     this_thr->th.th_team_master = this_thr;
1281     this_thr->th.th_team_serialized = 1;
1282
1283     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1284     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1285     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1286
1287     propagateFPControl(serial_team);
1288
1289     /* check if we need to allocate dispatch buffers stack */
1290     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1291     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1292       serial_team->t.t_dispatch->th_disp_buffer =
1293           (dispatch_private_info_t *)__kmp_allocate(
1294               sizeof(dispatch_private_info_t));
1295     }
1296     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1297
1298     KMP_MB();
1299
1300   } else {
1301     /* this serialized team is already being used,
1302      * that's fine, just add another nested level */
1303     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1304     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1305     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1306     ++serial_team->t.t_serialized;
1307     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1308
1309     // Nested level will be an index in the nested nthreads array
1310     int level = this_thr->th.th_team->t.t_level;
1311     // Thread value exists in the nested nthreads array for the next nested
1312     // level
1313     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1314       this_thr->th.th_current_task->td_icvs.nproc =
1315           __kmp_nested_nth.nth[level + 1];
1316     }
1317     serial_team->t.t_level++;
1318     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1319                   "of serial team %p to %d\n",
1320                   global_tid, serial_team, serial_team->t.t_level));
1321
1322     /* allocate/push dispatch buffers stack */
1323     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1324     {
1325       dispatch_private_info_t *disp_buffer =
1326           (dispatch_private_info_t *)__kmp_allocate(
1327               sizeof(dispatch_private_info_t));
1328       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1329       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1330     }
1331     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1332
1333     KMP_MB();
1334   }
1335   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1336
1337   // Perform the display affinity functionality for
1338   // serialized parallel regions
1339   if (__kmp_display_affinity) {
1340     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1341         this_thr->th.th_prev_num_threads != 1) {
1342       // NULL means use the affinity-format-var ICV
1343       __kmp_aux_display_affinity(global_tid, NULL);
1344       this_thr->th.th_prev_level = serial_team->t.t_level;
1345       this_thr->th.th_prev_num_threads = 1;
1346     }
1347   }
1348
1349   if (__kmp_env_consistency_check)
1350     __kmp_push_parallel(global_tid, NULL);
1351 #if OMPT_SUPPORT
1352   serial_team->t.ompt_team_info.master_return_address = codeptr;
1353   if (ompt_enabled.enabled &&
1354       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1355     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1356         OMPT_GET_FRAME_ADDRESS(0);
1357
1358     ompt_lw_taskteam_t lw_taskteam;
1359     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1360                             &ompt_parallel_data, codeptr);
1361
1362     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1363     // don't use lw_taskteam after linking. content was swaped
1364
1365     /* OMPT implicit task begin */
1366     if (ompt_enabled.ompt_callback_implicit_task) {
1367       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1368           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1369           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1370           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1371       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1372           __kmp_tid_from_gtid(global_tid);
1373     }
1374
1375     /* OMPT state */
1376     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1377     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1378         OMPT_GET_FRAME_ADDRESS(0);
1379   }
1380 #endif
1381 }
1382
1383 // Test if this fork is for a team closely nested in a teams construct
1384 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1385                                           microtask_t microtask, int level,
1386                                           int teams_level, kmp_va_list ap) {
1387   return (master_th->th.th_teams_microtask && ap &&
1388           microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1389 }
1390
1391 // Test if this fork is for the teams construct, i.e. to form the outer league
1392 // of teams
1393 static inline bool __kmp_is_entering_teams(int active_level, int level,
1394                                            int teams_level, kmp_va_list ap) {
1395   return ((ap == NULL && active_level == 0) ||
1396           (ap && teams_level > 0 && teams_level == level));
1397 }
1398
1399 // AC: This is start of parallel that is nested inside teams construct.
1400 // The team is actual (hot), all workers are ready at the fork barrier.
1401 // No lock needed to initialize the team a bit, then free workers.
1402 static inline int
1403 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1404                     kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1405                     enum fork_context_e call_context, microtask_t microtask,
1406                     launch_t invoker, int master_set_numthreads, int level,
1407 #if OMPT_SUPPORT
1408                     ompt_data_t ompt_parallel_data, void *return_address,
1409 #endif
1410                     kmp_va_list ap) {
1411   void **argv;
1412   int i;
1413
1414   parent_team->t.t_ident = loc;
1415   __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1416   parent_team->t.t_argc = argc;
1417   argv = (void **)parent_team->t.t_argv;
1418   for (i = argc - 1; i >= 0; --i) {
1419     *argv++ = va_arg(kmp_va_deref(ap), void *);
1420   }
1421   // Increment our nested depth levels, but not increase the serialization
1422   if (parent_team == master_th->th.th_serial_team) {
1423     // AC: we are in serialized parallel
1424     __kmpc_serialized_parallel(loc, gtid);
1425     KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1426
1427     if (call_context == fork_context_gnu) {
1428       // AC: need to decrement t_serialized for enquiry functions to work
1429       // correctly, will restore at join time
1430       parent_team->t.t_serialized--;
1431       return TRUE;
1432     }
1433
1434 #if OMPD_SUPPORT
1435     parent_team->t.t_pkfn = microtask;
1436 #endif
1437
1438 #if OMPT_SUPPORT
1439     void *dummy;
1440     void **exit_frame_p;
1441     ompt_data_t *implicit_task_data;
1442     ompt_lw_taskteam_t lw_taskteam;
1443
1444     if (ompt_enabled.enabled) {
1445       __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1446                               &ompt_parallel_data, return_address);
1447       exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1448
1449       __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1450       // Don't use lw_taskteam after linking. Content was swapped.
1451
1452       /* OMPT implicit task begin */
1453       implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1454       if (ompt_enabled.ompt_callback_implicit_task) {
1455         OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1456         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1457             ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1458             1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1459       }
1460
1461       /* OMPT state */
1462       master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1463     } else {
1464       exit_frame_p = &dummy;
1465     }
1466 #endif
1467
1468     // AC: need to decrement t_serialized for enquiry functions to work
1469     // correctly, will restore at join time
1470     parent_team->t.t_serialized--;
1471
1472     {
1473       KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1474       KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1475       __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1476 #if OMPT_SUPPORT
1477                              ,
1478                              exit_frame_p
1479 #endif
1480                              );
1481     }
1482
1483 #if OMPT_SUPPORT
1484     if (ompt_enabled.enabled) {
1485       *exit_frame_p = NULL;
1486       OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1487       if (ompt_enabled.ompt_callback_implicit_task) {
1488         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1489             ompt_scope_end, NULL, implicit_task_data, 1,
1490             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1491       }
1492       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1493       __ompt_lw_taskteam_unlink(master_th);
1494       if (ompt_enabled.ompt_callback_parallel_end) {
1495         ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1496             &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1497             OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1498       }
1499       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1500     }
1501 #endif
1502     return TRUE;
1503   }
1504
1505   parent_team->t.t_pkfn = microtask;
1506   parent_team->t.t_invoke = invoker;
1507   KMP_ATOMIC_INC(&root->r.r_in_parallel);
1508   parent_team->t.t_active_level++;
1509   parent_team->t.t_level++;
1510   parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1511
1512   // If the threads allocated to the team are less than the thread limit, update
1513   // the thread limit here. th_teams_size.nth is specific to this team nested
1514   // in a teams construct, the team is fully created, and we're about to do
1515   // the actual fork. Best to do this here so that the subsequent uses below
1516   // and in the join have the correct value.
1517   master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1518
1519 #if OMPT_SUPPORT
1520   if (ompt_enabled.enabled) {
1521     ompt_lw_taskteam_t lw_taskteam;
1522     __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1523                             return_address);
1524     __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1525   }
1526 #endif
1527
1528   /* Change number of threads in the team if requested */
1529   if (master_set_numthreads) { // The parallel has num_threads clause
1530     if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1531       // AC: only can reduce number of threads dynamically, can't increase
1532       kmp_info_t **other_threads = parent_team->t.t_threads;
1533       // NOTE: if using distributed barrier, we need to run this code block
1534       // even when the team size appears not to have changed from the max.
1535       int old_proc = master_th->th.th_teams_size.nth;
1536       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1537         __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1538         __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1539       }
1540       parent_team->t.t_nproc = master_set_numthreads;
1541       for (i = 0; i < master_set_numthreads; ++i) {
1542         other_threads[i]->th.th_team_nproc = master_set_numthreads;
1543       }
1544     }
1545     // Keep extra threads hot in the team for possible next parallels
1546     master_th->th.th_set_nproc = 0;
1547   }
1548
1549 #if USE_DEBUGGER
1550   if (__kmp_debugging) { // Let debugger override number of threads.
1551     int nth = __kmp_omp_num_threads(loc);
1552     if (nth > 0) { // 0 means debugger doesn't want to change num threads
1553       master_set_numthreads = nth;
1554     }
1555   }
1556 #endif
1557
1558   // Figure out the proc_bind policy for the nested parallel within teams
1559   kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1560   // proc_bind_default means don't update
1561   kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1562   if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1563     proc_bind = proc_bind_false;
1564   } else {
1565     // No proc_bind clause specified; use current proc-bind-var
1566     if (proc_bind == proc_bind_default) {
1567       proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1568     }
1569     /* else: The proc_bind policy was specified explicitly on parallel clause.
1570        This overrides proc-bind-var for this parallel region, but does not
1571        change proc-bind-var. */
1572     // Figure the value of proc-bind-var for the child threads.
1573     if ((level + 1 < __kmp_nested_proc_bind.used) &&
1574         (__kmp_nested_proc_bind.bind_types[level + 1] !=
1575          master_th->th.th_current_task->td_icvs.proc_bind)) {
1576       proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1577     }
1578   }
1579   KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1580   // Need to change the bind-var ICV to correct value for each implicit task
1581   if (proc_bind_icv != proc_bind_default &&
1582       master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1583     kmp_info_t **other_threads = parent_team->t.t_threads;
1584     for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1585       other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1586     }
1587   }
1588   // Reset for next parallel region
1589   master_th->th.th_set_proc_bind = proc_bind_default;
1590
1591 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1592   if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1593        KMP_ITT_DEBUG) &&
1594       __kmp_forkjoin_frames_mode == 3 &&
1595       parent_team->t.t_active_level == 1 // only report frames at level 1
1596       && master_th->th.th_teams_size.nteams == 1) {
1597     kmp_uint64 tmp_time = __itt_get_timestamp();
1598     master_th->th.th_frame_time = tmp_time;
1599     parent_team->t.t_region_time = tmp_time;
1600   }
1601   if (__itt_stack_caller_create_ptr) {
1602     KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1603     // create new stack stitching id before entering fork barrier
1604     parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1605   }
1606 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1607 #if KMP_AFFINITY_SUPPORTED
1608   __kmp_partition_places(parent_team);
1609 #endif
1610
1611   KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1612                 "master_th=%p, gtid=%d\n",
1613                 root, parent_team, master_th, gtid));
1614   __kmp_internal_fork(loc, gtid, parent_team);
1615   KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1616                 "master_th=%p, gtid=%d\n",
1617                 root, parent_team, master_th, gtid));
1618
1619   if (call_context == fork_context_gnu)
1620     return TRUE;
1621
1622   /* Invoke microtask for PRIMARY thread */
1623   KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1624                 parent_team->t.t_id, parent_team->t.t_pkfn));
1625
1626   if (!parent_team->t.t_invoke(gtid)) {
1627     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1628   }
1629   KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1630                 parent_team->t.t_id, parent_team->t.t_pkfn));
1631   KMP_MB(); /* Flush all pending memory write invalidates.  */
1632
1633   KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1634
1635   return TRUE;
1636 }
1637
1638 // Create a serialized parallel region
1639 static inline int
1640 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1641                        kmp_int32 argc, microtask_t microtask, launch_t invoker,
1642                        kmp_info_t *master_th, kmp_team_t *parent_team,
1643 #if OMPT_SUPPORT
1644                        ompt_data_t *ompt_parallel_data, void **return_address,
1645                        ompt_data_t **parent_task_data,
1646 #endif
1647                        kmp_va_list ap) {
1648   kmp_team_t *team;
1649   int i;
1650   void **argv;
1651
1652 /* josh todo: hypothetical question: what do we do for OS X*? */
1653 #if KMP_OS_LINUX &&                                                            \
1654     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1655   void *args[argc];
1656 #else
1657   void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1658 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1659           KMP_ARCH_AARCH64) */
1660
1661   KA_TRACE(
1662       20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1663
1664   __kmpc_serialized_parallel(loc, gtid);
1665
1666 #if OMPD_SUPPORT
1667   master_th->th.th_serial_team->t.t_pkfn = microtask;
1668 #endif
1669
1670   if (call_context == fork_context_intel) {
1671     /* TODO this sucks, use the compiler itself to pass args! :) */
1672     master_th->th.th_serial_team->t.t_ident = loc;
1673     if (!ap) {
1674       // revert change made in __kmpc_serialized_parallel()
1675       master_th->th.th_serial_team->t.t_level--;
1676 // Get args from parent team for teams construct
1677
1678 #if OMPT_SUPPORT
1679       void *dummy;
1680       void **exit_frame_p;
1681       ompt_task_info_t *task_info;
1682       ompt_lw_taskteam_t lw_taskteam;
1683
1684       if (ompt_enabled.enabled) {
1685         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1686                                 ompt_parallel_data, *return_address);
1687
1688         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1689         // don't use lw_taskteam after linking. content was swaped
1690         task_info = OMPT_CUR_TASK_INFO(master_th);
1691         exit_frame_p = &(task_info->frame.exit_frame.ptr);
1692         if (ompt_enabled.ompt_callback_implicit_task) {
1693           OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1694           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1695               ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1696               &(task_info->task_data), 1,
1697               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1698         }
1699
1700         /* OMPT state */
1701         master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1702       } else {
1703         exit_frame_p = &dummy;
1704       }
1705 #endif
1706
1707       {
1708         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1709         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1710         __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1711 #if OMPT_SUPPORT
1712                                ,
1713                                exit_frame_p
1714 #endif
1715                                );
1716       }
1717
1718 #if OMPT_SUPPORT
1719       if (ompt_enabled.enabled) {
1720         *exit_frame_p = NULL;
1721         if (ompt_enabled.ompt_callback_implicit_task) {
1722           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1723               ompt_scope_end, NULL, &(task_info->task_data), 1,
1724               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1725         }
1726         *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1727         __ompt_lw_taskteam_unlink(master_th);
1728         if (ompt_enabled.ompt_callback_parallel_end) {
1729           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1730               ompt_parallel_data, *parent_task_data,
1731               OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1732         }
1733         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1734       }
1735 #endif
1736     } else if (microtask == (microtask_t)__kmp_teams_master) {
1737       KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1738       team = master_th->th.th_team;
1739       // team->t.t_pkfn = microtask;
1740       team->t.t_invoke = invoker;
1741       __kmp_alloc_argv_entries(argc, team, TRUE);
1742       team->t.t_argc = argc;
1743       argv = (void **)team->t.t_argv;
1744       if (ap) {
1745         for (i = argc - 1; i >= 0; --i)
1746           *argv++ = va_arg(kmp_va_deref(ap), void *);
1747       } else {
1748         for (i = 0; i < argc; ++i)
1749           // Get args from parent team for teams construct
1750           argv[i] = parent_team->t.t_argv[i];
1751       }
1752       // AC: revert change made in __kmpc_serialized_parallel()
1753       //     because initial code in teams should have level=0
1754       team->t.t_level--;
1755       // AC: call special invoker for outer "parallel" of teams construct
1756       invoker(gtid);
1757 #if OMPT_SUPPORT
1758       if (ompt_enabled.enabled) {
1759         ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1760         if (ompt_enabled.ompt_callback_implicit_task) {
1761           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1762               ompt_scope_end, NULL, &(task_info->task_data), 0,
1763               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1764         }
1765         if (ompt_enabled.ompt_callback_parallel_end) {
1766           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1767               ompt_parallel_data, *parent_task_data,
1768               OMPT_INVOKER(call_context) | ompt_parallel_league,
1769               *return_address);
1770         }
1771         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1772       }
1773 #endif
1774     } else {
1775       argv = args;
1776       for (i = argc - 1; i >= 0; --i)
1777         *argv++ = va_arg(kmp_va_deref(ap), void *);
1778       KMP_MB();
1779
1780 #if OMPT_SUPPORT
1781       void *dummy;
1782       void **exit_frame_p;
1783       ompt_task_info_t *task_info;
1784       ompt_lw_taskteam_t lw_taskteam;
1785       ompt_data_t *implicit_task_data;
1786
1787       if (ompt_enabled.enabled) {
1788         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1789                                 ompt_parallel_data, *return_address);
1790         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1791         // don't use lw_taskteam after linking. content was swaped
1792         task_info = OMPT_CUR_TASK_INFO(master_th);
1793         exit_frame_p = &(task_info->frame.exit_frame.ptr);
1794
1795         /* OMPT implicit task begin */
1796         implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1797         if (ompt_enabled.ompt_callback_implicit_task) {
1798           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1799               ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1800               implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1801               ompt_task_implicit);
1802           OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1803         }
1804
1805         /* OMPT state */
1806         master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1807       } else {
1808         exit_frame_p = &dummy;
1809       }
1810 #endif
1811
1812       {
1813         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1814         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1815         __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1816 #if OMPT_SUPPORT
1817                                ,
1818                                exit_frame_p
1819 #endif
1820                                );
1821       }
1822
1823 #if OMPT_SUPPORT
1824       if (ompt_enabled.enabled) {
1825         *exit_frame_p = NULL;
1826         if (ompt_enabled.ompt_callback_implicit_task) {
1827           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1828               ompt_scope_end, NULL, &(task_info->task_data), 1,
1829               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1830         }
1831
1832         *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1833         __ompt_lw_taskteam_unlink(master_th);
1834         if (ompt_enabled.ompt_callback_parallel_end) {
1835           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1836               ompt_parallel_data, *parent_task_data,
1837               OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1838         }
1839         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1840       }
1841 #endif
1842     }
1843   } else if (call_context == fork_context_gnu) {
1844 #if OMPT_SUPPORT
1845     if (ompt_enabled.enabled) {
1846       ompt_lw_taskteam_t lwt;
1847       __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1848                               *return_address);
1849
1850       lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1851       __ompt_lw_taskteam_link(&lwt, master_th, 1);
1852     }
1853 // don't use lw_taskteam after linking. content was swaped
1854 #endif
1855
1856     // we were called from GNU native code
1857     KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1858     return FALSE;
1859   } else {
1860     KMP_ASSERT2(call_context < fork_context_last,
1861                 "__kmp_serial_fork_call: unknown fork_context parameter");
1862   }
1863
1864   KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1865   KMP_MB();
1866   return FALSE;
1867 }
1868
1869 /* most of the work for a fork */
1870 /* return true if we really went parallel, false if serialized */
1871 int __kmp_fork_call(ident_t *loc, int gtid,
1872                     enum fork_context_e call_context, // Intel, GNU, ...
1873                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1874                     kmp_va_list ap) {
1875   void **argv;
1876   int i;
1877   int master_tid;
1878   int master_this_cons;
1879   kmp_team_t *team;
1880   kmp_team_t *parent_team;
1881   kmp_info_t *master_th;
1882   kmp_root_t *root;
1883   int nthreads;
1884   int master_active;
1885   int master_set_numthreads;
1886   int task_thread_limit = 0;
1887   int level;
1888   int active_level;
1889   int teams_level;
1890 #if KMP_NESTED_HOT_TEAMS
1891   kmp_hot_team_ptr_t **p_hot_teams;
1892 #endif
1893   { // KMP_TIME_BLOCK
1894     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1895     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1896
1897     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1898     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1899       /* Some systems prefer the stack for the root thread(s) to start with */
1900       /* some gap from the parent stack to prevent false sharing. */
1901       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1902       /* These 2 lines below are so this does not get optimized out */
1903       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1904         __kmp_stkpadding += (short)((kmp_int64)dummy);
1905     }
1906
1907     /* initialize if needed */
1908     KMP_DEBUG_ASSERT(
1909         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1910     if (!TCR_4(__kmp_init_parallel))
1911       __kmp_parallel_initialize();
1912     __kmp_resume_if_soft_paused();
1913
1914     /* setup current data */
1915     // AC: potentially unsafe, not in sync with library shutdown,
1916     // __kmp_threads can be freed
1917     master_th = __kmp_threads[gtid];
1918
1919     parent_team = master_th->th.th_team;
1920     master_tid = master_th->th.th_info.ds.ds_tid;
1921     master_this_cons = master_th->th.th_local.this_construct;
1922     root = master_th->th.th_root;
1923     master_active = root->r.r_active;
1924     master_set_numthreads = master_th->th.th_set_nproc;
1925     task_thread_limit =
1926         master_th->th.th_current_task->td_icvs.task_thread_limit;
1927
1928 #if OMPT_SUPPORT
1929     ompt_data_t ompt_parallel_data = ompt_data_none;
1930     ompt_data_t *parent_task_data;
1931     ompt_frame_t *ompt_frame;
1932     void *return_address = NULL;
1933
1934     if (ompt_enabled.enabled) {
1935       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1936                                     NULL, NULL);
1937       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1938     }
1939 #endif
1940
1941     // Assign affinity to root thread if it hasn't happened yet
1942     __kmp_assign_root_init_mask();
1943
1944     // Nested level will be an index in the nested nthreads array
1945     level = parent_team->t.t_level;
1946     // used to launch non-serial teams even if nested is not allowed
1947     active_level = parent_team->t.t_active_level;
1948     // needed to check nesting inside the teams
1949     teams_level = master_th->th.th_teams_level;
1950 #if KMP_NESTED_HOT_TEAMS
1951     p_hot_teams = &master_th->th.th_hot_teams;
1952     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1953       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1954           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1955       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1956       // it is either actual or not needed (when active_level > 0)
1957       (*p_hot_teams)[0].hot_team_nth = 1;
1958     }
1959 #endif
1960
1961 #if OMPT_SUPPORT
1962     if (ompt_enabled.enabled) {
1963       if (ompt_enabled.ompt_callback_parallel_begin) {
1964         int team_size = master_set_numthreads
1965                             ? master_set_numthreads
1966                             : get__nproc_2(parent_team, master_tid);
1967         int flags = OMPT_INVOKER(call_context) |
1968                     ((microtask == (microtask_t)__kmp_teams_master)
1969                          ? ompt_parallel_league
1970                          : ompt_parallel_team);
1971         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1972             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1973             return_address);
1974       }
1975       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1976     }
1977 #endif
1978
1979     master_th->th.th_ident = loc;
1980
1981     // Parallel closely nested in teams construct:
1982     if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1983       return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1984                                  call_context, microtask, invoker,
1985                                  master_set_numthreads, level,
1986 #if OMPT_SUPPORT
1987                                  ompt_parallel_data, return_address,
1988 #endif
1989                                  ap);
1990     } // End parallel closely nested in teams construct
1991
1992 #if KMP_DEBUG
1993     if (__kmp_tasking_mode != tskm_immediate_exec) {
1994       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1995                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1996     }
1997 #endif
1998
1999     // Need this to happen before we determine the number of threads, not while
2000     // we are allocating the team
2001     //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2002
2003     // Determine the number of threads
2004     int enter_teams =
2005         __kmp_is_entering_teams(active_level, level, teams_level, ap);
2006     if ((!enter_teams &&
2007          (parent_team->t.t_active_level >=
2008           master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2009         (__kmp_library == library_serial)) {
2010       KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2011       nthreads = 1;
2012     } else {
2013       nthreads = master_set_numthreads
2014                      ? master_set_numthreads
2015                      // TODO: get nproc directly from current task
2016                      : get__nproc_2(parent_team, master_tid);
2017       // Use the thread_limit set for the current target task if exists, else go
2018       // with the deduced nthreads
2019       nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2020                      ? task_thread_limit
2021                      : nthreads;
2022       // Check if we need to take forkjoin lock? (no need for serialized
2023       // parallel out of teams construct).
2024       if (nthreads > 1) {
2025         /* determine how many new threads we can use */
2026         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2027         /* AC: If we execute teams from parallel region (on host), then teams
2028            should be created but each can only have 1 thread if nesting is
2029            disabled. If teams called from serial region, then teams and their
2030            threads should be created regardless of the nesting setting. */
2031         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2032                                          nthreads, enter_teams);
2033         if (nthreads == 1) {
2034           // Free lock for single thread execution here; for multi-thread
2035           // execution it will be freed later after team of threads created
2036           // and initialized
2037           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2038         }
2039       }
2040     }
2041     KMP_DEBUG_ASSERT(nthreads > 0);
2042
2043     // If we temporarily changed the set number of threads then restore it now
2044     master_th->th.th_set_nproc = 0;
2045
2046     if (nthreads == 1) {
2047       return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2048                                     invoker, master_th, parent_team,
2049 #if OMPT_SUPPORT
2050                                     &ompt_parallel_data, &return_address,
2051                                     &parent_task_data,
2052 #endif
2053                                     ap);
2054     } // if (nthreads == 1)
2055
2056     // GEH: only modify the executing flag in the case when not serialized
2057     //      serialized case is handled in kmpc_serialized_parallel
2058     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2059                   "curtask=%p, curtask_max_aclevel=%d\n",
2060                   parent_team->t.t_active_level, master_th,
2061                   master_th->th.th_current_task,
2062                   master_th->th.th_current_task->td_icvs.max_active_levels));
2063     // TODO: GEH - cannot do this assertion because root thread not set up as
2064     // executing
2065     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2066     master_th->th.th_current_task->td_flags.executing = 0;
2067
2068     if (!master_th->th.th_teams_microtask || level > teams_level) {
2069       /* Increment our nested depth level */
2070       KMP_ATOMIC_INC(&root->r.r_in_parallel);
2071     }
2072
2073     // See if we need to make a copy of the ICVs.
2074     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2075     if ((level + 1 < __kmp_nested_nth.used) &&
2076         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2077       nthreads_icv = __kmp_nested_nth.nth[level + 1];
2078     } else {
2079       nthreads_icv = 0; // don't update
2080     }
2081
2082     // Figure out the proc_bind_policy for the new team.
2083     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2084     // proc_bind_default means don't update
2085     kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2086     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2087       proc_bind = proc_bind_false;
2088     } else {
2089       // No proc_bind clause specified; use current proc-bind-var for this
2090       // parallel region
2091       if (proc_bind == proc_bind_default) {
2092         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2093       }
2094       // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2095       if (master_th->th.th_teams_microtask &&
2096           microtask == (microtask_t)__kmp_teams_master) {
2097         proc_bind = __kmp_teams_proc_bind;
2098       }
2099       /* else: The proc_bind policy was specified explicitly on parallel clause.
2100          This overrides proc-bind-var for this parallel region, but does not
2101          change proc-bind-var. */
2102       // Figure the value of proc-bind-var for the child threads.
2103       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2104           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2105            master_th->th.th_current_task->td_icvs.proc_bind)) {
2106         // Do not modify the proc bind icv for the two teams construct forks
2107         // They just let the proc bind icv pass through
2108         if (!master_th->th.th_teams_microtask ||
2109             !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2110           proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2111       }
2112     }
2113
2114     // Reset for next parallel region
2115     master_th->th.th_set_proc_bind = proc_bind_default;
2116
2117     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2118       kmp_internal_control_t new_icvs;
2119       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2120       new_icvs.next = NULL;
2121       if (nthreads_icv > 0) {
2122         new_icvs.nproc = nthreads_icv;
2123       }
2124       if (proc_bind_icv != proc_bind_default) {
2125         new_icvs.proc_bind = proc_bind_icv;
2126       }
2127
2128       /* allocate a new parallel team */
2129       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2130       team = __kmp_allocate_team(root, nthreads, nthreads,
2131 #if OMPT_SUPPORT
2132                                  ompt_parallel_data,
2133 #endif
2134                                  proc_bind, &new_icvs,
2135                                  argc USE_NESTED_HOT_ARG(master_th));
2136       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2137         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2138     } else {
2139       /* allocate a new parallel team */
2140       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2141       team = __kmp_allocate_team(root, nthreads, nthreads,
2142 #if OMPT_SUPPORT
2143                                  ompt_parallel_data,
2144 #endif
2145                                  proc_bind,
2146                                  &master_th->th.th_current_task->td_icvs,
2147                                  argc USE_NESTED_HOT_ARG(master_th));
2148       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2149         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2150                   &master_th->th.th_current_task->td_icvs);
2151     }
2152     KF_TRACE(
2153         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2154
2155     /* setup the new team */
2156     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2157     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2158     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2159     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2160     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2161 #if OMPT_SUPPORT
2162     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2163                           return_address);
2164 #endif
2165     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2166     // TODO: parent_team->t.t_level == INT_MAX ???
2167     if (!master_th->th.th_teams_microtask || level > teams_level) {
2168       int new_level = parent_team->t.t_level + 1;
2169       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2170       new_level = parent_team->t.t_active_level + 1;
2171       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2172     } else {
2173       // AC: Do not increase parallel level at start of the teams construct
2174       int new_level = parent_team->t.t_level;
2175       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2176       new_level = parent_team->t.t_active_level;
2177       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2178     }
2179     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2180     // set primary thread's schedule as new run-time schedule
2181     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2182
2183     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2184     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2185
2186     // Update the floating point rounding in the team if required.
2187     propagateFPControl(team);
2188 #if OMPD_SUPPORT
2189     if (ompd_state & OMPD_ENABLE_BP)
2190       ompd_bp_parallel_begin();
2191 #endif
2192
2193     if (__kmp_tasking_mode != tskm_immediate_exec) {
2194       // Set primary thread's task team to team's task team. Unless this is hot
2195       // team, it should be NULL.
2196       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2197                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2198       KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2199                     "%p, new task_team %p / team %p\n",
2200                     __kmp_gtid_from_thread(master_th),
2201                     master_th->th.th_task_team, parent_team,
2202                     team->t.t_task_team[master_th->th.th_task_state], team));
2203
2204       if (active_level || master_th->th.th_task_team) {
2205         // Take a memo of primary thread's task_state
2206         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2207         if (master_th->th.th_task_state_top >=
2208             master_th->th.th_task_state_stack_sz) { // increase size
2209           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2210           kmp_uint8 *old_stack, *new_stack;
2211           kmp_uint32 i;
2212           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2213           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2214             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2215           }
2216           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2217                ++i) { // zero-init rest of stack
2218             new_stack[i] = 0;
2219           }
2220           old_stack = master_th->th.th_task_state_memo_stack;
2221           master_th->th.th_task_state_memo_stack = new_stack;
2222           master_th->th.th_task_state_stack_sz = new_size;
2223           __kmp_free(old_stack);
2224         }
2225         // Store primary thread's task_state on stack
2226         master_th->th
2227             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2228             master_th->th.th_task_state;
2229         master_th->th.th_task_state_top++;
2230 #if KMP_NESTED_HOT_TEAMS
2231         if (master_th->th.th_hot_teams &&
2232             active_level < __kmp_hot_teams_max_level &&
2233             team == master_th->th.th_hot_teams[active_level].hot_team) {
2234           // Restore primary thread's nested state if nested hot team
2235           master_th->th.th_task_state =
2236               master_th->th
2237                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2238         } else {
2239 #endif
2240           master_th->th.th_task_state = 0;
2241 #if KMP_NESTED_HOT_TEAMS
2242         }
2243 #endif
2244       }
2245 #if !KMP_NESTED_HOT_TEAMS
2246       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2247                        (team == root->r.r_hot_team));
2248 #endif
2249     }
2250
2251     KA_TRACE(
2252         20,
2253         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2254          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2255          team->t.t_nproc));
2256     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2257                      (team->t.t_master_tid == 0 &&
2258                       (team->t.t_parent == root->r.r_root_team ||
2259                        team->t.t_parent->t.t_serialized)));
2260     KMP_MB();
2261
2262     /* now, setup the arguments */
2263     argv = (void **)team->t.t_argv;
2264     if (ap) {
2265       for (i = argc - 1; i >= 0; --i) {
2266         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2267         KMP_CHECK_UPDATE(*argv, new_argv);
2268         argv++;
2269       }
2270     } else {
2271       for (i = 0; i < argc; ++i) {
2272         // Get args from parent team for teams construct
2273         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2274       }
2275     }
2276
2277     /* now actually fork the threads */
2278     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2279     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2280       root->r.r_active = TRUE;
2281
2282     __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2283     __kmp_setup_icv_copy(team, nthreads,
2284                          &master_th->th.th_current_task->td_icvs, loc);
2285
2286 #if OMPT_SUPPORT
2287     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2288 #endif
2289
2290     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2291
2292 #if USE_ITT_BUILD
2293     if (team->t.t_active_level == 1 // only report frames at level 1
2294         && !master_th->th.th_teams_microtask) { // not in teams construct
2295 #if USE_ITT_NOTIFY
2296       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2297           (__kmp_forkjoin_frames_mode == 3 ||
2298            __kmp_forkjoin_frames_mode == 1)) {
2299         kmp_uint64 tmp_time = 0;
2300         if (__itt_get_timestamp_ptr)
2301           tmp_time = __itt_get_timestamp();
2302         // Internal fork - report frame begin
2303         master_th->th.th_frame_time = tmp_time;
2304         if (__kmp_forkjoin_frames_mode == 3)
2305           team->t.t_region_time = tmp_time;
2306       } else
2307 // only one notification scheme (either "submit" or "forking/joined", not both)
2308 #endif /* USE_ITT_NOTIFY */
2309         if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2310             __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2311           // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2312           __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2313         }
2314     }
2315 #endif /* USE_ITT_BUILD */
2316
2317     /* now go on and do the work */
2318     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2319     KMP_MB();
2320     KF_TRACE(10,
2321              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2322               root, team, master_th, gtid));
2323
2324 #if USE_ITT_BUILD
2325     if (__itt_stack_caller_create_ptr) {
2326       // create new stack stitching id before entering fork barrier
2327       if (!enter_teams) {
2328         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2329         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2330       } else if (parent_team->t.t_serialized) {
2331         // keep stack stitching id in the serialized parent_team;
2332         // current team will be used for parallel inside the teams;
2333         // if parent_team is active, then it already keeps stack stitching id
2334         // for the league of teams
2335         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2336         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2337       }
2338     }
2339 #endif /* USE_ITT_BUILD */
2340
2341     // AC: skip __kmp_internal_fork at teams construct, let only primary
2342     // threads execute
2343     if (ap) {
2344       __kmp_internal_fork(loc, gtid, team);
2345       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2346                     "master_th=%p, gtid=%d\n",
2347                     root, team, master_th, gtid));
2348     }
2349
2350     if (call_context == fork_context_gnu) {
2351       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2352       return TRUE;
2353     }
2354
2355     /* Invoke microtask for PRIMARY thread */
2356     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2357                   team->t.t_id, team->t.t_pkfn));
2358   } // END of timer KMP_fork_call block
2359
2360 #if KMP_STATS_ENABLED
2361   // If beginning a teams construct, then change thread state
2362   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2363   if (!ap) {
2364     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2365   }
2366 #endif
2367
2368   if (!team->t.t_invoke(gtid)) {
2369     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2370   }
2371
2372 #if KMP_STATS_ENABLED
2373   // If was beginning of a teams construct, then reset thread state
2374   if (!ap) {
2375     KMP_SET_THREAD_STATE(previous_state);
2376   }
2377 #endif
2378
2379   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2380                 team->t.t_id, team->t.t_pkfn));
2381   KMP_MB(); /* Flush all pending memory write invalidates.  */
2382
2383   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2384 #if OMPT_SUPPORT
2385   if (ompt_enabled.enabled) {
2386     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2387   }
2388 #endif
2389
2390   return TRUE;
2391 }
2392
2393 #if OMPT_SUPPORT
2394 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2395                                             kmp_team_t *team) {
2396   // restore state outside the region
2397   thread->th.ompt_thread_info.state =
2398       ((team->t.t_serialized) ? ompt_state_work_serial
2399                               : ompt_state_work_parallel);
2400 }
2401
2402 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2403                                    kmp_team_t *team, ompt_data_t *parallel_data,
2404                                    int flags, void *codeptr) {
2405   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2406   if (ompt_enabled.ompt_callback_parallel_end) {
2407     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2408         parallel_data, &(task_info->task_data), flags, codeptr);
2409   }
2410
2411   task_info->frame.enter_frame = ompt_data_none;
2412   __kmp_join_restore_state(thread, team);
2413 }
2414 #endif
2415
2416 void __kmp_join_call(ident_t *loc, int gtid
2417 #if OMPT_SUPPORT
2418                      ,
2419                      enum fork_context_e fork_context
2420 #endif
2421                      ,
2422                      int exit_teams) {
2423   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2424   kmp_team_t *team;
2425   kmp_team_t *parent_team;
2426   kmp_info_t *master_th;
2427   kmp_root_t *root;
2428   int master_active;
2429
2430   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2431
2432   /* setup current data */
2433   master_th = __kmp_threads[gtid];
2434   root = master_th->th.th_root;
2435   team = master_th->th.th_team;
2436   parent_team = team->t.t_parent;
2437
2438   master_th->th.th_ident = loc;
2439
2440 #if OMPT_SUPPORT
2441   void *team_microtask = (void *)team->t.t_pkfn;
2442   // For GOMP interface with serialized parallel, need the
2443   // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2444   // and end-parallel events.
2445   if (ompt_enabled.enabled &&
2446       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2447     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2448   }
2449 #endif
2450
2451 #if KMP_DEBUG
2452   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2453     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2454                   "th_task_team = %p\n",
2455                   __kmp_gtid_from_thread(master_th), team,
2456                   team->t.t_task_team[master_th->th.th_task_state],
2457                   master_th->th.th_task_team));
2458     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2459                      team->t.t_task_team[master_th->th.th_task_state]);
2460   }
2461 #endif
2462
2463   if (team->t.t_serialized) {
2464     if (master_th->th.th_teams_microtask) {
2465       // We are in teams construct
2466       int level = team->t.t_level;
2467       int tlevel = master_th->th.th_teams_level;
2468       if (level == tlevel) {
2469         // AC: we haven't incremented it earlier at start of teams construct,
2470         //     so do it here - at the end of teams construct
2471         team->t.t_level++;
2472       } else if (level == tlevel + 1) {
2473         // AC: we are exiting parallel inside teams, need to increment
2474         // serialization in order to restore it in the next call to
2475         // __kmpc_end_serialized_parallel
2476         team->t.t_serialized++;
2477       }
2478     }
2479     __kmpc_end_serialized_parallel(loc, gtid);
2480
2481 #if OMPT_SUPPORT
2482     if (ompt_enabled.enabled) {
2483       if (fork_context == fork_context_gnu) {
2484         __ompt_lw_taskteam_unlink(master_th);
2485       }
2486       __kmp_join_restore_state(master_th, parent_team);
2487     }
2488 #endif
2489
2490     return;
2491   }
2492
2493   master_active = team->t.t_master_active;
2494
2495   if (!exit_teams) {
2496     // AC: No barrier for internal teams at exit from teams construct.
2497     //     But there is barrier for external team (league).
2498     __kmp_internal_join(loc, gtid, team);
2499 #if USE_ITT_BUILD
2500     if (__itt_stack_caller_create_ptr) {
2501       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2502       // destroy the stack stitching id after join barrier
2503       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2504       team->t.t_stack_id = NULL;
2505     }
2506 #endif
2507   } else {
2508     master_th->th.th_task_state =
2509         0; // AC: no tasking in teams (out of any parallel)
2510 #if USE_ITT_BUILD
2511     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2512       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2513       // destroy the stack stitching id on exit from the teams construct
2514       // if parent_team is active, then the id will be destroyed later on
2515       // by master of the league of teams
2516       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2517       parent_team->t.t_stack_id = NULL;
2518     }
2519 #endif
2520   }
2521
2522   KMP_MB();
2523
2524 #if OMPT_SUPPORT
2525   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2526   void *codeptr = team->t.ompt_team_info.master_return_address;
2527 #endif
2528
2529 #if USE_ITT_BUILD
2530   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2531   if (team->t.t_active_level == 1 &&
2532       (!master_th->th.th_teams_microtask || /* not in teams construct */
2533        master_th->th.th_teams_size.nteams == 1)) {
2534     master_th->th.th_ident = loc;
2535     // only one notification scheme (either "submit" or "forking/joined", not
2536     // both)
2537     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2538         __kmp_forkjoin_frames_mode == 3)
2539       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2540                              master_th->th.th_frame_time, 0, loc,
2541                              master_th->th.th_team_nproc, 1);
2542     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2543              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2544       __kmp_itt_region_joined(gtid);
2545   } // active_level == 1
2546 #endif /* USE_ITT_BUILD */
2547
2548 #if KMP_AFFINITY_SUPPORTED
2549   if (!exit_teams) {
2550     // Restore master thread's partition.
2551     master_th->th.th_first_place = team->t.t_first_place;
2552     master_th->th.th_last_place = team->t.t_last_place;
2553   }
2554 #endif // KMP_AFFINITY_SUPPORTED
2555
2556   if (master_th->th.th_teams_microtask && !exit_teams &&
2557       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2558       team->t.t_level == master_th->th.th_teams_level + 1) {
2559 // AC: We need to leave the team structure intact at the end of parallel
2560 // inside the teams construct, so that at the next parallel same (hot) team
2561 // works, only adjust nesting levels
2562 #if OMPT_SUPPORT
2563     ompt_data_t ompt_parallel_data = ompt_data_none;
2564     if (ompt_enabled.enabled) {
2565       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2566       if (ompt_enabled.ompt_callback_implicit_task) {
2567         int ompt_team_size = team->t.t_nproc;
2568         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2569             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2570             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2571       }
2572       task_info->frame.exit_frame = ompt_data_none;
2573       task_info->task_data = ompt_data_none;
2574       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2575       __ompt_lw_taskteam_unlink(master_th);
2576     }
2577 #endif
2578     /* Decrement our nested depth level */
2579     team->t.t_level--;
2580     team->t.t_active_level--;
2581     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2582
2583     // Restore number of threads in the team if needed. This code relies on
2584     // the proper adjustment of th_teams_size.nth after the fork in
2585     // __kmp_teams_master on each teams primary thread in the case that
2586     // __kmp_reserve_threads reduced it.
2587     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2588       int old_num = master_th->th.th_team_nproc;
2589       int new_num = master_th->th.th_teams_size.nth;
2590       kmp_info_t **other_threads = team->t.t_threads;
2591       team->t.t_nproc = new_num;
2592       for (int i = 0; i < old_num; ++i) {
2593         other_threads[i]->th.th_team_nproc = new_num;
2594       }
2595       // Adjust states of non-used threads of the team
2596       for (int i = old_num; i < new_num; ++i) {
2597         // Re-initialize thread's barrier data.
2598         KMP_DEBUG_ASSERT(other_threads[i]);
2599         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2600         for (int b = 0; b < bs_last_barrier; ++b) {
2601           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2602           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2603 #if USE_DEBUGGER
2604           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2605 #endif
2606         }
2607         if (__kmp_tasking_mode != tskm_immediate_exec) {
2608           // Synchronize thread's task state
2609           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2610         }
2611       }
2612     }
2613
2614 #if OMPT_SUPPORT
2615     if (ompt_enabled.enabled) {
2616       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2617                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2618     }
2619 #endif
2620
2621     return;
2622   }
2623
2624   /* do cleanup and restore the parent team */
2625   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2626   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2627
2628   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2629
2630   /* jc: The following lock has instructions with REL and ACQ semantics,
2631      separating the parallel user code called in this parallel region
2632      from the serial user code called after this function returns. */
2633   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2634
2635   if (!master_th->th.th_teams_microtask ||
2636       team->t.t_level > master_th->th.th_teams_level) {
2637     /* Decrement our nested depth level */
2638     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2639   }
2640   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2641
2642 #if OMPT_SUPPORT
2643   if (ompt_enabled.enabled) {
2644     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2645     if (ompt_enabled.ompt_callback_implicit_task) {
2646       int flags = (team_microtask == (void *)__kmp_teams_master)
2647                       ? ompt_task_initial
2648                       : ompt_task_implicit;
2649       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2650       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2651           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2652           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2653     }
2654     task_info->frame.exit_frame = ompt_data_none;
2655     task_info->task_data = ompt_data_none;
2656   }
2657 #endif
2658
2659   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2660                 master_th, team));
2661   __kmp_pop_current_task_from_thread(master_th);
2662
2663   master_th->th.th_def_allocator = team->t.t_def_allocator;
2664
2665 #if OMPD_SUPPORT
2666   if (ompd_state & OMPD_ENABLE_BP)
2667     ompd_bp_parallel_end();
2668 #endif
2669   updateHWFPControl(team);
2670
2671   if (root->r.r_active != master_active)
2672     root->r.r_active = master_active;
2673
2674   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2675                             master_th)); // this will free worker threads
2676
2677   /* this race was fun to find. make sure the following is in the critical
2678      region otherwise assertions may fail occasionally since the old team may be
2679      reallocated and the hierarchy appears inconsistent. it is actually safe to
2680      run and won't cause any bugs, but will cause those assertion failures. it's
2681      only one deref&assign so might as well put this in the critical region */
2682   master_th->th.th_team = parent_team;
2683   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2684   master_th->th.th_team_master = parent_team->t.t_threads[0];
2685   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2686
2687   /* restore serialized team, if need be */
2688   if (parent_team->t.t_serialized &&
2689       parent_team != master_th->th.th_serial_team &&
2690       parent_team != root->r.r_root_team) {
2691     __kmp_free_team(root,
2692                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2693     master_th->th.th_serial_team = parent_team;
2694   }
2695
2696   if (__kmp_tasking_mode != tskm_immediate_exec) {
2697     if (master_th->th.th_task_state_top >
2698         0) { // Restore task state from memo stack
2699       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2700       // Remember primary thread's state if we re-use this nested hot team
2701       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2702           master_th->th.th_task_state;
2703       --master_th->th.th_task_state_top; // pop
2704       // Now restore state at this level
2705       master_th->th.th_task_state =
2706           master_th->th
2707               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2708     } else if (team != root->r.r_hot_team) {
2709       // Reset the task state of primary thread if we are not hot team because
2710       // in this case all the worker threads will be free, and their task state
2711       // will be reset. If not reset the primary's, the task state will be
2712       // inconsistent.
2713       master_th->th.th_task_state = 0;
2714     }
2715     // Copy the task team from the parent team to the primary thread
2716     master_th->th.th_task_team =
2717         parent_team->t.t_task_team[master_th->th.th_task_state];
2718     KA_TRACE(20,
2719              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2720               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2721               parent_team));
2722   }
2723
2724   // TODO: GEH - cannot do this assertion because root thread not set up as
2725   // executing
2726   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2727   master_th->th.th_current_task->td_flags.executing = 1;
2728
2729   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2730
2731 #if KMP_AFFINITY_SUPPORTED
2732   if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2733     __kmp_reset_root_init_mask(gtid);
2734   }
2735 #endif
2736 #if OMPT_SUPPORT
2737   int flags =
2738       OMPT_INVOKER(fork_context) |
2739       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2740                                                       : ompt_parallel_team);
2741   if (ompt_enabled.enabled) {
2742     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2743                     codeptr);
2744   }
2745 #endif
2746
2747   KMP_MB();
2748   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2749 }
2750
2751 /* Check whether we should push an internal control record onto the
2752    serial team stack.  If so, do it.  */
2753 void __kmp_save_internal_controls(kmp_info_t *thread) {
2754
2755   if (thread->th.th_team != thread->th.th_serial_team) {
2756     return;
2757   }
2758   if (thread->th.th_team->t.t_serialized > 1) {
2759     int push = 0;
2760
2761     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2762       push = 1;
2763     } else {
2764       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2765           thread->th.th_team->t.t_serialized) {
2766         push = 1;
2767       }
2768     }
2769     if (push) { /* push a record on the serial team's stack */
2770       kmp_internal_control_t *control =
2771           (kmp_internal_control_t *)__kmp_allocate(
2772               sizeof(kmp_internal_control_t));
2773
2774       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2775
2776       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2777
2778       control->next = thread->th.th_team->t.t_control_stack_top;
2779       thread->th.th_team->t.t_control_stack_top = control;
2780     }
2781   }
2782 }
2783
2784 /* Changes set_nproc */
2785 void __kmp_set_num_threads(int new_nth, int gtid) {
2786   kmp_info_t *thread;
2787   kmp_root_t *root;
2788
2789   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2790   KMP_DEBUG_ASSERT(__kmp_init_serial);
2791
2792   if (new_nth < 1)
2793     new_nth = 1;
2794   else if (new_nth > __kmp_max_nth)
2795     new_nth = __kmp_max_nth;
2796
2797   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2798   thread = __kmp_threads[gtid];
2799   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2800     return; // nothing to do
2801
2802   __kmp_save_internal_controls(thread);
2803
2804   set__nproc(thread, new_nth);
2805
2806   // If this omp_set_num_threads() call will cause the hot team size to be
2807   // reduced (in the absence of a num_threads clause), then reduce it now,
2808   // rather than waiting for the next parallel region.
2809   root = thread->th.th_root;
2810   if (__kmp_init_parallel && (!root->r.r_active) &&
2811       (root->r.r_hot_team->t.t_nproc > new_nth)
2812 #if KMP_NESTED_HOT_TEAMS
2813       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2814 #endif
2815   ) {
2816     kmp_team_t *hot_team = root->r.r_hot_team;
2817     int f;
2818
2819     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2820
2821     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2822       __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2823     }
2824     // Release the extra threads we don't need any more.
2825     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2826       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2827       if (__kmp_tasking_mode != tskm_immediate_exec) {
2828         // When decreasing team size, threads no longer in the team should unref
2829         // task team.
2830         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2831       }
2832       __kmp_free_thread(hot_team->t.t_threads[f]);
2833       hot_team->t.t_threads[f] = NULL;
2834     }
2835     hot_team->t.t_nproc = new_nth;
2836 #if KMP_NESTED_HOT_TEAMS
2837     if (thread->th.th_hot_teams) {
2838       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2839       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2840     }
2841 #endif
2842
2843     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2844       hot_team->t.b->update_num_threads(new_nth);
2845       __kmp_add_threads_to_team(hot_team, new_nth);
2846     }
2847
2848     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2849
2850     // Update the t_nproc field in the threads that are still active.
2851     for (f = 0; f < new_nth; f++) {
2852       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2853       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2854     }
2855     // Special flag in case omp_set_num_threads() call
2856     hot_team->t.t_size_changed = -1;
2857   }
2858 }
2859
2860 /* Changes max_active_levels */
2861 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2862   kmp_info_t *thread;
2863
2864   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2865                 "%d = (%d)\n",
2866                 gtid, max_active_levels));
2867   KMP_DEBUG_ASSERT(__kmp_init_serial);
2868
2869   // validate max_active_levels
2870   if (max_active_levels < 0) {
2871     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2872     // We ignore this call if the user has specified a negative value.
2873     // The current setting won't be changed. The last valid setting will be
2874     // used. A warning will be issued (if warnings are allowed as controlled by
2875     // the KMP_WARNINGS env var).
2876     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2877                   "max_active_levels for thread %d = (%d)\n",
2878                   gtid, max_active_levels));
2879     return;
2880   }
2881   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2882     // it's OK, the max_active_levels is within the valid range: [ 0;
2883     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2884     // We allow a zero value. (implementation defined behavior)
2885   } else {
2886     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2887                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2888     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2889     // Current upper limit is MAX_INT. (implementation defined behavior)
2890     // If the input exceeds the upper limit, we correct the input to be the
2891     // upper limit. (implementation defined behavior)
2892     // Actually, the flow should never get here until we use MAX_INT limit.
2893   }
2894   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2895                 "max_active_levels for thread %d = (%d)\n",
2896                 gtid, max_active_levels));
2897
2898   thread = __kmp_threads[gtid];
2899
2900   __kmp_save_internal_controls(thread);
2901
2902   set__max_active_levels(thread, max_active_levels);
2903 }
2904
2905 /* Gets max_active_levels */
2906 int __kmp_get_max_active_levels(int gtid) {
2907   kmp_info_t *thread;
2908
2909   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2910   KMP_DEBUG_ASSERT(__kmp_init_serial);
2911
2912   thread = __kmp_threads[gtid];
2913   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2914   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2915                 "curtask_maxaclevel=%d\n",
2916                 gtid, thread->th.th_current_task,
2917                 thread->th.th_current_task->td_icvs.max_active_levels));
2918   return thread->th.th_current_task->td_icvs.max_active_levels;
2919 }
2920
2921 // nteams-var per-device ICV
2922 void __kmp_set_num_teams(int num_teams) {
2923   if (num_teams > 0)
2924     __kmp_nteams = num_teams;
2925 }
2926 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2927 // teams-thread-limit-var per-device ICV
2928 void __kmp_set_teams_thread_limit(int limit) {
2929   if (limit > 0)
2930     __kmp_teams_thread_limit = limit;
2931 }
2932 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2933
2934 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2935 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2936
2937 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2938 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2939   kmp_info_t *thread;
2940   kmp_sched_t orig_kind;
2941   //    kmp_team_t *team;
2942
2943   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2944                 gtid, (int)kind, chunk));
2945   KMP_DEBUG_ASSERT(__kmp_init_serial);
2946
2947   // Check if the kind parameter is valid, correct if needed.
2948   // Valid parameters should fit in one of two intervals - standard or extended:
2949   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2950   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2951   orig_kind = kind;
2952   kind = __kmp_sched_without_mods(kind);
2953
2954   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2955       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2956     // TODO: Hint needs attention in case we change the default schedule.
2957     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2958               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2959               __kmp_msg_null);
2960     kind = kmp_sched_default;
2961     chunk = 0; // ignore chunk value in case of bad kind
2962   }
2963
2964   thread = __kmp_threads[gtid];
2965
2966   __kmp_save_internal_controls(thread);
2967
2968   if (kind < kmp_sched_upper_std) {
2969     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2970       // differ static chunked vs. unchunked:  chunk should be invalid to
2971       // indicate unchunked schedule (which is the default)
2972       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2973     } else {
2974       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2975           __kmp_sch_map[kind - kmp_sched_lower - 1];
2976     }
2977   } else {
2978     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2979     //    kmp_sched_lower - 2 ];
2980     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2981         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2982                       kmp_sched_lower - 2];
2983   }
2984   __kmp_sched_apply_mods_intkind(
2985       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2986   if (kind == kmp_sched_auto || chunk < 1) {
2987     // ignore parameter chunk for schedule auto
2988     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2989   } else {
2990     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2991   }
2992 }
2993
2994 /* Gets def_sched_var ICV values */
2995 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2996   kmp_info_t *thread;
2997   enum sched_type th_type;
2998
2999   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3000   KMP_DEBUG_ASSERT(__kmp_init_serial);
3001
3002   thread = __kmp_threads[gtid];
3003
3004   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3005   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3006   case kmp_sch_static:
3007   case kmp_sch_static_greedy:
3008   case kmp_sch_static_balanced:
3009     *kind = kmp_sched_static;
3010     __kmp_sched_apply_mods_stdkind(kind, th_type);
3011     *chunk = 0; // chunk was not set, try to show this fact via zero value
3012     return;
3013   case kmp_sch_static_chunked:
3014     *kind = kmp_sched_static;
3015     break;
3016   case kmp_sch_dynamic_chunked:
3017     *kind = kmp_sched_dynamic;
3018     break;
3019   case kmp_sch_guided_chunked:
3020   case kmp_sch_guided_iterative_chunked:
3021   case kmp_sch_guided_analytical_chunked:
3022     *kind = kmp_sched_guided;
3023     break;
3024   case kmp_sch_auto:
3025     *kind = kmp_sched_auto;
3026     break;
3027   case kmp_sch_trapezoidal:
3028     *kind = kmp_sched_trapezoidal;
3029     break;
3030 #if KMP_STATIC_STEAL_ENABLED
3031   case kmp_sch_static_steal:
3032     *kind = kmp_sched_static_steal;
3033     break;
3034 #endif
3035   default:
3036     KMP_FATAL(UnknownSchedulingType, th_type);
3037   }
3038
3039   __kmp_sched_apply_mods_stdkind(kind, th_type);
3040   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3041 }
3042
3043 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3044
3045   int ii, dd;
3046   kmp_team_t *team;
3047   kmp_info_t *thr;
3048
3049   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3050   KMP_DEBUG_ASSERT(__kmp_init_serial);
3051
3052   // validate level
3053   if (level == 0)
3054     return 0;
3055   if (level < 0)
3056     return -1;
3057   thr = __kmp_threads[gtid];
3058   team = thr->th.th_team;
3059   ii = team->t.t_level;
3060   if (level > ii)
3061     return -1;
3062
3063   if (thr->th.th_teams_microtask) {
3064     // AC: we are in teams region where multiple nested teams have same level
3065     int tlevel = thr->th.th_teams_level; // the level of the teams construct
3066     if (level <=
3067         tlevel) { // otherwise usual algorithm works (will not touch the teams)
3068       KMP_DEBUG_ASSERT(ii >= tlevel);
3069       // AC: As we need to pass by the teams league, we need to artificially
3070       // increase ii
3071       if (ii == tlevel) {
3072         ii += 2; // three teams have same level
3073       } else {
3074         ii++; // two teams have same level
3075       }
3076     }
3077   }
3078
3079   if (ii == level)
3080     return __kmp_tid_from_gtid(gtid);
3081
3082   dd = team->t.t_serialized;
3083   level++;
3084   while (ii > level) {
3085     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3086     }
3087     if ((team->t.t_serialized) && (!dd)) {
3088       team = team->t.t_parent;
3089       continue;
3090     }
3091     if (ii > level) {
3092       team = team->t.t_parent;
3093       dd = team->t.t_serialized;
3094       ii--;
3095     }
3096   }
3097
3098   return (dd > 1) ? (0) : (team->t.t_master_tid);
3099 }
3100
3101 int __kmp_get_team_size(int gtid, int level) {
3102
3103   int ii, dd;
3104   kmp_team_t *team;
3105   kmp_info_t *thr;
3106
3107   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3108   KMP_DEBUG_ASSERT(__kmp_init_serial);
3109
3110   // validate level
3111   if (level == 0)
3112     return 1;
3113   if (level < 0)
3114     return -1;
3115   thr = __kmp_threads[gtid];
3116   team = thr->th.th_team;
3117   ii = team->t.t_level;
3118   if (level > ii)
3119     return -1;
3120
3121   if (thr->th.th_teams_microtask) {
3122     // AC: we are in teams region where multiple nested teams have same level
3123     int tlevel = thr->th.th_teams_level; // the level of the teams construct
3124     if (level <=
3125         tlevel) { // otherwise usual algorithm works (will not touch the teams)
3126       KMP_DEBUG_ASSERT(ii >= tlevel);
3127       // AC: As we need to pass by the teams league, we need to artificially
3128       // increase ii
3129       if (ii == tlevel) {
3130         ii += 2; // three teams have same level
3131       } else {
3132         ii++; // two teams have same level
3133       }
3134     }
3135   }
3136
3137   while (ii > level) {
3138     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3139     }
3140     if (team->t.t_serialized && (!dd)) {
3141       team = team->t.t_parent;
3142       continue;
3143     }
3144     if (ii > level) {
3145       team = team->t.t_parent;
3146       ii--;
3147     }
3148   }
3149
3150   return team->t.t_nproc;
3151 }
3152
3153 kmp_r_sched_t __kmp_get_schedule_global() {
3154   // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3155   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3156   // independently. So one can get the updated schedule here.
3157
3158   kmp_r_sched_t r_sched;
3159
3160   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3161   // __kmp_guided. __kmp_sched should keep original value, so that user can set
3162   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3163   // different roots (even in OMP 2.5)
3164   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3165   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3166   if (s == kmp_sch_static) {
3167     // replace STATIC with more detailed schedule (balanced or greedy)
3168     r_sched.r_sched_type = __kmp_static;
3169   } else if (s == kmp_sch_guided_chunked) {
3170     // replace GUIDED with more detailed schedule (iterative or analytical)
3171     r_sched.r_sched_type = __kmp_guided;
3172   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3173     r_sched.r_sched_type = __kmp_sched;
3174   }
3175   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3176
3177   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3178     // __kmp_chunk may be wrong here (if it was not ever set)
3179     r_sched.chunk = KMP_DEFAULT_CHUNK;
3180   } else {
3181     r_sched.chunk = __kmp_chunk;
3182   }
3183
3184   return r_sched;
3185 }
3186
3187 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3188    at least argc number of *t_argv entries for the requested team. */
3189 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3190
3191   KMP_DEBUG_ASSERT(team);
3192   if (!realloc || argc > team->t.t_max_argc) {
3193
3194     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3195                    "current entries=%d\n",
3196                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3197     /* if previously allocated heap space for args, free them */
3198     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3199       __kmp_free((void *)team->t.t_argv);
3200
3201     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3202       /* use unused space in the cache line for arguments */
3203       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3204       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3205                      "argv entries\n",
3206                      team->t.t_id, team->t.t_max_argc));
3207       team->t.t_argv = &team->t.t_inline_argv[0];
3208       if (__kmp_storage_map) {
3209         __kmp_print_storage_map_gtid(
3210             -1, &team->t.t_inline_argv[0],
3211             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3212             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3213             team->t.t_id);
3214       }
3215     } else {
3216       /* allocate space for arguments in the heap */
3217       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3218                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3219                                : 2 * argc;
3220       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3221                      "argv entries\n",
3222                      team->t.t_id, team->t.t_max_argc));
3223       team->t.t_argv =
3224           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3225       if (__kmp_storage_map) {
3226         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3227                                      &team->t.t_argv[team->t.t_max_argc],
3228                                      sizeof(void *) * team->t.t_max_argc,
3229                                      "team_%d.t_argv", team->t.t_id);
3230       }
3231     }
3232   }
3233 }
3234
3235 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3236   int i;
3237   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3238   team->t.t_threads =
3239       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3240   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3241       sizeof(dispatch_shared_info_t) * num_disp_buff);
3242   team->t.t_dispatch =
3243       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3244   team->t.t_implicit_task_taskdata =
3245       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3246   team->t.t_max_nproc = max_nth;
3247
3248   /* setup dispatch buffers */
3249   for (i = 0; i < num_disp_buff; ++i) {
3250     team->t.t_disp_buffer[i].buffer_index = i;
3251     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3252   }
3253 }
3254
3255 static void __kmp_free_team_arrays(kmp_team_t *team) {
3256   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3257   int i;
3258   for (i = 0; i < team->t.t_max_nproc; ++i) {
3259     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3260       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3261       team->t.t_dispatch[i].th_disp_buffer = NULL;
3262     }
3263   }
3264 #if KMP_USE_HIER_SCHED
3265   __kmp_dispatch_free_hierarchies(team);
3266 #endif
3267   __kmp_free(team->t.t_threads);
3268   __kmp_free(team->t.t_disp_buffer);
3269   __kmp_free(team->t.t_dispatch);
3270   __kmp_free(team->t.t_implicit_task_taskdata);
3271   team->t.t_threads = NULL;
3272   team->t.t_disp_buffer = NULL;
3273   team->t.t_dispatch = NULL;
3274   team->t.t_implicit_task_taskdata = 0;
3275 }
3276
3277 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3278   kmp_info_t **oldThreads = team->t.t_threads;
3279
3280   __kmp_free(team->t.t_disp_buffer);
3281   __kmp_free(team->t.t_dispatch);
3282   __kmp_free(team->t.t_implicit_task_taskdata);
3283   __kmp_allocate_team_arrays(team, max_nth);
3284
3285   KMP_MEMCPY(team->t.t_threads, oldThreads,
3286              team->t.t_nproc * sizeof(kmp_info_t *));
3287
3288   __kmp_free(oldThreads);
3289 }
3290
3291 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3292
3293   kmp_r_sched_t r_sched =
3294       __kmp_get_schedule_global(); // get current state of scheduling globals
3295
3296   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3297
3298   kmp_internal_control_t g_icvs = {
3299     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3300     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3301     // adjustment of threads (per thread)
3302     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3303     // whether blocktime is explicitly set
3304     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3305 #if KMP_USE_MONITOR
3306     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3307 // intervals
3308 #endif
3309     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3310     // next parallel region (per thread)
3311     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3312     __kmp_cg_max_nth, // int thread_limit;
3313     __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3314     // on task. This is used in the case of target thread_limit
3315     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3316     // for max_active_levels
3317     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3318     // {sched,chunk} pair
3319     __kmp_nested_proc_bind.bind_types[0],
3320     __kmp_default_device,
3321     NULL // struct kmp_internal_control *next;
3322   };
3323
3324   return g_icvs;
3325 }
3326
3327 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3328
3329   kmp_internal_control_t gx_icvs;
3330   gx_icvs.serial_nesting_level =
3331       0; // probably =team->t.t_serial like in save_inter_controls
3332   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3333   gx_icvs.next = NULL;
3334
3335   return gx_icvs;
3336 }
3337
3338 static void __kmp_initialize_root(kmp_root_t *root) {
3339   int f;
3340   kmp_team_t *root_team;
3341   kmp_team_t *hot_team;
3342   int hot_team_max_nth;
3343   kmp_r_sched_t r_sched =
3344       __kmp_get_schedule_global(); // get current state of scheduling globals
3345   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3346   KMP_DEBUG_ASSERT(root);
3347   KMP_ASSERT(!root->r.r_begin);
3348
3349   /* setup the root state structure */
3350   __kmp_init_lock(&root->r.r_begin_lock);
3351   root->r.r_begin = FALSE;
3352   root->r.r_active = FALSE;
3353   root->r.r_in_parallel = 0;
3354   root->r.r_blocktime = __kmp_dflt_blocktime;
3355 #if KMP_AFFINITY_SUPPORTED
3356   root->r.r_affinity_assigned = FALSE;
3357 #endif
3358
3359   /* setup the root team for this task */
3360   /* allocate the root team structure */
3361   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3362
3363   root_team =
3364       __kmp_allocate_team(root,
3365                           1, // new_nproc
3366                           1, // max_nproc
3367 #if OMPT_SUPPORT
3368                           ompt_data_none, // root parallel id
3369 #endif
3370                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3371                           0 // argc
3372                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3373       );
3374 #if USE_DEBUGGER
3375   // Non-NULL value should be assigned to make the debugger display the root
3376   // team.
3377   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3378 #endif
3379
3380   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3381
3382   root->r.r_root_team = root_team;
3383   root_team->t.t_control_stack_top = NULL;
3384
3385   /* initialize root team */
3386   root_team->t.t_threads[0] = NULL;
3387   root_team->t.t_nproc = 1;
3388   root_team->t.t_serialized = 1;
3389   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3390   root_team->t.t_sched.sched = r_sched.sched;
3391   KA_TRACE(
3392       20,
3393       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3394        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3395
3396   /* setup the  hot team for this task */
3397   /* allocate the hot team structure */
3398   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3399
3400   hot_team =
3401       __kmp_allocate_team(root,
3402                           1, // new_nproc
3403                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3404 #if OMPT_SUPPORT
3405                           ompt_data_none, // root parallel id
3406 #endif
3407                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3408                           0 // argc
3409                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3410       );
3411   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3412
3413   root->r.r_hot_team = hot_team;
3414   root_team->t.t_control_stack_top = NULL;
3415
3416   /* first-time initialization */
3417   hot_team->t.t_parent = root_team;
3418
3419   /* initialize hot team */
3420   hot_team_max_nth = hot_team->t.t_max_nproc;
3421   for (f = 0; f < hot_team_max_nth; ++f) {
3422     hot_team->t.t_threads[f] = NULL;
3423   }
3424   hot_team->t.t_nproc = 1;
3425   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3426   hot_team->t.t_sched.sched = r_sched.sched;
3427   hot_team->t.t_size_changed = 0;
3428 }
3429
3430 #ifdef KMP_DEBUG
3431
3432 typedef struct kmp_team_list_item {
3433   kmp_team_p const *entry;
3434   struct kmp_team_list_item *next;
3435 } kmp_team_list_item_t;
3436 typedef kmp_team_list_item_t *kmp_team_list_t;
3437
3438 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3439     kmp_team_list_t list, // List of teams.
3440     kmp_team_p const *team // Team to add.
3441 ) {
3442
3443   // List must terminate with item where both entry and next are NULL.
3444   // Team is added to the list only once.
3445   // List is sorted in ascending order by team id.
3446   // Team id is *not* a key.
3447
3448   kmp_team_list_t l;
3449
3450   KMP_DEBUG_ASSERT(list != NULL);
3451   if (team == NULL) {
3452     return;
3453   }
3454
3455   __kmp_print_structure_team_accum(list, team->t.t_parent);
3456   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3457
3458   // Search list for the team.
3459   l = list;
3460   while (l->next != NULL && l->entry != team) {
3461     l = l->next;
3462   }
3463   if (l->next != NULL) {
3464     return; // Team has been added before, exit.
3465   }
3466
3467   // Team is not found. Search list again for insertion point.
3468   l = list;
3469   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3470     l = l->next;
3471   }
3472
3473   // Insert team.
3474   {
3475     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3476         sizeof(kmp_team_list_item_t));
3477     *item = *l;
3478     l->entry = team;
3479     l->next = item;
3480   }
3481 }
3482
3483 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3484
3485 ) {
3486   __kmp_printf("%s", title);
3487   if (team != NULL) {
3488     __kmp_printf("%2x %p\n", team->t.t_id, team);
3489   } else {
3490     __kmp_printf(" - (nil)\n");
3491   }
3492 }
3493
3494 static void __kmp_print_structure_thread(char const *title,
3495                                          kmp_info_p const *thread) {
3496   __kmp_printf("%s", title);
3497   if (thread != NULL) {
3498     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3499   } else {
3500     __kmp_printf(" - (nil)\n");
3501   }
3502 }
3503
3504 void __kmp_print_structure(void) {
3505
3506   kmp_team_list_t list;
3507
3508   // Initialize list of teams.
3509   list =
3510       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3511   list->entry = NULL;
3512   list->next = NULL;
3513
3514   __kmp_printf("\n------------------------------\nGlobal Thread "
3515                "Table\n------------------------------\n");
3516   {
3517     int gtid;
3518     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3519       __kmp_printf("%2d", gtid);
3520       if (__kmp_threads != NULL) {
3521         __kmp_printf(" %p", __kmp_threads[gtid]);
3522       }
3523       if (__kmp_root != NULL) {
3524         __kmp_printf(" %p", __kmp_root[gtid]);
3525       }
3526       __kmp_printf("\n");
3527     }
3528   }
3529
3530   // Print out __kmp_threads array.
3531   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3532                "----------\n");
3533   if (__kmp_threads != NULL) {
3534     int gtid;
3535     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3536       kmp_info_t const *thread = __kmp_threads[gtid];
3537       if (thread != NULL) {
3538         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3539         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3540         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3541         __kmp_print_structure_team("    Serial Team:  ",
3542                                    thread->th.th_serial_team);
3543         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3544         __kmp_print_structure_thread("    Primary:      ",
3545                                      thread->th.th_team_master);
3546         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3547         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3548         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3549         __kmp_print_structure_thread("    Next in pool: ",
3550                                      thread->th.th_next_pool);
3551         __kmp_printf("\n");
3552         __kmp_print_structure_team_accum(list, thread->th.th_team);
3553         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3554       }
3555     }
3556   } else {
3557     __kmp_printf("Threads array is not allocated.\n");
3558   }
3559
3560   // Print out __kmp_root array.
3561   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3562                "--------\n");
3563   if (__kmp_root != NULL) {
3564     int gtid;
3565     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3566       kmp_root_t const *root = __kmp_root[gtid];
3567       if (root != NULL) {
3568         __kmp_printf("GTID %2d %p:\n", gtid, root);
3569         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3570         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3571         __kmp_print_structure_thread("    Uber Thread:  ",
3572                                      root->r.r_uber_thread);
3573         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3574         __kmp_printf("    In Parallel:  %2d\n",
3575                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3576         __kmp_printf("\n");
3577         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3578         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3579       }
3580     }
3581   } else {
3582     __kmp_printf("Ubers array is not allocated.\n");
3583   }
3584
3585   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3586                "--------\n");
3587   while (list->next != NULL) {
3588     kmp_team_p const *team = list->entry;
3589     int i;
3590     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3591     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3592     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3593     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3594     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3595     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3596     for (i = 0; i < team->t.t_nproc; ++i) {
3597       __kmp_printf("    Thread %2d:      ", i);
3598       __kmp_print_structure_thread("", team->t.t_threads[i]);
3599     }
3600     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3601     __kmp_printf("\n");
3602     list = list->next;
3603   }
3604
3605   // Print out __kmp_thread_pool and __kmp_team_pool.
3606   __kmp_printf("\n------------------------------\nPools\n----------------------"
3607                "--------\n");
3608   __kmp_print_structure_thread("Thread pool:          ",
3609                                CCAST(kmp_info_t *, __kmp_thread_pool));
3610   __kmp_print_structure_team("Team pool:            ",
3611                              CCAST(kmp_team_t *, __kmp_team_pool));
3612   __kmp_printf("\n");
3613
3614   // Free team list.
3615   while (list != NULL) {
3616     kmp_team_list_item_t *item = list;
3617     list = list->next;
3618     KMP_INTERNAL_FREE(item);
3619   }
3620 }
3621
3622 #endif
3623
3624 //---------------------------------------------------------------------------
3625 //  Stuff for per-thread fast random number generator
3626 //  Table of primes
3627 static const unsigned __kmp_primes[] = {
3628     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3629     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3630     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3631     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3632     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3633     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3634     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3635     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3636     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3637     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3638     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3639
3640 //---------------------------------------------------------------------------
3641 //  __kmp_get_random: Get a random number using a linear congruential method.
3642 unsigned short __kmp_get_random(kmp_info_t *thread) {
3643   unsigned x = thread->th.th_x;
3644   unsigned short r = (unsigned short)(x >> 16);
3645
3646   thread->th.th_x = x * thread->th.th_a + 1;
3647
3648   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3649                 thread->th.th_info.ds.ds_tid, r));
3650
3651   return r;
3652 }
3653 //--------------------------------------------------------
3654 // __kmp_init_random: Initialize a random number generator
3655 void __kmp_init_random(kmp_info_t *thread) {
3656   unsigned seed = thread->th.th_info.ds.ds_tid;
3657
3658   thread->th.th_a =
3659       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3660   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3661   KA_TRACE(30,
3662            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3663 }
3664
3665 #if KMP_OS_WINDOWS
3666 /* reclaim array entries for root threads that are already dead, returns number
3667  * reclaimed */
3668 static int __kmp_reclaim_dead_roots(void) {
3669   int i, r = 0;
3670
3671   for (i = 0; i < __kmp_threads_capacity; ++i) {
3672     if (KMP_UBER_GTID(i) &&
3673         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3674         !__kmp_root[i]
3675              ->r.r_active) { // AC: reclaim only roots died in non-active state
3676       r += __kmp_unregister_root_other_thread(i);
3677     }
3678   }
3679   return r;
3680 }
3681 #endif
3682
3683 /* This function attempts to create free entries in __kmp_threads and
3684    __kmp_root, and returns the number of free entries generated.
3685
3686    For Windows* OS static library, the first mechanism used is to reclaim array
3687    entries for root threads that are already dead.
3688
3689    On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3690    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3691    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3692    threadprivate cache array has been created. Synchronization with
3693    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3694
3695    After any dead root reclamation, if the clipping value allows array expansion
3696    to result in the generation of a total of nNeed free slots, the function does
3697    that expansion. If not, nothing is done beyond the possible initial root
3698    thread reclamation.
3699
3700    If any argument is negative, the behavior is undefined. */
3701 static int __kmp_expand_threads(int nNeed) {
3702   int added = 0;
3703   int minimumRequiredCapacity;
3704   int newCapacity;
3705   kmp_info_t **newThreads;
3706   kmp_root_t **newRoot;
3707
3708   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3709   // resizing __kmp_threads does not need additional protection if foreign
3710   // threads are present
3711
3712 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3713   /* only for Windows static library */
3714   /* reclaim array entries for root threads that are already dead */
3715   added = __kmp_reclaim_dead_roots();
3716
3717   if (nNeed) {
3718     nNeed -= added;
3719     if (nNeed < 0)
3720       nNeed = 0;
3721   }
3722 #endif
3723   if (nNeed <= 0)
3724     return added;
3725
3726   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3727   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3728   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3729   // > __kmp_max_nth in one of two ways:
3730   //
3731   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3732   //    may not be reused by another thread, so we may need to increase
3733   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3734   //
3735   // 2) New foreign root(s) are encountered.  We always register new foreign
3736   //    roots. This may cause a smaller # of threads to be allocated at
3737   //    subsequent parallel regions, but the worker threads hang around (and
3738   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3739   //
3740   // Anyway, that is the reason for moving the check to see if
3741   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3742   // instead of having it performed here. -BB
3743
3744   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3745
3746   /* compute expansion headroom to check if we can expand */
3747   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3748     /* possible expansion too small -- give up */
3749     return added;
3750   }
3751   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3752
3753   newCapacity = __kmp_threads_capacity;
3754   do {
3755     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3756                                                           : __kmp_sys_max_nth;
3757   } while (newCapacity < minimumRequiredCapacity);
3758   newThreads = (kmp_info_t **)__kmp_allocate(
3759       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3760   newRoot =
3761       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3762   KMP_MEMCPY(newThreads, __kmp_threads,
3763              __kmp_threads_capacity * sizeof(kmp_info_t *));
3764   KMP_MEMCPY(newRoot, __kmp_root,
3765              __kmp_threads_capacity * sizeof(kmp_root_t *));
3766   // Put old __kmp_threads array on a list. Any ongoing references to the old
3767   // list will be valid. This list is cleaned up at library shutdown.
3768   kmp_old_threads_list_t *node =
3769       (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3770   node->threads = __kmp_threads;
3771   node->next = __kmp_old_threads_list;
3772   __kmp_old_threads_list = node;
3773
3774   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3775   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3776   added += newCapacity - __kmp_threads_capacity;
3777   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3778
3779   if (newCapacity > __kmp_tp_capacity) {
3780     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3781     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3782       __kmp_threadprivate_resize_cache(newCapacity);
3783     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3784       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3785     }
3786     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3787   }
3788
3789   return added;
3790 }
3791
3792 /* Register the current thread as a root thread and obtain our gtid. We must
3793    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3794    thread that calls from __kmp_do_serial_initialize() */
3795 int __kmp_register_root(int initial_thread) {
3796   kmp_info_t *root_thread;
3797   kmp_root_t *root;
3798   int gtid;
3799   int capacity;
3800   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3801   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3802   KMP_MB();
3803
3804   /* 2007-03-02:
3805      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3806      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3807      work as expected -- it may return false (that means there is at least one
3808      empty slot in __kmp_threads array), but it is possible the only free slot
3809      is #0, which is reserved for initial thread and so cannot be used for this
3810      one. Following code workarounds this bug.
3811
3812      However, right solution seems to be not reserving slot #0 for initial
3813      thread because:
3814      (1) there is no magic in slot #0,
3815      (2) we cannot detect initial thread reliably (the first thread which does
3816         serial initialization may be not a real initial thread).
3817   */
3818   capacity = __kmp_threads_capacity;
3819   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3820     --capacity;
3821   }
3822
3823   // If it is not for initializing the hidden helper team, we need to take
3824   // __kmp_hidden_helper_threads_num out of the capacity because it is included
3825   // in __kmp_threads_capacity.
3826   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3827     capacity -= __kmp_hidden_helper_threads_num;
3828   }
3829
3830   /* see if there are too many threads */
3831   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3832     if (__kmp_tp_cached) {
3833       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3834                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3835                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3836     } else {
3837       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3838                   __kmp_msg_null);
3839     }
3840   }
3841
3842   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3843   // 0: initial thread, also a regular OpenMP thread.
3844   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3845   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3846   // regular OpenMP threads.
3847   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3848     // Find an available thread slot for hidden helper thread. Slots for hidden
3849     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3850     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3851                    gtid <= __kmp_hidden_helper_threads_num;
3852          gtid++)
3853       ;
3854     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3855     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3856                  "hidden helper thread: T#%d\n",
3857                  gtid));
3858   } else {
3859     /* find an available thread slot */
3860     // Don't reassign the zero slot since we need that to only be used by
3861     // initial thread. Slots for hidden helper threads should also be skipped.
3862     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3863       gtid = 0;
3864     } else {
3865       for (gtid = __kmp_hidden_helper_threads_num + 1;
3866            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3867         ;
3868     }
3869     KA_TRACE(
3870         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3871     KMP_ASSERT(gtid < __kmp_threads_capacity);
3872   }
3873
3874   /* update global accounting */
3875   __kmp_all_nth++;
3876   TCW_4(__kmp_nth, __kmp_nth + 1);
3877
3878   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3879   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3880   if (__kmp_adjust_gtid_mode) {
3881     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3882       if (TCR_4(__kmp_gtid_mode) != 2) {
3883         TCW_4(__kmp_gtid_mode, 2);
3884       }
3885     } else {
3886       if (TCR_4(__kmp_gtid_mode) != 1) {
3887         TCW_4(__kmp_gtid_mode, 1);
3888       }
3889     }
3890   }
3891
3892 #ifdef KMP_ADJUST_BLOCKTIME
3893   /* Adjust blocktime to zero if necessary            */
3894   /* Middle initialization might not have occurred yet */
3895   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3896     if (__kmp_nth > __kmp_avail_proc) {
3897       __kmp_zero_bt = TRUE;
3898     }
3899   }
3900 #endif /* KMP_ADJUST_BLOCKTIME */
3901
3902   /* setup this new hierarchy */
3903   if (!(root = __kmp_root[gtid])) {
3904     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3905     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3906   }
3907
3908 #if KMP_STATS_ENABLED
3909   // Initialize stats as soon as possible (right after gtid assignment).
3910   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3911   __kmp_stats_thread_ptr->startLife();
3912   KMP_SET_THREAD_STATE(SERIAL_REGION);
3913   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3914 #endif
3915   __kmp_initialize_root(root);
3916
3917   /* setup new root thread structure */
3918   if (root->r.r_uber_thread) {
3919     root_thread = root->r.r_uber_thread;
3920   } else {
3921     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3922     if (__kmp_storage_map) {
3923       __kmp_print_thread_storage_map(root_thread, gtid);
3924     }
3925     root_thread->th.th_info.ds.ds_gtid = gtid;
3926 #if OMPT_SUPPORT
3927     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3928 #endif
3929     root_thread->th.th_root = root;
3930     if (__kmp_env_consistency_check) {
3931       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3932     }
3933 #if USE_FAST_MEMORY
3934     __kmp_initialize_fast_memory(root_thread);
3935 #endif /* USE_FAST_MEMORY */
3936
3937 #if KMP_USE_BGET
3938     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3939     __kmp_initialize_bget(root_thread);
3940 #endif
3941     __kmp_init_random(root_thread); // Initialize random number generator
3942   }
3943
3944   /* setup the serial team held in reserve by the root thread */
3945   if (!root_thread->th.th_serial_team) {
3946     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3947     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3948     root_thread->th.th_serial_team = __kmp_allocate_team(
3949         root, 1, 1,
3950 #if OMPT_SUPPORT
3951         ompt_data_none, // root parallel id
3952 #endif
3953         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3954   }
3955   KMP_ASSERT(root_thread->th.th_serial_team);
3956   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3957                 root_thread->th.th_serial_team));
3958
3959   /* drop root_thread into place */
3960   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3961
3962   root->r.r_root_team->t.t_threads[0] = root_thread;
3963   root->r.r_hot_team->t.t_threads[0] = root_thread;
3964   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3965   // AC: the team created in reserve, not for execution (it is unused for now).
3966   root_thread->th.th_serial_team->t.t_serialized = 0;
3967   root->r.r_uber_thread = root_thread;
3968
3969   /* initialize the thread, get it ready to go */
3970   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3971   TCW_4(__kmp_init_gtid, TRUE);
3972
3973   /* prepare the primary thread for get_gtid() */
3974   __kmp_gtid_set_specific(gtid);
3975
3976 #if USE_ITT_BUILD
3977   __kmp_itt_thread_name(gtid);
3978 #endif /* USE_ITT_BUILD */
3979
3980 #ifdef KMP_TDATA_GTID
3981   __kmp_gtid = gtid;
3982 #endif
3983   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3984   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3985
3986   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3987                 "plain=%u\n",
3988                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3989                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3990                 KMP_INIT_BARRIER_STATE));
3991   { // Initialize barrier data.
3992     int b;
3993     for (b = 0; b < bs_last_barrier; ++b) {
3994       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3995 #if USE_DEBUGGER
3996       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3997 #endif
3998     }
3999   }
4000   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4001                    KMP_INIT_BARRIER_STATE);
4002
4003 #if KMP_AFFINITY_SUPPORTED
4004   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4005   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4006   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4007   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4008 #endif /* KMP_AFFINITY_SUPPORTED */
4009   root_thread->th.th_def_allocator = __kmp_def_allocator;
4010   root_thread->th.th_prev_level = 0;
4011   root_thread->th.th_prev_num_threads = 1;
4012
4013   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4014   tmp->cg_root = root_thread;
4015   tmp->cg_thread_limit = __kmp_cg_max_nth;
4016   tmp->cg_nthreads = 1;
4017   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4018                  " cg_nthreads init to 1\n",
4019                  root_thread, tmp));
4020   tmp->up = NULL;
4021   root_thread->th.th_cg_roots = tmp;
4022
4023   __kmp_root_counter++;
4024
4025 #if OMPT_SUPPORT
4026   if (!initial_thread && ompt_enabled.enabled) {
4027
4028     kmp_info_t *root_thread = ompt_get_thread();
4029
4030     ompt_set_thread_state(root_thread, ompt_state_overhead);
4031
4032     if (ompt_enabled.ompt_callback_thread_begin) {
4033       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4034           ompt_thread_initial, __ompt_get_thread_data_internal());
4035     }
4036     ompt_data_t *task_data;
4037     ompt_data_t *parallel_data;
4038     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4039                                   NULL);
4040     if (ompt_enabled.ompt_callback_implicit_task) {
4041       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4042           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4043     }
4044
4045     ompt_set_thread_state(root_thread, ompt_state_work_serial);
4046   }
4047 #endif
4048 #if OMPD_SUPPORT
4049   if (ompd_state & OMPD_ENABLE_BP)
4050     ompd_bp_thread_begin();
4051 #endif
4052
4053   KMP_MB();
4054   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4055
4056   return gtid;
4057 }
4058
4059 #if KMP_NESTED_HOT_TEAMS
4060 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4061                                 const int max_level) {
4062   int i, n, nth;
4063   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4064   if (!hot_teams || !hot_teams[level].hot_team) {
4065     return 0;
4066   }
4067   KMP_DEBUG_ASSERT(level < max_level);
4068   kmp_team_t *team = hot_teams[level].hot_team;
4069   nth = hot_teams[level].hot_team_nth;
4070   n = nth - 1; // primary thread is not freed
4071   if (level < max_level - 1) {
4072     for (i = 0; i < nth; ++i) {
4073       kmp_info_t *th = team->t.t_threads[i];
4074       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4075       if (i > 0 && th->th.th_hot_teams) {
4076         __kmp_free(th->th.th_hot_teams);
4077         th->th.th_hot_teams = NULL;
4078       }
4079     }
4080   }
4081   __kmp_free_team(root, team, NULL);
4082   return n;
4083 }
4084 #endif
4085
4086 // Resets a root thread and clear its root and hot teams.
4087 // Returns the number of __kmp_threads entries directly and indirectly freed.
4088 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4089   kmp_team_t *root_team = root->r.r_root_team;
4090   kmp_team_t *hot_team = root->r.r_hot_team;
4091   int n = hot_team->t.t_nproc;
4092   int i;
4093
4094   KMP_DEBUG_ASSERT(!root->r.r_active);
4095
4096   root->r.r_root_team = NULL;
4097   root->r.r_hot_team = NULL;
4098   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4099   // before call to __kmp_free_team().
4100   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4101 #if KMP_NESTED_HOT_TEAMS
4102   if (__kmp_hot_teams_max_level >
4103       0) { // need to free nested hot teams and their threads if any
4104     for (i = 0; i < hot_team->t.t_nproc; ++i) {
4105       kmp_info_t *th = hot_team->t.t_threads[i];
4106       if (__kmp_hot_teams_max_level > 1) {
4107         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4108       }
4109       if (th->th.th_hot_teams) {
4110         __kmp_free(th->th.th_hot_teams);
4111         th->th.th_hot_teams = NULL;
4112       }
4113     }
4114   }
4115 #endif
4116   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4117
4118   // Before we can reap the thread, we need to make certain that all other
4119   // threads in the teams that had this root as ancestor have stopped trying to
4120   // steal tasks.
4121   if (__kmp_tasking_mode != tskm_immediate_exec) {
4122     __kmp_wait_to_unref_task_teams();
4123   }
4124
4125 #if KMP_OS_WINDOWS
4126   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4127   KA_TRACE(
4128       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4129            "\n",
4130            (LPVOID) & (root->r.r_uber_thread->th),
4131            root->r.r_uber_thread->th.th_info.ds.ds_thread));
4132   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4133 #endif /* KMP_OS_WINDOWS */
4134
4135 #if OMPD_SUPPORT
4136   if (ompd_state & OMPD_ENABLE_BP)
4137     ompd_bp_thread_end();
4138 #endif
4139
4140 #if OMPT_SUPPORT
4141   ompt_data_t *task_data;
4142   ompt_data_t *parallel_data;
4143   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4144                                 NULL);
4145   if (ompt_enabled.ompt_callback_implicit_task) {
4146     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4147         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4148   }
4149   if (ompt_enabled.ompt_callback_thread_end) {
4150     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4151         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4152   }
4153 #endif
4154
4155   TCW_4(__kmp_nth,
4156         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4157   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4158   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4159                  " to %d\n",
4160                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4161                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4162   if (i == 1) {
4163     // need to free contention group structure
4164     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4165                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
4166     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4167     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4168     root->r.r_uber_thread->th.th_cg_roots = NULL;
4169   }
4170   __kmp_reap_thread(root->r.r_uber_thread, 1);
4171
4172   // We canot put root thread to __kmp_thread_pool, so we have to reap it
4173   // instead of freeing.
4174   root->r.r_uber_thread = NULL;
4175   /* mark root as no longer in use */
4176   root->r.r_begin = FALSE;
4177
4178   return n;
4179 }
4180
4181 void __kmp_unregister_root_current_thread(int gtid) {
4182   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4183   /* this lock should be ok, since unregister_root_current_thread is never
4184      called during an abort, only during a normal close. furthermore, if you
4185      have the forkjoin lock, you should never try to get the initz lock */
4186   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4187   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4188     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4189                   "exiting T#%d\n",
4190                   gtid));
4191     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4192     return;
4193   }
4194   kmp_root_t *root = __kmp_root[gtid];
4195
4196   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4197   KMP_ASSERT(KMP_UBER_GTID(gtid));
4198   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4199   KMP_ASSERT(root->r.r_active == FALSE);
4200
4201   KMP_MB();
4202
4203   kmp_info_t *thread = __kmp_threads[gtid];
4204   kmp_team_t *team = thread->th.th_team;
4205   kmp_task_team_t *task_team = thread->th.th_task_team;
4206
4207   // we need to wait for the proxy tasks before finishing the thread
4208   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4209                             task_team->tt.tt_hidden_helper_task_encountered)) {
4210 #if OMPT_SUPPORT
4211     // the runtime is shutting down so we won't report any events
4212     thread->th.ompt_thread_info.state = ompt_state_undefined;
4213 #endif
4214     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4215   }
4216
4217   __kmp_reset_root(gtid, root);
4218
4219   KMP_MB();
4220   KC_TRACE(10,
4221            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4222
4223   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4224 }
4225
4226 #if KMP_OS_WINDOWS
4227 /* __kmp_forkjoin_lock must be already held
4228    Unregisters a root thread that is not the current thread.  Returns the number
4229    of __kmp_threads entries freed as a result. */
4230 static int __kmp_unregister_root_other_thread(int gtid) {
4231   kmp_root_t *root = __kmp_root[gtid];
4232   int r;
4233
4234   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4235   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4236   KMP_ASSERT(KMP_UBER_GTID(gtid));
4237   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4238   KMP_ASSERT(root->r.r_active == FALSE);
4239
4240   r = __kmp_reset_root(gtid, root);
4241   KC_TRACE(10,
4242            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4243   return r;
4244 }
4245 #endif
4246
4247 #if KMP_DEBUG
4248 void __kmp_task_info() {
4249
4250   kmp_int32 gtid = __kmp_entry_gtid();
4251   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4252   kmp_info_t *this_thr = __kmp_threads[gtid];
4253   kmp_team_t *steam = this_thr->th.th_serial_team;
4254   kmp_team_t *team = this_thr->th.th_team;
4255
4256   __kmp_printf(
4257       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4258       "ptask=%p\n",
4259       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4260       team->t.t_implicit_task_taskdata[tid].td_parent);
4261 }
4262 #endif // KMP_DEBUG
4263
4264 /* TODO optimize with one big memclr, take out what isn't needed, split
4265    responsibility to workers as much as possible, and delay initialization of
4266    features as much as possible  */
4267 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4268                                   int tid, int gtid) {
4269   /* this_thr->th.th_info.ds.ds_gtid is setup in
4270      kmp_allocate_thread/create_worker.
4271      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4272   KMP_DEBUG_ASSERT(this_thr != NULL);
4273   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4274   KMP_DEBUG_ASSERT(team);
4275   KMP_DEBUG_ASSERT(team->t.t_threads);
4276   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4277   kmp_info_t *master = team->t.t_threads[0];
4278   KMP_DEBUG_ASSERT(master);
4279   KMP_DEBUG_ASSERT(master->th.th_root);
4280
4281   KMP_MB();
4282
4283   TCW_SYNC_PTR(this_thr->th.th_team, team);
4284
4285   this_thr->th.th_info.ds.ds_tid = tid;
4286   this_thr->th.th_set_nproc = 0;
4287   if (__kmp_tasking_mode != tskm_immediate_exec)
4288     // When tasking is possible, threads are not safe to reap until they are
4289     // done tasking; this will be set when tasking code is exited in wait
4290     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4291   else // no tasking --> always safe to reap
4292     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4293   this_thr->th.th_set_proc_bind = proc_bind_default;
4294 #if KMP_AFFINITY_SUPPORTED
4295   this_thr->th.th_new_place = this_thr->th.th_current_place;
4296 #endif
4297   this_thr->th.th_root = master->th.th_root;
4298
4299   /* setup the thread's cache of the team structure */
4300   this_thr->th.th_team_nproc = team->t.t_nproc;
4301   this_thr->th.th_team_master = master;
4302   this_thr->th.th_team_serialized = team->t.t_serialized;
4303
4304   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4305
4306   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4307                 tid, gtid, this_thr, this_thr->th.th_current_task));
4308
4309   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4310                            team, tid, TRUE);
4311
4312   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4313                 tid, gtid, this_thr, this_thr->th.th_current_task));
4314   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4315   // __kmp_initialize_team()?
4316
4317   /* TODO no worksharing in speculative threads */
4318   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4319
4320   this_thr->th.th_local.this_construct = 0;
4321
4322   if (!this_thr->th.th_pri_common) {
4323     this_thr->th.th_pri_common =
4324         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4325     if (__kmp_storage_map) {
4326       __kmp_print_storage_map_gtid(
4327           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4328           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4329     }
4330     this_thr->th.th_pri_head = NULL;
4331   }
4332
4333   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4334       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4335     // Make new thread's CG root same as primary thread's
4336     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4337     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4338     if (tmp) {
4339       // worker changes CG, need to check if old CG should be freed
4340       int i = tmp->cg_nthreads--;
4341       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4342                      " on node %p of thread %p to %d\n",
4343                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4344       if (i == 1) {
4345         __kmp_free(tmp); // last thread left CG --> free it
4346       }
4347     }
4348     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4349     // Increment new thread's CG root's counter to add the new thread
4350     this_thr->th.th_cg_roots->cg_nthreads++;
4351     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4352                    " node %p of thread %p to %d\n",
4353                    this_thr, this_thr->th.th_cg_roots,
4354                    this_thr->th.th_cg_roots->cg_root,
4355                    this_thr->th.th_cg_roots->cg_nthreads));
4356     this_thr->th.th_current_task->td_icvs.thread_limit =
4357         this_thr->th.th_cg_roots->cg_thread_limit;
4358   }
4359
4360   /* Initialize dynamic dispatch */
4361   {
4362     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4363     // Use team max_nproc since this will never change for the team.
4364     size_t disp_size =
4365         sizeof(dispatch_private_info_t) *
4366         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4367     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4368                   team->t.t_max_nproc));
4369     KMP_ASSERT(dispatch);
4370     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4371     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4372
4373     dispatch->th_disp_index = 0;
4374     dispatch->th_doacross_buf_idx = 0;
4375     if (!dispatch->th_disp_buffer) {
4376       dispatch->th_disp_buffer =
4377           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4378
4379       if (__kmp_storage_map) {
4380         __kmp_print_storage_map_gtid(
4381             gtid, &dispatch->th_disp_buffer[0],
4382             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4383                                           ? 1
4384                                           : __kmp_dispatch_num_buffers],
4385             disp_size,
4386             "th_%d.th_dispatch.th_disp_buffer "
4387             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4388             gtid, team->t.t_id, gtid);
4389       }
4390     } else {
4391       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4392     }
4393
4394     dispatch->th_dispatch_pr_current = 0;
4395     dispatch->th_dispatch_sh_current = 0;
4396
4397     dispatch->th_deo_fcn = 0; /* ORDERED     */
4398     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4399   }
4400
4401   this_thr->th.th_next_pool = NULL;
4402
4403   if (!this_thr->th.th_task_state_memo_stack) {
4404     size_t i;
4405     this_thr->th.th_task_state_memo_stack =
4406         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4407     this_thr->th.th_task_state_top = 0;
4408     this_thr->th.th_task_state_stack_sz = 4;
4409     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4410          ++i) // zero init the stack
4411       this_thr->th.th_task_state_memo_stack[i] = 0;
4412   }
4413
4414   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4415   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4416
4417   KMP_MB();
4418 }
4419
4420 /* allocate a new thread for the requesting team. this is only called from
4421    within a forkjoin critical section. we will first try to get an available
4422    thread from the thread pool. if none is available, we will fork a new one
4423    assuming we are able to create a new one. this should be assured, as the
4424    caller should check on this first. */
4425 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4426                                   int new_tid) {
4427   kmp_team_t *serial_team;
4428   kmp_info_t *new_thr;
4429   int new_gtid;
4430
4431   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4432   KMP_DEBUG_ASSERT(root && team);
4433 #if !KMP_NESTED_HOT_TEAMS
4434   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4435 #endif
4436   KMP_MB();
4437
4438   /* first, try to get one from the thread pool */
4439   if (__kmp_thread_pool) {
4440     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4441     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4442     if (new_thr == __kmp_thread_pool_insert_pt) {
4443       __kmp_thread_pool_insert_pt = NULL;
4444     }
4445     TCW_4(new_thr->th.th_in_pool, FALSE);
4446     __kmp_suspend_initialize_thread(new_thr);
4447     __kmp_lock_suspend_mx(new_thr);
4448     if (new_thr->th.th_active_in_pool == TRUE) {
4449       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4450       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4451       new_thr->th.th_active_in_pool = FALSE;
4452     }
4453     __kmp_unlock_suspend_mx(new_thr);
4454
4455     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4456                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4457     KMP_ASSERT(!new_thr->th.th_team);
4458     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4459
4460     /* setup the thread structure */
4461     __kmp_initialize_info(new_thr, team, new_tid,
4462                           new_thr->th.th_info.ds.ds_gtid);
4463     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4464
4465     TCW_4(__kmp_nth, __kmp_nth + 1);
4466
4467     new_thr->th.th_task_state = 0;
4468     new_thr->th.th_task_state_top = 0;
4469     new_thr->th.th_task_state_stack_sz = 4;
4470
4471     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4472       // Make sure pool thread has transitioned to waiting on own thread struct
4473       KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4474       // Thread activated in __kmp_allocate_team when increasing team size
4475     }
4476
4477 #ifdef KMP_ADJUST_BLOCKTIME
4478     /* Adjust blocktime back to zero if necessary */
4479     /* Middle initialization might not have occurred yet */
4480     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4481       if (__kmp_nth > __kmp_avail_proc) {
4482         __kmp_zero_bt = TRUE;
4483       }
4484     }
4485 #endif /* KMP_ADJUST_BLOCKTIME */
4486
4487 #if KMP_DEBUG
4488     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4489     // KMP_BARRIER_PARENT_FLAG.
4490     int b;
4491     kmp_balign_t *balign = new_thr->th.th_bar;
4492     for (b = 0; b < bs_last_barrier; ++b)
4493       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4494 #endif
4495
4496     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4497                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4498
4499     KMP_MB();
4500     return new_thr;
4501   }
4502
4503   /* no, well fork a new one */
4504   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4505   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4506
4507 #if KMP_USE_MONITOR
4508   // If this is the first worker thread the RTL is creating, then also
4509   // launch the monitor thread.  We try to do this as early as possible.
4510   if (!TCR_4(__kmp_init_monitor)) {
4511     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4512     if (!TCR_4(__kmp_init_monitor)) {
4513       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4514       TCW_4(__kmp_init_monitor, 1);
4515       __kmp_create_monitor(&__kmp_monitor);
4516       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4517 #if KMP_OS_WINDOWS
4518       // AC: wait until monitor has started. This is a fix for CQ232808.
4519       // The reason is that if the library is loaded/unloaded in a loop with
4520       // small (parallel) work in between, then there is high probability that
4521       // monitor thread started after the library shutdown. At shutdown it is
4522       // too late to cope with the problem, because when the primary thread is
4523       // in DllMain (process detach) the monitor has no chances to start (it is
4524       // blocked), and primary thread has no means to inform the monitor that
4525       // the library has gone, because all the memory which the monitor can
4526       // access is going to be released/reset.
4527       while (TCR_4(__kmp_init_monitor) < 2) {
4528         KMP_YIELD(TRUE);
4529       }
4530       KF_TRACE(10, ("after monitor thread has started\n"));
4531 #endif
4532     }
4533     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4534   }
4535 #endif
4536
4537   KMP_MB();
4538
4539   {
4540     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4541                              ? 1
4542                              : __kmp_hidden_helper_threads_num + 1;
4543
4544     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4545          ++new_gtid) {
4546       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4547     }
4548
4549     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4550       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4551     }
4552   }
4553
4554   /* allocate space for it. */
4555   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4556
4557   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4558
4559 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4560   // suppress race conditions detection on synchronization flags in debug mode
4561   // this helps to analyze library internals eliminating false positives
4562   __itt_suppress_mark_range(
4563       __itt_suppress_range, __itt_suppress_threading_errors,
4564       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4565   __itt_suppress_mark_range(
4566       __itt_suppress_range, __itt_suppress_threading_errors,
4567       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4568 #if KMP_OS_WINDOWS
4569   __itt_suppress_mark_range(
4570       __itt_suppress_range, __itt_suppress_threading_errors,
4571       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4572 #else
4573   __itt_suppress_mark_range(__itt_suppress_range,
4574                             __itt_suppress_threading_errors,
4575                             &new_thr->th.th_suspend_init_count,
4576                             sizeof(new_thr->th.th_suspend_init_count));
4577 #endif
4578   // TODO: check if we need to also suppress b_arrived flags
4579   __itt_suppress_mark_range(__itt_suppress_range,
4580                             __itt_suppress_threading_errors,
4581                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4582                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4583   __itt_suppress_mark_range(__itt_suppress_range,
4584                             __itt_suppress_threading_errors,
4585                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4586                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4587   __itt_suppress_mark_range(__itt_suppress_range,
4588                             __itt_suppress_threading_errors,
4589                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4590                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4591 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4592   if (__kmp_storage_map) {
4593     __kmp_print_thread_storage_map(new_thr, new_gtid);
4594   }
4595
4596   // add the reserve serialized team, initialized from the team's primary thread
4597   {
4598     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4599     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4600     new_thr->th.th_serial_team = serial_team =
4601         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4602 #if OMPT_SUPPORT
4603                                           ompt_data_none, // root parallel id
4604 #endif
4605                                           proc_bind_default, &r_icvs,
4606                                           0 USE_NESTED_HOT_ARG(NULL));
4607   }
4608   KMP_ASSERT(serial_team);
4609   serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4610   // execution (it is unused for now).
4611   serial_team->t.t_threads[0] = new_thr;
4612   KF_TRACE(10,
4613            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4614             new_thr));
4615
4616   /* setup the thread structures */
4617   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4618
4619 #if USE_FAST_MEMORY
4620   __kmp_initialize_fast_memory(new_thr);
4621 #endif /* USE_FAST_MEMORY */
4622
4623 #if KMP_USE_BGET
4624   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4625   __kmp_initialize_bget(new_thr);
4626 #endif
4627
4628   __kmp_init_random(new_thr); // Initialize random number generator
4629
4630   /* Initialize these only once when thread is grabbed for a team allocation */
4631   KA_TRACE(20,
4632            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4633             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4634
4635   int b;
4636   kmp_balign_t *balign = new_thr->th.th_bar;
4637   for (b = 0; b < bs_last_barrier; ++b) {
4638     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4639     balign[b].bb.team = NULL;
4640     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4641     balign[b].bb.use_oncore_barrier = 0;
4642   }
4643
4644   TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4645   new_thr->th.th_sleep_loc_type = flag_unset;
4646
4647   new_thr->th.th_spin_here = FALSE;
4648   new_thr->th.th_next_waiting = 0;
4649 #if KMP_OS_UNIX
4650   new_thr->th.th_blocking = false;
4651 #endif
4652
4653 #if KMP_AFFINITY_SUPPORTED
4654   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4655   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4656   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4657   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4658 #endif
4659   new_thr->th.th_def_allocator = __kmp_def_allocator;
4660   new_thr->th.th_prev_level = 0;
4661   new_thr->th.th_prev_num_threads = 1;
4662
4663   TCW_4(new_thr->th.th_in_pool, FALSE);
4664   new_thr->th.th_active_in_pool = FALSE;
4665   TCW_4(new_thr->th.th_active, TRUE);
4666
4667   /* adjust the global counters */
4668   __kmp_all_nth++;
4669   __kmp_nth++;
4670
4671   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4672   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4673   if (__kmp_adjust_gtid_mode) {
4674     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4675       if (TCR_4(__kmp_gtid_mode) != 2) {
4676         TCW_4(__kmp_gtid_mode, 2);
4677       }
4678     } else {
4679       if (TCR_4(__kmp_gtid_mode) != 1) {
4680         TCW_4(__kmp_gtid_mode, 1);
4681       }
4682     }
4683   }
4684
4685 #ifdef KMP_ADJUST_BLOCKTIME
4686   /* Adjust blocktime back to zero if necessary       */
4687   /* Middle initialization might not have occurred yet */
4688   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4689     if (__kmp_nth > __kmp_avail_proc) {
4690       __kmp_zero_bt = TRUE;
4691     }
4692   }
4693 #endif /* KMP_ADJUST_BLOCKTIME */
4694
4695 #if KMP_AFFINITY_SUPPORTED
4696   // Set the affinity and topology information for new thread
4697   __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4698 #endif
4699
4700   /* actually fork it and create the new worker thread */
4701   KF_TRACE(
4702       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4703   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4704   KF_TRACE(10,
4705            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4706
4707   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4708                 new_gtid));
4709   KMP_MB();
4710   return new_thr;
4711 }
4712
4713 /* Reinitialize team for reuse.
4714    The hot team code calls this case at every fork barrier, so EPCC barrier
4715    test are extremely sensitive to changes in it, esp. writes to the team
4716    struct, which cause a cache invalidation in all threads.
4717    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4718 static void __kmp_reinitialize_team(kmp_team_t *team,
4719                                     kmp_internal_control_t *new_icvs,
4720                                     ident_t *loc) {
4721   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4722                 team->t.t_threads[0], team));
4723   KMP_DEBUG_ASSERT(team && new_icvs);
4724   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4725   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4726
4727   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4728   // Copy ICVs to the primary thread's implicit taskdata
4729   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4730   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4731
4732   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4733                 team->t.t_threads[0], team));
4734 }
4735
4736 /* Initialize the team data structure.
4737    This assumes the t_threads and t_max_nproc are already set.
4738    Also, we don't touch the arguments */
4739 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4740                                   kmp_internal_control_t *new_icvs,
4741                                   ident_t *loc) {
4742   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4743
4744   /* verify */
4745   KMP_DEBUG_ASSERT(team);
4746   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4747   KMP_DEBUG_ASSERT(team->t.t_threads);
4748   KMP_MB();
4749
4750   team->t.t_master_tid = 0; /* not needed */
4751   /* team->t.t_master_bar;        not needed */
4752   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4753   team->t.t_nproc = new_nproc;
4754
4755   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4756   team->t.t_next_pool = NULL;
4757   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4758    * up hot team */
4759
4760   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4761   team->t.t_invoke = NULL; /* not needed */
4762
4763   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4764   team->t.t_sched.sched = new_icvs->sched.sched;
4765
4766 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4767   team->t.t_fp_control_saved = FALSE; /* not needed */
4768   team->t.t_x87_fpu_control_word = 0; /* not needed */
4769   team->t.t_mxcsr = 0; /* not needed */
4770 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4771
4772   team->t.t_construct = 0;
4773
4774   team->t.t_ordered.dt.t_value = 0;
4775   team->t.t_master_active = FALSE;
4776
4777 #ifdef KMP_DEBUG
4778   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4779 #endif
4780 #if KMP_OS_WINDOWS
4781   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4782 #endif
4783
4784   team->t.t_control_stack_top = NULL;
4785
4786   __kmp_reinitialize_team(team, new_icvs, loc);
4787
4788   KMP_MB();
4789   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4790 }
4791
4792 #if KMP_AFFINITY_SUPPORTED
4793 static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4794                                           int first, int last, int newp) {
4795   th->th.th_first_place = first;
4796   th->th.th_last_place = last;
4797   th->th.th_new_place = newp;
4798   if (newp != th->th.th_current_place) {
4799     if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4800       team->t.t_display_affinity = 1;
4801     // Copy topology information associated with the new place
4802     th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4803     th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4804   }
4805 }
4806
4807 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4808 // It calculates the worker + primary thread's partition based upon the parent
4809 // thread's partition, and binds each worker to a thread in their partition.
4810 // The primary thread's partition should already include its current binding.
4811 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4812   // Do not partition places for the hidden helper team
4813   if (KMP_HIDDEN_HELPER_TEAM(team))
4814     return;
4815   // Copy the primary thread's place partition to the team struct
4816   kmp_info_t *master_th = team->t.t_threads[0];
4817   KMP_DEBUG_ASSERT(master_th != NULL);
4818   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4819   int first_place = master_th->th.th_first_place;
4820   int last_place = master_th->th.th_last_place;
4821   int masters_place = master_th->th.th_current_place;
4822   int num_masks = __kmp_affinity.num_masks;
4823   team->t.t_first_place = first_place;
4824   team->t.t_last_place = last_place;
4825
4826   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4827                 "bound to place %d partition = [%d,%d]\n",
4828                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4829                 team->t.t_id, masters_place, first_place, last_place));
4830
4831   switch (proc_bind) {
4832
4833   case proc_bind_default:
4834     // Serial teams might have the proc_bind policy set to proc_bind_default.
4835     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4836     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4837     break;
4838
4839   case proc_bind_primary: {
4840     int f;
4841     int n_th = team->t.t_nproc;
4842     for (f = 1; f < n_th; f++) {
4843       kmp_info_t *th = team->t.t_threads[f];
4844       KMP_DEBUG_ASSERT(th != NULL);
4845       __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4846
4847       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4848                      "partition = [%d,%d]\n",
4849                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4850                      f, masters_place, first_place, last_place));
4851     }
4852   } break;
4853
4854   case proc_bind_close: {
4855     int f;
4856     int n_th = team->t.t_nproc;
4857     int n_places;
4858     if (first_place <= last_place) {
4859       n_places = last_place - first_place + 1;
4860     } else {
4861       n_places = num_masks - first_place + last_place + 1;
4862     }
4863     if (n_th <= n_places) {
4864       int place = masters_place;
4865       for (f = 1; f < n_th; f++) {
4866         kmp_info_t *th = team->t.t_threads[f];
4867         KMP_DEBUG_ASSERT(th != NULL);
4868
4869         if (place == last_place) {
4870           place = first_place;
4871         } else if (place == (num_masks - 1)) {
4872           place = 0;
4873         } else {
4874           place++;
4875         }
4876         __kmp_set_thread_place(team, th, first_place, last_place, place);
4877
4878         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4879                        "partition = [%d,%d]\n",
4880                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4881                        team->t.t_id, f, place, first_place, last_place));
4882       }
4883     } else {
4884       int S, rem, gap, s_count;
4885       S = n_th / n_places;
4886       s_count = 0;
4887       rem = n_th - (S * n_places);
4888       gap = rem > 0 ? n_places / rem : n_places;
4889       int place = masters_place;
4890       int gap_ct = gap;
4891       for (f = 0; f < n_th; f++) {
4892         kmp_info_t *th = team->t.t_threads[f];
4893         KMP_DEBUG_ASSERT(th != NULL);
4894
4895         __kmp_set_thread_place(team, th, first_place, last_place, place);
4896         s_count++;
4897
4898         if ((s_count == S) && rem && (gap_ct == gap)) {
4899           // do nothing, add an extra thread to place on next iteration
4900         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4901           // we added an extra thread to this place; move to next place
4902           if (place == last_place) {
4903             place = first_place;
4904           } else if (place == (num_masks - 1)) {
4905             place = 0;
4906           } else {
4907             place++;
4908           }
4909           s_count = 0;
4910           gap_ct = 1;
4911           rem--;
4912         } else if (s_count == S) { // place full; don't add extra
4913           if (place == last_place) {
4914             place = first_place;
4915           } else if (place == (num_masks - 1)) {
4916             place = 0;
4917           } else {
4918             place++;
4919           }
4920           gap_ct++;
4921           s_count = 0;
4922         }
4923
4924         KA_TRACE(100,
4925                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4926                   "partition = [%d,%d]\n",
4927                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4928                   th->th.th_new_place, first_place, last_place));
4929       }
4930       KMP_DEBUG_ASSERT(place == masters_place);
4931     }
4932   } break;
4933
4934   case proc_bind_spread: {
4935     int f;
4936     int n_th = team->t.t_nproc;
4937     int n_places;
4938     int thidx;
4939     if (first_place <= last_place) {
4940       n_places = last_place - first_place + 1;
4941     } else {
4942       n_places = num_masks - first_place + last_place + 1;
4943     }
4944     if (n_th <= n_places) {
4945       int place = -1;
4946
4947       if (n_places != num_masks) {
4948         int S = n_places / n_th;
4949         int s_count, rem, gap, gap_ct;
4950
4951         place = masters_place;
4952         rem = n_places - n_th * S;
4953         gap = rem ? n_th / rem : 1;
4954         gap_ct = gap;
4955         thidx = n_th;
4956         if (update_master_only == 1)
4957           thidx = 1;
4958         for (f = 0; f < thidx; f++) {
4959           kmp_info_t *th = team->t.t_threads[f];
4960           KMP_DEBUG_ASSERT(th != NULL);
4961
4962           int fplace = place, nplace = place;
4963           s_count = 1;
4964           while (s_count < S) {
4965             if (place == last_place) {
4966               place = first_place;
4967             } else if (place == (num_masks - 1)) {
4968               place = 0;
4969             } else {
4970               place++;
4971             }
4972             s_count++;
4973           }
4974           if (rem && (gap_ct == gap)) {
4975             if (place == last_place) {
4976               place = first_place;
4977             } else if (place == (num_masks - 1)) {
4978               place = 0;
4979             } else {
4980               place++;
4981             }
4982             rem--;
4983             gap_ct = 0;
4984           }
4985           __kmp_set_thread_place(team, th, fplace, place, nplace);
4986           gap_ct++;
4987
4988           if (place == last_place) {
4989             place = first_place;
4990           } else if (place == (num_masks - 1)) {
4991             place = 0;
4992           } else {
4993             place++;
4994           }
4995
4996           KA_TRACE(100,
4997                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4998                     "partition = [%d,%d], num_masks: %u\n",
4999                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5000                     f, th->th.th_new_place, th->th.th_first_place,
5001                     th->th.th_last_place, num_masks));
5002         }
5003       } else {
5004         /* Having uniform space of available computation places I can create
5005            T partitions of round(P/T) size and put threads into the first
5006            place of each partition. */
5007         double current = static_cast<double>(masters_place);
5008         double spacing =
5009             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5010         int first, last;
5011         kmp_info_t *th;
5012
5013         thidx = n_th + 1;
5014         if (update_master_only == 1)
5015           thidx = 1;
5016         for (f = 0; f < thidx; f++) {
5017           first = static_cast<int>(current);
5018           last = static_cast<int>(current + spacing) - 1;
5019           KMP_DEBUG_ASSERT(last >= first);
5020           if (first >= n_places) {
5021             if (masters_place) {
5022               first -= n_places;
5023               last -= n_places;
5024               if (first == (masters_place + 1)) {
5025                 KMP_DEBUG_ASSERT(f == n_th);
5026                 first--;
5027               }
5028               if (last == masters_place) {
5029                 KMP_DEBUG_ASSERT(f == (n_th - 1));
5030                 last--;
5031               }
5032             } else {
5033               KMP_DEBUG_ASSERT(f == n_th);
5034               first = 0;
5035               last = 0;
5036             }
5037           }
5038           if (last >= n_places) {
5039             last = (n_places - 1);
5040           }
5041           place = first;
5042           current += spacing;
5043           if (f < n_th) {
5044             KMP_DEBUG_ASSERT(0 <= first);
5045             KMP_DEBUG_ASSERT(n_places > first);
5046             KMP_DEBUG_ASSERT(0 <= last);
5047             KMP_DEBUG_ASSERT(n_places > last);
5048             KMP_DEBUG_ASSERT(last_place >= first_place);
5049             th = team->t.t_threads[f];
5050             KMP_DEBUG_ASSERT(th);
5051             __kmp_set_thread_place(team, th, first, last, place);
5052             KA_TRACE(100,
5053                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5054                       "partition = [%d,%d], spacing = %.4f\n",
5055                       __kmp_gtid_from_thread(team->t.t_threads[f]),
5056                       team->t.t_id, f, th->th.th_new_place,
5057                       th->th.th_first_place, th->th.th_last_place, spacing));
5058           }
5059         }
5060       }
5061       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5062     } else {
5063       int S, rem, gap, s_count;
5064       S = n_th / n_places;
5065       s_count = 0;
5066       rem = n_th - (S * n_places);
5067       gap = rem > 0 ? n_places / rem : n_places;
5068       int place = masters_place;
5069       int gap_ct = gap;
5070       thidx = n_th;
5071       if (update_master_only == 1)
5072         thidx = 1;
5073       for (f = 0; f < thidx; f++) {
5074         kmp_info_t *th = team->t.t_threads[f];
5075         KMP_DEBUG_ASSERT(th != NULL);
5076
5077         __kmp_set_thread_place(team, th, place, place, place);
5078         s_count++;
5079
5080         if ((s_count == S) && rem && (gap_ct == gap)) {
5081           // do nothing, add an extra thread to place on next iteration
5082         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5083           // we added an extra thread to this place; move on to next place
5084           if (place == last_place) {
5085             place = first_place;
5086           } else if (place == (num_masks - 1)) {
5087             place = 0;
5088           } else {
5089             place++;
5090           }
5091           s_count = 0;
5092           gap_ct = 1;
5093           rem--;
5094         } else if (s_count == S) { // place is full; don't add extra thread
5095           if (place == last_place) {
5096             place = first_place;
5097           } else if (place == (num_masks - 1)) {
5098             place = 0;
5099           } else {
5100             place++;
5101           }
5102           gap_ct++;
5103           s_count = 0;
5104         }
5105
5106         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5107                        "partition = [%d,%d]\n",
5108                        __kmp_gtid_from_thread(team->t.t_threads[f]),
5109                        team->t.t_id, f, th->th.th_new_place,
5110                        th->th.th_first_place, th->th.th_last_place));
5111       }
5112       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5113     }
5114   } break;
5115
5116   default:
5117     break;
5118   }
5119
5120   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5121 }
5122
5123 #endif // KMP_AFFINITY_SUPPORTED
5124
5125 /* allocate a new team data structure to use.  take one off of the free pool if
5126    available */
5127 kmp_team_t *
5128 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5129 #if OMPT_SUPPORT
5130                     ompt_data_t ompt_parallel_data,
5131 #endif
5132                     kmp_proc_bind_t new_proc_bind,
5133                     kmp_internal_control_t *new_icvs,
5134                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5135   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5136   int f;
5137   kmp_team_t *team;
5138   int use_hot_team = !root->r.r_active;
5139   int level = 0;
5140   int do_place_partition = 1;
5141
5142   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5143   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5144   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5145   KMP_MB();
5146
5147 #if KMP_NESTED_HOT_TEAMS
5148   kmp_hot_team_ptr_t *hot_teams;
5149   if (master) {
5150     team = master->th.th_team;
5151     level = team->t.t_active_level;
5152     if (master->th.th_teams_microtask) { // in teams construct?
5153       if (master->th.th_teams_size.nteams > 1 &&
5154           ( // #teams > 1
5155               team->t.t_pkfn ==
5156                   (microtask_t)__kmp_teams_master || // inner fork of the teams
5157               master->th.th_teams_level <
5158                   team->t.t_level)) { // or nested parallel inside the teams
5159         ++level; // not increment if #teams==1, or for outer fork of the teams;
5160         // increment otherwise
5161       }
5162       // Do not perform the place partition if inner fork of the teams
5163       // Wait until nested parallel region encountered inside teams construct
5164       if ((master->th.th_teams_size.nteams == 1 &&
5165            master->th.th_teams_level >= team->t.t_level) ||
5166           (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5167         do_place_partition = 0;
5168     }
5169     hot_teams = master->th.th_hot_teams;
5170     if (level < __kmp_hot_teams_max_level && hot_teams &&
5171         hot_teams[level].hot_team) {
5172       // hot team has already been allocated for given level
5173       use_hot_team = 1;
5174     } else {
5175       use_hot_team = 0;
5176     }
5177   } else {
5178     // check we won't access uninitialized hot_teams, just in case
5179     KMP_DEBUG_ASSERT(new_nproc == 1);
5180   }
5181 #endif
5182   // Optimization to use a "hot" team
5183   if (use_hot_team && new_nproc > 1) {
5184     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5185 #if KMP_NESTED_HOT_TEAMS
5186     team = hot_teams[level].hot_team;
5187 #else
5188     team = root->r.r_hot_team;
5189 #endif
5190 #if KMP_DEBUG
5191     if (__kmp_tasking_mode != tskm_immediate_exec) {
5192       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5193                     "task_team[1] = %p before reinit\n",
5194                     team->t.t_task_team[0], team->t.t_task_team[1]));
5195     }
5196 #endif
5197
5198     if (team->t.t_nproc != new_nproc &&
5199         __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5200       // Distributed barrier may need a resize
5201       int old_nthr = team->t.t_nproc;
5202       __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5203     }
5204
5205     // If not doing the place partition, then reset the team's proc bind
5206     // to indicate that partitioning of all threads still needs to take place
5207     if (do_place_partition == 0)
5208       team->t.t_proc_bind = proc_bind_default;
5209     // Has the number of threads changed?
5210     /* Let's assume the most common case is that the number of threads is
5211        unchanged, and put that case first. */
5212     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5213       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5214       // This case can mean that omp_set_num_threads() was called and the hot
5215       // team size was already reduced, so we check the special flag
5216       if (team->t.t_size_changed == -1) {
5217         team->t.t_size_changed = 1;
5218       } else {
5219         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5220       }
5221
5222       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5223       kmp_r_sched_t new_sched = new_icvs->sched;
5224       // set primary thread's schedule as new run-time schedule
5225       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5226
5227       __kmp_reinitialize_team(team, new_icvs,
5228                               root->r.r_uber_thread->th.th_ident);
5229
5230       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5231                     team->t.t_threads[0], team));
5232       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5233
5234 #if KMP_AFFINITY_SUPPORTED
5235       if ((team->t.t_size_changed == 0) &&
5236           (team->t.t_proc_bind == new_proc_bind)) {
5237         if (new_proc_bind == proc_bind_spread) {
5238           if (do_place_partition) {
5239             // add flag to update only master for spread
5240             __kmp_partition_places(team, 1);
5241           }
5242         }
5243         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5244                        "proc_bind = %d, partition = [%d,%d]\n",
5245                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5246                        team->t.t_last_place));
5247       } else {
5248         if (do_place_partition) {
5249           KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5250           __kmp_partition_places(team);
5251         }
5252       }
5253 #else
5254       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5255 #endif /* KMP_AFFINITY_SUPPORTED */
5256     } else if (team->t.t_nproc > new_nproc) {
5257       KA_TRACE(20,
5258                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5259                 new_nproc));
5260
5261       team->t.t_size_changed = 1;
5262       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5263         // Barrier size already reduced earlier in this function
5264         // Activate team threads via th_used_in_team
5265         __kmp_add_threads_to_team(team, new_nproc);
5266       }
5267 #if KMP_NESTED_HOT_TEAMS
5268       if (__kmp_hot_teams_mode == 0) {
5269         // AC: saved number of threads should correspond to team's value in this
5270         // mode, can be bigger in mode 1, when hot team has threads in reserve
5271         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5272         hot_teams[level].hot_team_nth = new_nproc;
5273 #endif // KMP_NESTED_HOT_TEAMS
5274         /* release the extra threads we don't need any more */
5275         for (f = new_nproc; f < team->t.t_nproc; f++) {
5276           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5277           if (__kmp_tasking_mode != tskm_immediate_exec) {
5278             // When decreasing team size, threads no longer in the team should
5279             // unref task team.
5280             team->t.t_threads[f]->th.th_task_team = NULL;
5281           }
5282           __kmp_free_thread(team->t.t_threads[f]);
5283           team->t.t_threads[f] = NULL;
5284         }
5285 #if KMP_NESTED_HOT_TEAMS
5286       } // (__kmp_hot_teams_mode == 0)
5287       else {
5288         // When keeping extra threads in team, switch threads to wait on own
5289         // b_go flag
5290         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5291           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5292           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5293           for (int b = 0; b < bs_last_barrier; ++b) {
5294             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5295               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5296             }
5297             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5298           }
5299         }
5300       }
5301 #endif // KMP_NESTED_HOT_TEAMS
5302       team->t.t_nproc = new_nproc;
5303       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5304       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5305       __kmp_reinitialize_team(team, new_icvs,
5306                               root->r.r_uber_thread->th.th_ident);
5307
5308       // Update remaining threads
5309       for (f = 0; f < new_nproc; ++f) {
5310         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5311       }
5312
5313       // restore the current task state of the primary thread: should be the
5314       // implicit task
5315       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5316                     team->t.t_threads[0], team));
5317
5318       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5319
5320 #ifdef KMP_DEBUG
5321       for (f = 0; f < team->t.t_nproc; f++) {
5322         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5323                          team->t.t_threads[f]->th.th_team_nproc ==
5324                              team->t.t_nproc);
5325       }
5326 #endif
5327
5328       if (do_place_partition) {
5329         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5330 #if KMP_AFFINITY_SUPPORTED
5331         __kmp_partition_places(team);
5332 #endif
5333       }
5334     } else { // team->t.t_nproc < new_nproc
5335
5336       KA_TRACE(20,
5337                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5338                 new_nproc));
5339       int old_nproc = team->t.t_nproc; // save old value and use to update only
5340       team->t.t_size_changed = 1;
5341
5342 #if KMP_NESTED_HOT_TEAMS
5343       int avail_threads = hot_teams[level].hot_team_nth;
5344       if (new_nproc < avail_threads)
5345         avail_threads = new_nproc;
5346       kmp_info_t **other_threads = team->t.t_threads;
5347       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5348         // Adjust barrier data of reserved threads (if any) of the team
5349         // Other data will be set in __kmp_initialize_info() below.
5350         int b;
5351         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5352         for (b = 0; b < bs_last_barrier; ++b) {
5353           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5354           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5355 #if USE_DEBUGGER
5356           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5357 #endif
5358         }
5359       }
5360       if (hot_teams[level].hot_team_nth >= new_nproc) {
5361         // we have all needed threads in reserve, no need to allocate any
5362         // this only possible in mode 1, cannot have reserved threads in mode 0
5363         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5364         team->t.t_nproc = new_nproc; // just get reserved threads involved
5365       } else {
5366         // We may have some threads in reserve, but not enough;
5367         // get reserved threads involved if any.
5368         team->t.t_nproc = hot_teams[level].hot_team_nth;
5369         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5370 #endif // KMP_NESTED_HOT_TEAMS
5371         if (team->t.t_max_nproc < new_nproc) {
5372           /* reallocate larger arrays */
5373           __kmp_reallocate_team_arrays(team, new_nproc);
5374           __kmp_reinitialize_team(team, new_icvs, NULL);
5375         }
5376
5377 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5378         /* Temporarily set full mask for primary thread before creation of
5379            workers. The reason is that workers inherit the affinity from the
5380            primary thread, so if a lot of workers are created on the single
5381            core quickly, they don't get a chance to set their own affinity for
5382            a long time. */
5383         kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5384 #endif
5385
5386         /* allocate new threads for the hot team */
5387         for (f = team->t.t_nproc; f < new_nproc; f++) {
5388           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5389           KMP_DEBUG_ASSERT(new_worker);
5390           team->t.t_threads[f] = new_worker;
5391
5392           KA_TRACE(20,
5393                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5394                     "join=%llu, plain=%llu\n",
5395                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5396                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5397                     team->t.t_bar[bs_plain_barrier].b_arrived));
5398
5399           { // Initialize barrier data for new threads.
5400             int b;
5401             kmp_balign_t *balign = new_worker->th.th_bar;
5402             for (b = 0; b < bs_last_barrier; ++b) {
5403               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5404               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5405                                KMP_BARRIER_PARENT_FLAG);
5406 #if USE_DEBUGGER
5407               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5408 #endif
5409             }
5410           }
5411         }
5412
5413 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5414         /* Restore initial primary thread's affinity mask */
5415         new_temp_affinity.restore();
5416 #endif
5417 #if KMP_NESTED_HOT_TEAMS
5418       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5419 #endif // KMP_NESTED_HOT_TEAMS
5420       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5421         // Barrier size already increased earlier in this function
5422         // Activate team threads via th_used_in_team
5423         __kmp_add_threads_to_team(team, new_nproc);
5424       }
5425       /* make sure everyone is syncronized */
5426       // new threads below
5427       __kmp_initialize_team(team, new_nproc, new_icvs,
5428                             root->r.r_uber_thread->th.th_ident);
5429
5430       /* reinitialize the threads */
5431       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5432       for (f = 0; f < team->t.t_nproc; ++f)
5433         __kmp_initialize_info(team->t.t_threads[f], team, f,
5434                               __kmp_gtid_from_tid(f, team));
5435
5436       // set th_task_state for new threads in hot team with older thread's state
5437       kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5438       for (f = old_nproc; f < team->t.t_nproc; ++f)
5439         team->t.t_threads[f]->th.th_task_state = old_state;
5440
5441 #ifdef KMP_DEBUG
5442       for (f = 0; f < team->t.t_nproc; ++f) {
5443         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5444                          team->t.t_threads[f]->th.th_team_nproc ==
5445                              team->t.t_nproc);
5446       }
5447 #endif
5448
5449       if (do_place_partition) {
5450         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5451 #if KMP_AFFINITY_SUPPORTED
5452         __kmp_partition_places(team);
5453 #endif
5454       }
5455     } // Check changes in number of threads
5456
5457     kmp_info_t *master = team->t.t_threads[0];
5458     if (master->th.th_teams_microtask) {
5459       for (f = 1; f < new_nproc; ++f) {
5460         // propagate teams construct specific info to workers
5461         kmp_info_t *thr = team->t.t_threads[f];
5462         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5463         thr->th.th_teams_level = master->th.th_teams_level;
5464         thr->th.th_teams_size = master->th.th_teams_size;
5465       }
5466     }
5467 #if KMP_NESTED_HOT_TEAMS
5468     if (level) {
5469       // Sync barrier state for nested hot teams, not needed for outermost hot
5470       // team.
5471       for (f = 1; f < new_nproc; ++f) {
5472         kmp_info_t *thr = team->t.t_threads[f];
5473         int b;
5474         kmp_balign_t *balign = thr->th.th_bar;
5475         for (b = 0; b < bs_last_barrier; ++b) {
5476           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5477           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5478 #if USE_DEBUGGER
5479           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5480 #endif
5481         }
5482       }
5483     }
5484 #endif // KMP_NESTED_HOT_TEAMS
5485
5486     /* reallocate space for arguments if necessary */
5487     __kmp_alloc_argv_entries(argc, team, TRUE);
5488     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5489     // The hot team re-uses the previous task team,
5490     // if untouched during the previous release->gather phase.
5491
5492     KF_TRACE(10, (" hot_team = %p\n", team));
5493
5494 #if KMP_DEBUG
5495     if (__kmp_tasking_mode != tskm_immediate_exec) {
5496       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5497                     "task_team[1] = %p after reinit\n",
5498                     team->t.t_task_team[0], team->t.t_task_team[1]));
5499     }
5500 #endif
5501
5502 #if OMPT_SUPPORT
5503     __ompt_team_assign_id(team, ompt_parallel_data);
5504 #endif
5505
5506     KMP_MB();
5507
5508     return team;
5509   }
5510
5511   /* next, let's try to take one from the team pool */
5512   KMP_MB();
5513   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5514     /* TODO: consider resizing undersized teams instead of reaping them, now
5515        that we have a resizing mechanism */
5516     if (team->t.t_max_nproc >= max_nproc) {
5517       /* take this team from the team pool */
5518       __kmp_team_pool = team->t.t_next_pool;
5519
5520       if (max_nproc > 1 &&
5521           __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5522         if (!team->t.b) { // Allocate barrier structure
5523           team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5524         }
5525       }
5526
5527       /* setup the team for fresh use */
5528       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5529
5530       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5531                     "task_team[1] %p to NULL\n",
5532                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5533       team->t.t_task_team[0] = NULL;
5534       team->t.t_task_team[1] = NULL;
5535
5536       /* reallocate space for arguments if necessary */
5537       __kmp_alloc_argv_entries(argc, team, TRUE);
5538       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5539
5540       KA_TRACE(
5541           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5542                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5543       { // Initialize barrier data.
5544         int b;
5545         for (b = 0; b < bs_last_barrier; ++b) {
5546           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5547 #if USE_DEBUGGER
5548           team->t.t_bar[b].b_master_arrived = 0;
5549           team->t.t_bar[b].b_team_arrived = 0;
5550 #endif
5551         }
5552       }
5553
5554       team->t.t_proc_bind = new_proc_bind;
5555
5556       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5557                     team->t.t_id));
5558
5559 #if OMPT_SUPPORT
5560       __ompt_team_assign_id(team, ompt_parallel_data);
5561 #endif
5562
5563       KMP_MB();
5564
5565       return team;
5566     }
5567
5568     /* reap team if it is too small, then loop back and check the next one */
5569     // not sure if this is wise, but, will be redone during the hot-teams
5570     // rewrite.
5571     /* TODO: Use technique to find the right size hot-team, don't reap them */
5572     team = __kmp_reap_team(team);
5573     __kmp_team_pool = team;
5574   }
5575
5576   /* nothing available in the pool, no matter, make a new team! */
5577   KMP_MB();
5578   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5579
5580   /* and set it up */
5581   team->t.t_max_nproc = max_nproc;
5582   if (max_nproc > 1 &&
5583       __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5584     // Allocate barrier structure
5585     team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5586   }
5587
5588   /* NOTE well, for some reason allocating one big buffer and dividing it up
5589      seems to really hurt performance a lot on the P4, so, let's not use this */
5590   __kmp_allocate_team_arrays(team, max_nproc);
5591
5592   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5593   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5594
5595   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5596                 "%p to NULL\n",
5597                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5598   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5599   // memory, no need to duplicate
5600   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5601   // memory, no need to duplicate
5602
5603   if (__kmp_storage_map) {
5604     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5605   }
5606
5607   /* allocate space for arguments */
5608   __kmp_alloc_argv_entries(argc, team, FALSE);
5609   team->t.t_argc = argc;
5610
5611   KA_TRACE(20,
5612            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5613             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5614   { // Initialize barrier data.
5615     int b;
5616     for (b = 0; b < bs_last_barrier; ++b) {
5617       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5618 #if USE_DEBUGGER
5619       team->t.t_bar[b].b_master_arrived = 0;
5620       team->t.t_bar[b].b_team_arrived = 0;
5621 #endif
5622     }
5623   }
5624
5625   team->t.t_proc_bind = new_proc_bind;
5626
5627 #if OMPT_SUPPORT
5628   __ompt_team_assign_id(team, ompt_parallel_data);
5629   team->t.ompt_serialized_team_info = NULL;
5630 #endif
5631
5632   KMP_MB();
5633
5634   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5635                 team->t.t_id));
5636
5637   return team;
5638 }
5639
5640 /* TODO implement hot-teams at all levels */
5641 /* TODO implement lazy thread release on demand (disband request) */
5642
5643 /* free the team.  return it to the team pool.  release all the threads
5644  * associated with it */
5645 void __kmp_free_team(kmp_root_t *root,
5646                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5647   int f;
5648   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5649                 team->t.t_id));
5650
5651   /* verify state */
5652   KMP_DEBUG_ASSERT(root);
5653   KMP_DEBUG_ASSERT(team);
5654   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5655   KMP_DEBUG_ASSERT(team->t.t_threads);
5656
5657   int use_hot_team = team == root->r.r_hot_team;
5658 #if KMP_NESTED_HOT_TEAMS
5659   int level;
5660   if (master) {
5661     level = team->t.t_active_level - 1;
5662     if (master->th.th_teams_microtask) { // in teams construct?
5663       if (master->th.th_teams_size.nteams > 1) {
5664         ++level; // level was not increased in teams construct for
5665         // team_of_masters
5666       }
5667       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5668           master->th.th_teams_level == team->t.t_level) {
5669         ++level; // level was not increased in teams construct for
5670         // team_of_workers before the parallel
5671       } // team->t.t_level will be increased inside parallel
5672     }
5673 #if KMP_DEBUG
5674     kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5675 #endif
5676     if (level < __kmp_hot_teams_max_level) {
5677       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5678       use_hot_team = 1;
5679     }
5680   }
5681 #endif // KMP_NESTED_HOT_TEAMS
5682
5683   /* team is done working */
5684   TCW_SYNC_PTR(team->t.t_pkfn,
5685                NULL); // Important for Debugging Support Library.
5686 #if KMP_OS_WINDOWS
5687   team->t.t_copyin_counter = 0; // init counter for possible reuse
5688 #endif
5689   // Do not reset pointer to parent team to NULL for hot teams.
5690
5691   /* if we are non-hot team, release our threads */
5692   if (!use_hot_team) {
5693     if (__kmp_tasking_mode != tskm_immediate_exec) {
5694       // Wait for threads to reach reapable state
5695       for (f = 1; f < team->t.t_nproc; ++f) {
5696         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5697         kmp_info_t *th = team->t.t_threads[f];
5698         volatile kmp_uint32 *state = &th->th.th_reap_state;
5699         while (*state != KMP_SAFE_TO_REAP) {
5700 #if KMP_OS_WINDOWS
5701           // On Windows a thread can be killed at any time, check this
5702           DWORD ecode;
5703           if (!__kmp_is_thread_alive(th, &ecode)) {
5704             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5705             break;
5706           }
5707 #endif
5708           // first check if thread is sleeping
5709           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5710           if (fl.is_sleeping())
5711             fl.resume(__kmp_gtid_from_thread(th));
5712           KMP_CPU_PAUSE();
5713         }
5714       }
5715
5716       // Delete task teams
5717       int tt_idx;
5718       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5719         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5720         if (task_team != NULL) {
5721           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5722             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5723             team->t.t_threads[f]->th.th_task_team = NULL;
5724           }
5725           KA_TRACE(
5726               20,
5727               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5728                __kmp_get_gtid(), task_team, team->t.t_id));
5729 #if KMP_NESTED_HOT_TEAMS
5730           __kmp_free_task_team(master, task_team);
5731 #endif
5732           team->t.t_task_team[tt_idx] = NULL;
5733         }
5734       }
5735     }
5736
5737     // Reset pointer to parent team only for non-hot teams.
5738     team->t.t_parent = NULL;
5739     team->t.t_level = 0;
5740     team->t.t_active_level = 0;
5741
5742     /* free the worker threads */
5743     for (f = 1; f < team->t.t_nproc; ++f) {
5744       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5745       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5746         KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5747                                     1, 2);
5748       }
5749       __kmp_free_thread(team->t.t_threads[f]);
5750     }
5751
5752     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5753       if (team->t.b) {
5754         // wake up thread at old location
5755         team->t.b->go_release();
5756         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5757           for (f = 1; f < team->t.t_nproc; ++f) {
5758             if (team->t.b->sleep[f].sleep) {
5759               __kmp_atomic_resume_64(
5760                   team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5761                   (kmp_atomic_flag_64<> *)NULL);
5762             }
5763           }
5764         }
5765         // Wait for threads to be removed from team
5766         for (int f = 1; f < team->t.t_nproc; ++f) {
5767           while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5768             KMP_CPU_PAUSE();
5769         }
5770       }
5771     }
5772
5773     for (f = 1; f < team->t.t_nproc; ++f) {
5774       team->t.t_threads[f] = NULL;
5775     }
5776
5777     if (team->t.t_max_nproc > 1 &&
5778         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5779       distributedBarrier::deallocate(team->t.b);
5780       team->t.b = NULL;
5781     }
5782     /* put the team back in the team pool */
5783     /* TODO limit size of team pool, call reap_team if pool too large */
5784     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5785     __kmp_team_pool = (volatile kmp_team_t *)team;
5786   } else { // Check if team was created for primary threads in teams construct
5787     // See if first worker is a CG root
5788     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5789                      team->t.t_threads[1]->th.th_cg_roots);
5790     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5791       // Clean up the CG root nodes on workers so that this team can be re-used
5792       for (f = 1; f < team->t.t_nproc; ++f) {
5793         kmp_info_t *thr = team->t.t_threads[f];
5794         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5795                          thr->th.th_cg_roots->cg_root == thr);
5796         // Pop current CG root off list
5797         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5798         thr->th.th_cg_roots = tmp->up;
5799         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5800                        " up to node %p. cg_nthreads was %d\n",
5801                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5802         int i = tmp->cg_nthreads--;
5803         if (i == 1) {
5804           __kmp_free(tmp); // free CG if we are the last thread in it
5805         }
5806         // Restore current task's thread_limit from CG root
5807         if (thr->th.th_cg_roots)
5808           thr->th.th_current_task->td_icvs.thread_limit =
5809               thr->th.th_cg_roots->cg_thread_limit;
5810       }
5811     }
5812   }
5813
5814   KMP_MB();
5815 }
5816
5817 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5818 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5819   kmp_team_t *next_pool = team->t.t_next_pool;
5820
5821   KMP_DEBUG_ASSERT(team);
5822   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5823   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5824   KMP_DEBUG_ASSERT(team->t.t_threads);
5825   KMP_DEBUG_ASSERT(team->t.t_argv);
5826
5827   /* TODO clean the threads that are a part of this? */
5828
5829   /* free stuff */
5830   __kmp_free_team_arrays(team);
5831   if (team->t.t_argv != &team->t.t_inline_argv[0])
5832     __kmp_free((void *)team->t.t_argv);
5833   __kmp_free(team);
5834
5835   KMP_MB();
5836   return next_pool;
5837 }
5838
5839 // Free the thread.  Don't reap it, just place it on the pool of available
5840 // threads.
5841 //
5842 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5843 // binding for the affinity mechanism to be useful.
5844 //
5845 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5846 // However, we want to avoid a potential performance problem by always
5847 // scanning through the list to find the correct point at which to insert
5848 // the thread (potential N**2 behavior).  To do this we keep track of the
5849 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5850 // With single-level parallelism, threads will always be added to the tail
5851 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5852 // parallelism, all bets are off and we may need to scan through the entire
5853 // free list.
5854 //
5855 // This change also has a potentially large performance benefit, for some
5856 // applications.  Previously, as threads were freed from the hot team, they
5857 // would be placed back on the free list in inverse order.  If the hot team
5858 // grew back to it's original size, then the freed thread would be placed
5859 // back on the hot team in reverse order.  This could cause bad cache
5860 // locality problems on programs where the size of the hot team regularly
5861 // grew and shrunk.
5862 //
5863 // Now, for single-level parallelism, the OMP tid is always == gtid.
5864 void __kmp_free_thread(kmp_info_t *this_th) {
5865   int gtid;
5866   kmp_info_t **scan;
5867
5868   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5869                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5870
5871   KMP_DEBUG_ASSERT(this_th);
5872
5873   // When moving thread to pool, switch thread to wait on own b_go flag, and
5874   // uninitialized (NULL team).
5875   int b;
5876   kmp_balign_t *balign = this_th->th.th_bar;
5877   for (b = 0; b < bs_last_barrier; ++b) {
5878     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5879       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5880     balign[b].bb.team = NULL;
5881     balign[b].bb.leaf_kids = 0;
5882   }
5883   this_th->th.th_task_state = 0;
5884   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5885
5886   /* put thread back on the free pool */
5887   TCW_PTR(this_th->th.th_team, NULL);
5888   TCW_PTR(this_th->th.th_root, NULL);
5889   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5890
5891   while (this_th->th.th_cg_roots) {
5892     this_th->th.th_cg_roots->cg_nthreads--;
5893     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5894                    " %p of thread  %p to %d\n",
5895                    this_th, this_th->th.th_cg_roots,
5896                    this_th->th.th_cg_roots->cg_root,
5897                    this_th->th.th_cg_roots->cg_nthreads));
5898     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5899     if (tmp->cg_root == this_th) { // Thread is a cg_root
5900       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5901       KA_TRACE(
5902           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5903       this_th->th.th_cg_roots = tmp->up;
5904       __kmp_free(tmp);
5905     } else { // Worker thread
5906       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5907         __kmp_free(tmp);
5908       }
5909       this_th->th.th_cg_roots = NULL;
5910       break;
5911     }
5912   }
5913
5914   /* If the implicit task assigned to this thread can be used by other threads
5915    * -> multiple threads can share the data and try to free the task at
5916    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5917    * with higher probability when hot team is disabled but can occurs even when
5918    * the hot team is enabled */
5919   __kmp_free_implicit_task(this_th);
5920   this_th->th.th_current_task = NULL;
5921
5922   // If the __kmp_thread_pool_insert_pt is already past the new insert
5923   // point, then we need to re-scan the entire list.
5924   gtid = this_th->th.th_info.ds.ds_gtid;
5925   if (__kmp_thread_pool_insert_pt != NULL) {
5926     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5927     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5928       __kmp_thread_pool_insert_pt = NULL;
5929     }
5930   }
5931
5932   // Scan down the list to find the place to insert the thread.
5933   // scan is the address of a link in the list, possibly the address of
5934   // __kmp_thread_pool itself.
5935   //
5936   // In the absence of nested parallelism, the for loop will have 0 iterations.
5937   if (__kmp_thread_pool_insert_pt != NULL) {
5938     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5939   } else {
5940     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5941   }
5942   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5943        scan = &((*scan)->th.th_next_pool))
5944     ;
5945
5946   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5947   // to its address.
5948   TCW_PTR(this_th->th.th_next_pool, *scan);
5949   __kmp_thread_pool_insert_pt = *scan = this_th;
5950   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5951                    (this_th->th.th_info.ds.ds_gtid <
5952                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5953   TCW_4(this_th->th.th_in_pool, TRUE);
5954   __kmp_suspend_initialize_thread(this_th);
5955   __kmp_lock_suspend_mx(this_th);
5956   if (this_th->th.th_active == TRUE) {
5957     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5958     this_th->th.th_active_in_pool = TRUE;
5959   }
5960 #if KMP_DEBUG
5961   else {
5962     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5963   }
5964 #endif
5965   __kmp_unlock_suspend_mx(this_th);
5966
5967   TCW_4(__kmp_nth, __kmp_nth - 1);
5968
5969 #ifdef KMP_ADJUST_BLOCKTIME
5970   /* Adjust blocktime back to user setting or default if necessary */
5971   /* Middle initialization might never have occurred                */
5972   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5973     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5974     if (__kmp_nth <= __kmp_avail_proc) {
5975       __kmp_zero_bt = FALSE;
5976     }
5977   }
5978 #endif /* KMP_ADJUST_BLOCKTIME */
5979
5980   KMP_MB();
5981 }
5982
5983 /* ------------------------------------------------------------------------ */
5984
5985 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5986 #if OMP_PROFILING_SUPPORT
5987   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5988   // TODO: add a configuration option for time granularity
5989   if (ProfileTraceFile)
5990     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5991 #endif
5992
5993   int gtid = this_thr->th.th_info.ds.ds_gtid;
5994   /*    void                 *stack_data;*/
5995   kmp_team_t **volatile pteam;
5996
5997   KMP_MB();
5998   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5999
6000   if (__kmp_env_consistency_check) {
6001     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6002   }
6003
6004 #if OMPD_SUPPORT
6005   if (ompd_state & OMPD_ENABLE_BP)
6006     ompd_bp_thread_begin();
6007 #endif
6008
6009 #if OMPT_SUPPORT
6010   ompt_data_t *thread_data = nullptr;
6011   if (ompt_enabled.enabled) {
6012     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6013     *thread_data = ompt_data_none;
6014
6015     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6016     this_thr->th.ompt_thread_info.wait_id = 0;
6017     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6018     this_thr->th.ompt_thread_info.parallel_flags = 0;
6019     if (ompt_enabled.ompt_callback_thread_begin) {
6020       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6021           ompt_thread_worker, thread_data);
6022     }
6023     this_thr->th.ompt_thread_info.state = ompt_state_idle;
6024   }
6025 #endif
6026
6027   /* This is the place where threads wait for work */
6028   while (!TCR_4(__kmp_global.g.g_done)) {
6029     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6030     KMP_MB();
6031
6032     /* wait for work to do */
6033     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6034
6035     /* No tid yet since not part of a team */
6036     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6037
6038 #if OMPT_SUPPORT
6039     if (ompt_enabled.enabled) {
6040       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6041     }
6042 #endif
6043
6044     pteam = &this_thr->th.th_team;
6045
6046     /* have we been allocated? */
6047     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6048       /* we were just woken up, so run our new task */
6049       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6050         int rc;
6051         KA_TRACE(20,
6052                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6053                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6054                   (*pteam)->t.t_pkfn));
6055
6056         updateHWFPControl(*pteam);
6057
6058 #if OMPT_SUPPORT
6059         if (ompt_enabled.enabled) {
6060           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6061         }
6062 #endif
6063
6064         rc = (*pteam)->t.t_invoke(gtid);
6065         KMP_ASSERT(rc);
6066
6067         KMP_MB();
6068         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6069                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6070                       (*pteam)->t.t_pkfn));
6071       }
6072 #if OMPT_SUPPORT
6073       if (ompt_enabled.enabled) {
6074         /* no frame set while outside task */
6075         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6076
6077         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6078       }
6079 #endif
6080       /* join barrier after parallel region */
6081       __kmp_join_barrier(gtid);
6082     }
6083   }
6084
6085 #if OMPD_SUPPORT
6086   if (ompd_state & OMPD_ENABLE_BP)
6087     ompd_bp_thread_end();
6088 #endif
6089
6090 #if OMPT_SUPPORT
6091   if (ompt_enabled.ompt_callback_thread_end) {
6092     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6093   }
6094 #endif
6095
6096   this_thr->th.th_task_team = NULL;
6097   /* run the destructors for the threadprivate data for this thread */
6098   __kmp_common_destroy_gtid(gtid);
6099
6100   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6101   KMP_MB();
6102
6103 #if OMP_PROFILING_SUPPORT
6104   llvm::timeTraceProfilerFinishThread();
6105 #endif
6106   return this_thr;
6107 }
6108
6109 /* ------------------------------------------------------------------------ */
6110
6111 void __kmp_internal_end_dest(void *specific_gtid) {
6112   // Make sure no significant bits are lost
6113   int gtid;
6114   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6115
6116   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6117   /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
6118    * this is because 0 is reserved for the nothing-stored case */
6119
6120   __kmp_internal_end_thread(gtid);
6121 }
6122
6123 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6124
6125 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6126   __kmp_internal_end_atexit();
6127 }
6128
6129 #endif
6130
6131 /* [Windows] josh: when the atexit handler is called, there may still be more
6132    than one thread alive */
6133 void __kmp_internal_end_atexit(void) {
6134   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6135   /* [Windows]
6136      josh: ideally, we want to completely shutdown the library in this atexit
6137      handler, but stat code that depends on thread specific data for gtid fails
6138      because that data becomes unavailable at some point during the shutdown, so
6139      we call __kmp_internal_end_thread instead. We should eventually remove the
6140      dependency on __kmp_get_specific_gtid in the stat code and use
6141      __kmp_internal_end_library to cleanly shutdown the library.
6142
6143      // TODO: Can some of this comment about GVS be removed?
6144      I suspect that the offending stat code is executed when the calling thread
6145      tries to clean up a dead root thread's data structures, resulting in GVS
6146      code trying to close the GVS structures for that thread, but since the stat
6147      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6148      the calling thread is cleaning up itself instead of another thread, it get
6149      confused. This happens because allowing a thread to unregister and cleanup
6150      another thread is a recent modification for addressing an issue.
6151      Based on the current design (20050722), a thread may end up
6152      trying to unregister another thread only if thread death does not trigger
6153      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
6154      thread specific data destructor function to detect thread death. For
6155      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6156      is nothing.  Thus, the workaround is applicable only for Windows static
6157      stat library. */
6158   __kmp_internal_end_library(-1);
6159 #if KMP_OS_WINDOWS
6160   __kmp_close_console();
6161 #endif
6162 }
6163
6164 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6165   // It is assumed __kmp_forkjoin_lock is acquired.
6166
6167   int gtid;
6168
6169   KMP_DEBUG_ASSERT(thread != NULL);
6170
6171   gtid = thread->th.th_info.ds.ds_gtid;
6172
6173   if (!is_root) {
6174     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6175       /* Assume the threads are at the fork barrier here */
6176       KA_TRACE(
6177           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6178                gtid));
6179       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6180         while (
6181             !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6182           KMP_CPU_PAUSE();
6183         __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6184       } else {
6185         /* Need release fence here to prevent seg faults for tree forkjoin
6186            barrier (GEH) */
6187         kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6188                            thread);
6189         __kmp_release_64(&flag);
6190       }
6191     }
6192
6193     // Terminate OS thread.
6194     __kmp_reap_worker(thread);
6195
6196     // The thread was killed asynchronously.  If it was actively
6197     // spinning in the thread pool, decrement the global count.
6198     //
6199     // There is a small timing hole here - if the worker thread was just waking
6200     // up after sleeping in the pool, had reset it's th_active_in_pool flag but
6201     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6202     // the global counter might not get updated.
6203     //
6204     // Currently, this can only happen as the library is unloaded,
6205     // so there are no harmful side effects.
6206     if (thread->th.th_active_in_pool) {
6207       thread->th.th_active_in_pool = FALSE;
6208       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6209       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6210     }
6211   }
6212
6213   __kmp_free_implicit_task(thread);
6214
6215 // Free the fast memory for tasking
6216 #if USE_FAST_MEMORY
6217   __kmp_free_fast_memory(thread);
6218 #endif /* USE_FAST_MEMORY */
6219
6220   __kmp_suspend_uninitialize_thread(thread);
6221
6222   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6223   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6224
6225   --__kmp_all_nth;
6226   // __kmp_nth was decremented when thread is added to the pool.
6227
6228 #ifdef KMP_ADJUST_BLOCKTIME
6229   /* Adjust blocktime back to user setting or default if necessary */
6230   /* Middle initialization might never have occurred                */
6231   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6232     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6233     if (__kmp_nth <= __kmp_avail_proc) {
6234       __kmp_zero_bt = FALSE;
6235     }
6236   }
6237 #endif /* KMP_ADJUST_BLOCKTIME */
6238
6239   /* free the memory being used */
6240   if (__kmp_env_consistency_check) {
6241     if (thread->th.th_cons) {
6242       __kmp_free_cons_stack(thread->th.th_cons);
6243       thread->th.th_cons = NULL;
6244     }
6245   }
6246
6247   if (thread->th.th_pri_common != NULL) {
6248     __kmp_free(thread->th.th_pri_common);
6249     thread->th.th_pri_common = NULL;
6250   }
6251
6252   if (thread->th.th_task_state_memo_stack != NULL) {
6253     __kmp_free(thread->th.th_task_state_memo_stack);
6254     thread->th.th_task_state_memo_stack = NULL;
6255   }
6256
6257 #if KMP_USE_BGET
6258   if (thread->th.th_local.bget_data != NULL) {
6259     __kmp_finalize_bget(thread);
6260   }
6261 #endif
6262
6263 #if KMP_AFFINITY_SUPPORTED
6264   if (thread->th.th_affin_mask != NULL) {
6265     KMP_CPU_FREE(thread->th.th_affin_mask);
6266     thread->th.th_affin_mask = NULL;
6267   }
6268 #endif /* KMP_AFFINITY_SUPPORTED */
6269
6270 #if KMP_USE_HIER_SCHED
6271   if (thread->th.th_hier_bar_data != NULL) {
6272     __kmp_free(thread->th.th_hier_bar_data);
6273     thread->th.th_hier_bar_data = NULL;
6274   }
6275 #endif
6276
6277   __kmp_reap_team(thread->th.th_serial_team);
6278   thread->th.th_serial_team = NULL;
6279   __kmp_free(thread);
6280
6281   KMP_MB();
6282
6283 } // __kmp_reap_thread
6284
6285 static void __kmp_itthash_clean(kmp_info_t *th) {
6286 #if USE_ITT_NOTIFY
6287   if (__kmp_itt_region_domains.count > 0) {
6288     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6289       kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6290       while (bucket) {
6291         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6292         __kmp_thread_free(th, bucket);
6293         bucket = next;
6294       }
6295     }
6296   }
6297   if (__kmp_itt_barrier_domains.count > 0) {
6298     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6299       kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6300       while (bucket) {
6301         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6302         __kmp_thread_free(th, bucket);
6303         bucket = next;
6304       }
6305     }
6306   }
6307 #endif
6308 }
6309
6310 static void __kmp_internal_end(void) {
6311   int i;
6312
6313   /* First, unregister the library */
6314   __kmp_unregister_library();
6315
6316 #if KMP_OS_WINDOWS
6317   /* In Win static library, we can't tell when a root actually dies, so we
6318      reclaim the data structures for any root threads that have died but not
6319      unregistered themselves, in order to shut down cleanly.
6320      In Win dynamic library we also can't tell when a thread dies.  */
6321   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6322 // dead roots
6323 #endif
6324
6325   for (i = 0; i < __kmp_threads_capacity; i++)
6326     if (__kmp_root[i])
6327       if (__kmp_root[i]->r.r_active)
6328         break;
6329   KMP_MB(); /* Flush all pending memory write invalidates.  */
6330   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6331
6332   if (i < __kmp_threads_capacity) {
6333 #if KMP_USE_MONITOR
6334     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6335     KMP_MB(); /* Flush all pending memory write invalidates.  */
6336
6337     // Need to check that monitor was initialized before reaping it. If we are
6338     // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6339     // __kmp_monitor will appear to contain valid data, but it is only valid in
6340     // the parent process, not the child.
6341     // New behavior (201008): instead of keying off of the flag
6342     // __kmp_init_parallel, the monitor thread creation is keyed off
6343     // of the new flag __kmp_init_monitor.
6344     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6345     if (TCR_4(__kmp_init_monitor)) {
6346       __kmp_reap_monitor(&__kmp_monitor);
6347       TCW_4(__kmp_init_monitor, 0);
6348     }
6349     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6350     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6351 #endif // KMP_USE_MONITOR
6352   } else {
6353 /* TODO move this to cleanup code */
6354 #ifdef KMP_DEBUG
6355     /* make sure that everything has properly ended */
6356     for (i = 0; i < __kmp_threads_capacity; i++) {
6357       if (__kmp_root[i]) {
6358         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6359         //                    there can be uber threads alive here
6360         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6361       }
6362     }
6363 #endif
6364
6365     KMP_MB();
6366
6367     // Reap the worker threads.
6368     // This is valid for now, but be careful if threads are reaped sooner.
6369     while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6370       // Get the next thread from the pool.
6371       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6372       __kmp_thread_pool = thread->th.th_next_pool;
6373       // Reap it.
6374       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6375       thread->th.th_next_pool = NULL;
6376       thread->th.th_in_pool = FALSE;
6377       __kmp_reap_thread(thread, 0);
6378     }
6379     __kmp_thread_pool_insert_pt = NULL;
6380
6381     // Reap teams.
6382     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6383       // Get the next team from the pool.
6384       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6385       __kmp_team_pool = team->t.t_next_pool;
6386       // Reap it.
6387       team->t.t_next_pool = NULL;
6388       __kmp_reap_team(team);
6389     }
6390
6391     __kmp_reap_task_teams();
6392
6393 #if KMP_OS_UNIX
6394     // Threads that are not reaped should not access any resources since they
6395     // are going to be deallocated soon, so the shutdown sequence should wait
6396     // until all threads either exit the final spin-waiting loop or begin
6397     // sleeping after the given blocktime.
6398     for (i = 0; i < __kmp_threads_capacity; i++) {
6399       kmp_info_t *thr = __kmp_threads[i];
6400       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6401         KMP_CPU_PAUSE();
6402     }
6403 #endif
6404
6405     for (i = 0; i < __kmp_threads_capacity; ++i) {
6406       // TBD: Add some checking...
6407       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6408     }
6409
6410     /* Make sure all threadprivate destructors get run by joining with all
6411        worker threads before resetting this flag */
6412     TCW_SYNC_4(__kmp_init_common, FALSE);
6413
6414     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6415     KMP_MB();
6416
6417 #if KMP_USE_MONITOR
6418     // See note above: One of the possible fixes for CQ138434 / CQ140126
6419     //
6420     // FIXME: push both code fragments down and CSE them?
6421     // push them into __kmp_cleanup() ?
6422     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6423     if (TCR_4(__kmp_init_monitor)) {
6424       __kmp_reap_monitor(&__kmp_monitor);
6425       TCW_4(__kmp_init_monitor, 0);
6426     }
6427     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6428     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6429 #endif
6430   } /* else !__kmp_global.t_active */
6431   TCW_4(__kmp_init_gtid, FALSE);
6432   KMP_MB(); /* Flush all pending memory write invalidates.  */
6433
6434   __kmp_cleanup();
6435 #if OMPT_SUPPORT
6436   ompt_fini();
6437 #endif
6438 }
6439
6440 void __kmp_internal_end_library(int gtid_req) {
6441   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6442   /* this shouldn't be a race condition because __kmp_internal_end() is the
6443      only place to clear __kmp_serial_init */
6444   /* we'll check this later too, after we get the lock */
6445   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6446   // redundant, because the next check will work in any case.
6447   if (__kmp_global.g.g_abort) {
6448     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6449     /* TODO abort? */
6450     return;
6451   }
6452   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6453     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6454     return;
6455   }
6456
6457   // If hidden helper team has been initialized, we need to deinit it
6458   if (TCR_4(__kmp_init_hidden_helper) &&
6459       !TCR_4(__kmp_hidden_helper_team_done)) {
6460     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6461     // First release the main thread to let it continue its work
6462     __kmp_hidden_helper_main_thread_release();
6463     // Wait until the hidden helper team has been destroyed
6464     __kmp_hidden_helper_threads_deinitz_wait();
6465   }
6466
6467   KMP_MB(); /* Flush all pending memory write invalidates.  */
6468   /* find out who we are and what we should do */
6469   {
6470     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6471     KA_TRACE(
6472         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6473     if (gtid == KMP_GTID_SHUTDOWN) {
6474       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6475                     "already shutdown\n"));
6476       return;
6477     } else if (gtid == KMP_GTID_MONITOR) {
6478       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6479                     "registered, or system shutdown\n"));
6480       return;
6481     } else if (gtid == KMP_GTID_DNE) {
6482       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6483                     "shutdown\n"));
6484       /* we don't know who we are, but we may still shutdown the library */
6485     } else if (KMP_UBER_GTID(gtid)) {
6486       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6487       if (__kmp_root[gtid]->r.r_active) {
6488         __kmp_global.g.g_abort = -1;
6489         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6490         __kmp_unregister_library();
6491         KA_TRACE(10,
6492                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6493                   gtid));
6494         return;
6495       } else {
6496         __kmp_itthash_clean(__kmp_threads[gtid]);
6497         KA_TRACE(
6498             10,
6499             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6500         __kmp_unregister_root_current_thread(gtid);
6501       }
6502     } else {
6503 /* worker threads may call this function through the atexit handler, if they
6504  * call exit() */
6505 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6506    TODO: do a thorough shutdown instead */
6507 #ifdef DUMP_DEBUG_ON_EXIT
6508       if (__kmp_debug_buf)
6509         __kmp_dump_debug_buffer();
6510 #endif
6511       // added unregister library call here when we switch to shm linux
6512       // if we don't, it will leave lots of files in /dev/shm
6513       // cleanup shared memory file before exiting.
6514       __kmp_unregister_library();
6515       return;
6516     }
6517   }
6518   /* synchronize the termination process */
6519   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6520
6521   /* have we already finished */
6522   if (__kmp_global.g.g_abort) {
6523     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6524     /* TODO abort? */
6525     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6526     return;
6527   }
6528   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6529     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6530     return;
6531   }
6532
6533   /* We need this lock to enforce mutex between this reading of
6534      __kmp_threads_capacity and the writing by __kmp_register_root.
6535      Alternatively, we can use a counter of roots that is atomically updated by
6536      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6537      __kmp_internal_end_*.  */
6538   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6539
6540   /* now we can safely conduct the actual termination */
6541   __kmp_internal_end();
6542
6543   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6544   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6545
6546   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6547
6548 #ifdef DUMP_DEBUG_ON_EXIT
6549   if (__kmp_debug_buf)
6550     __kmp_dump_debug_buffer();
6551 #endif
6552
6553 #if KMP_OS_WINDOWS
6554   __kmp_close_console();
6555 #endif
6556
6557   __kmp_fini_allocator();
6558
6559 } // __kmp_internal_end_library
6560
6561 void __kmp_internal_end_thread(int gtid_req) {
6562   int i;
6563
6564   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6565   /* this shouldn't be a race condition because __kmp_internal_end() is the
6566    * only place to clear __kmp_serial_init */
6567   /* we'll check this later too, after we get the lock */
6568   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6569   // redundant, because the next check will work in any case.
6570   if (__kmp_global.g.g_abort) {
6571     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6572     /* TODO abort? */
6573     return;
6574   }
6575   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6576     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6577     return;
6578   }
6579
6580   // If hidden helper team has been initialized, we need to deinit it
6581   if (TCR_4(__kmp_init_hidden_helper) &&
6582       !TCR_4(__kmp_hidden_helper_team_done)) {
6583     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6584     // First release the main thread to let it continue its work
6585     __kmp_hidden_helper_main_thread_release();
6586     // Wait until the hidden helper team has been destroyed
6587     __kmp_hidden_helper_threads_deinitz_wait();
6588   }
6589
6590   KMP_MB(); /* Flush all pending memory write invalidates.  */
6591
6592   /* find out who we are and what we should do */
6593   {
6594     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6595     KA_TRACE(10,
6596              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6597     if (gtid == KMP_GTID_SHUTDOWN) {
6598       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6599                     "already shutdown\n"));
6600       return;
6601     } else if (gtid == KMP_GTID_MONITOR) {
6602       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6603                     "registered, or system shutdown\n"));
6604       return;
6605     } else if (gtid == KMP_GTID_DNE) {
6606       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6607                     "shutdown\n"));
6608       return;
6609       /* we don't know who we are */
6610     } else if (KMP_UBER_GTID(gtid)) {
6611       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6612       if (__kmp_root[gtid]->r.r_active) {
6613         __kmp_global.g.g_abort = -1;
6614         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6615         KA_TRACE(10,
6616                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6617                   gtid));
6618         return;
6619       } else {
6620         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6621                       gtid));
6622         __kmp_unregister_root_current_thread(gtid);
6623       }
6624     } else {
6625       /* just a worker thread, let's leave */
6626       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6627
6628       if (gtid >= 0) {
6629         __kmp_threads[gtid]->th.th_task_team = NULL;
6630       }
6631
6632       KA_TRACE(10,
6633                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6634                 gtid));
6635       return;
6636     }
6637   }
6638 #if KMP_DYNAMIC_LIB
6639   if (__kmp_pause_status != kmp_hard_paused)
6640   // AC: lets not shutdown the dynamic library at the exit of uber thread,
6641   // because we will better shutdown later in the library destructor.
6642   {
6643     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6644     return;
6645   }
6646 #endif
6647   /* synchronize the termination process */
6648   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6649
6650   /* have we already finished */
6651   if (__kmp_global.g.g_abort) {
6652     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6653     /* TODO abort? */
6654     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6655     return;
6656   }
6657   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6658     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6659     return;
6660   }
6661
6662   /* We need this lock to enforce mutex between this reading of
6663      __kmp_threads_capacity and the writing by __kmp_register_root.
6664      Alternatively, we can use a counter of roots that is atomically updated by
6665      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6666      __kmp_internal_end_*.  */
6667
6668   /* should we finish the run-time?  are all siblings done? */
6669   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6670
6671   for (i = 0; i < __kmp_threads_capacity; ++i) {
6672     if (KMP_UBER_GTID(i)) {
6673       KA_TRACE(
6674           10,
6675           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6676       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6677       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6678       return;
6679     }
6680   }
6681
6682   /* now we can safely conduct the actual termination */
6683
6684   __kmp_internal_end();
6685
6686   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6687   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6688
6689   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6690
6691 #ifdef DUMP_DEBUG_ON_EXIT
6692   if (__kmp_debug_buf)
6693     __kmp_dump_debug_buffer();
6694 #endif
6695 } // __kmp_internal_end_thread
6696
6697 // -----------------------------------------------------------------------------
6698 // Library registration stuff.
6699
6700 static long __kmp_registration_flag = 0;
6701 // Random value used to indicate library initialization.
6702 static char *__kmp_registration_str = NULL;
6703 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6704
6705 static inline char *__kmp_reg_status_name() {
6706 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6707    each thread. If registration and unregistration go in different threads
6708    (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6709    env var can not be found, because the name will contain different pid. */
6710 // macOS* complains about name being too long with additional getuid()
6711 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6712   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6713                           (int)getuid());
6714 #else
6715   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6716 #endif
6717 } // __kmp_reg_status_get
6718
6719 #if defined(KMP_USE_SHM)
6720 bool __kmp_shm_available = false;
6721 bool __kmp_tmp_available = false;
6722 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6723 char *temp_reg_status_file_name = nullptr;
6724 #endif
6725
6726 void __kmp_register_library_startup(void) {
6727
6728   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6729   int done = 0;
6730   union {
6731     double dtime;
6732     long ltime;
6733   } time;
6734 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6735   __kmp_initialize_system_tick();
6736 #endif
6737   __kmp_read_system_time(&time.dtime);
6738   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6739   __kmp_registration_str =
6740       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6741                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6742
6743   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6744                 __kmp_registration_str));
6745
6746   while (!done) {
6747
6748     char *value = NULL; // Actual value of the environment variable.
6749
6750 #if defined(KMP_USE_SHM)
6751     char *shm_name = nullptr;
6752     char *data1 = nullptr;
6753     __kmp_shm_available = __kmp_detect_shm();
6754     if (__kmp_shm_available) {
6755       int fd1 = -1;
6756       shm_name = __kmp_str_format("/%s", name);
6757       int shm_preexist = 0;
6758       fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6759       if ((fd1 == -1) && (errno == EEXIST)) {
6760         // file didn't open because it already exists.
6761         // try opening existing file
6762         fd1 = shm_open(shm_name, O_RDWR, 0666);
6763         if (fd1 == -1) { // file didn't open
6764           KMP_WARNING(FunctionError, "Can't open SHM");
6765           __kmp_shm_available = false;
6766         } else { // able to open existing file
6767           shm_preexist = 1;
6768         }
6769       }
6770       if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6771         if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6772           KMP_WARNING(FunctionError, "Can't set size of SHM");
6773           __kmp_shm_available = false;
6774         }
6775       }
6776       if (__kmp_shm_available) { // SHM exists, now map it
6777         data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6778                              fd1, 0);
6779         if (data1 == MAP_FAILED) { // failed to map shared memory
6780           KMP_WARNING(FunctionError, "Can't map SHM");
6781           __kmp_shm_available = false;
6782         }
6783       }
6784       if (__kmp_shm_available) { // SHM mapped
6785         if (shm_preexist == 0) { // set data to SHM, set value
6786           KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6787         }
6788         // Read value from either what we just wrote or existing file.
6789         value = __kmp_str_format("%s", data1); // read value from SHM
6790         munmap(data1, SHM_SIZE);
6791       }
6792       if (fd1 != -1)
6793         close(fd1);
6794     }
6795     if (!__kmp_shm_available)
6796       __kmp_tmp_available = __kmp_detect_tmp();
6797     if (!__kmp_shm_available && __kmp_tmp_available) {
6798       // SHM failed to work due to an error other than that the file already
6799       // exists. Try to create a temp file under /tmp.
6800       // If /tmp isn't accessible, fall back to using environment variable.
6801       // TODO: /tmp might not always be the temporary directory. For now we will
6802       // not consider TMPDIR.
6803       int fd1 = -1;
6804       temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6805       int tmp_preexist = 0;
6806       fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6807       if ((fd1 == -1) && (errno == EEXIST)) {
6808         // file didn't open because it already exists.
6809         // try opening existing file
6810         fd1 = open(temp_reg_status_file_name, O_RDWR, 0666);
6811         if (fd1 == -1) { // file didn't open if (fd1 == -1) {
6812           KMP_WARNING(FunctionError, "Can't open TEMP");
6813           __kmp_tmp_available = false;
6814         } else {
6815           tmp_preexist = 1;
6816         }
6817       }
6818       if (__kmp_tmp_available && tmp_preexist == 0) {
6819         // we created /tmp file now set size
6820         if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6821           KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6822           __kmp_tmp_available = false;
6823         }
6824       }
6825       if (__kmp_tmp_available) {
6826         data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6827                              fd1, 0);
6828         if (data1 == MAP_FAILED) { // failed to map /tmp
6829           KMP_WARNING(FunctionError, "Can't map /tmp");
6830           __kmp_tmp_available = false;
6831         }
6832       }
6833       if (__kmp_tmp_available) {
6834         if (tmp_preexist == 0) { // set data to TMP, set value
6835           KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6836         }
6837         // Read value from either what we just wrote or existing file.
6838         value = __kmp_str_format("%s", data1); // read value from SHM
6839         munmap(data1, SHM_SIZE);
6840       }
6841       if (fd1 != -1)
6842         close(fd1);
6843     }
6844     if (!__kmp_shm_available && !__kmp_tmp_available) {
6845       // no /dev/shm and no /tmp -- fall back to environment variable
6846       // Set environment variable, but do not overwrite if it exists.
6847       __kmp_env_set(name, __kmp_registration_str, 0);
6848       // read value to see if it got set
6849       value = __kmp_env_get(name);
6850     }
6851 #else // Windows and unix with static library
6852     // Set environment variable, but do not overwrite if it exists.
6853     __kmp_env_set(name, __kmp_registration_str, 0);
6854     // read value to see if it got set
6855     value = __kmp_env_get(name);
6856 #endif
6857
6858     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6859       done = 1; // Ok, environment variable set successfully, exit the loop.
6860     } else {
6861       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6862       // Check whether it alive or dead.
6863       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6864       char *tail = value;
6865       char *flag_addr_str = NULL;
6866       char *flag_val_str = NULL;
6867       char const *file_name = NULL;
6868       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6869       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6870       file_name = tail;
6871       if (tail != NULL) {
6872         unsigned long *flag_addr = 0;
6873         unsigned long flag_val = 0;
6874         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6875         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6876         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6877           // First, check whether environment-encoded address is mapped into
6878           // addr space.
6879           // If so, dereference it to see if it still has the right value.
6880           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6881             neighbor = 1;
6882           } else {
6883             // If not, then we know the other copy of the library is no longer
6884             // running.
6885             neighbor = 2;
6886           }
6887         }
6888       }
6889       switch (neighbor) {
6890       case 0: // Cannot parse environment variable -- neighbor status unknown.
6891         // Assume it is the incompatible format of future version of the
6892         // library. Assume the other library is alive.
6893         // WARN( ... ); // TODO: Issue a warning.
6894         file_name = "unknown library";
6895         KMP_FALLTHROUGH();
6896       // Attention! Falling to the next case. That's intentional.
6897       case 1: { // Neighbor is alive.
6898         // Check it is allowed.
6899         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6900         if (!__kmp_str_match_true(duplicate_ok)) {
6901           // That's not allowed. Issue fatal error.
6902           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6903                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6904         }
6905         KMP_INTERNAL_FREE(duplicate_ok);
6906         __kmp_duplicate_library_ok = 1;
6907         done = 1; // Exit the loop.
6908       } break;
6909       case 2: { // Neighbor is dead.
6910
6911 #if defined(KMP_USE_SHM)
6912         if (__kmp_shm_available) { // close shared memory.
6913           shm_unlink(shm_name); // this removes file in /dev/shm
6914         } else if (__kmp_tmp_available) {
6915           unlink(temp_reg_status_file_name); // this removes the temp file
6916         } else {
6917           // Clear the variable and try to register library again.
6918           __kmp_env_unset(name);
6919         }
6920 #else
6921         // Clear the variable and try to register library again.
6922         __kmp_env_unset(name);
6923 #endif
6924       } break;
6925       default: {
6926         KMP_DEBUG_ASSERT(0);
6927       } break;
6928       }
6929     }
6930     KMP_INTERNAL_FREE((void *)value);
6931 #if defined(KMP_USE_SHM)
6932     if (shm_name)
6933       KMP_INTERNAL_FREE((void *)shm_name);
6934 #endif
6935   } // while
6936   KMP_INTERNAL_FREE((void *)name);
6937
6938 } // func __kmp_register_library_startup
6939
6940 void __kmp_unregister_library(void) {
6941
6942   char *name = __kmp_reg_status_name();
6943   char *value = NULL;
6944
6945 #if defined(KMP_USE_SHM)
6946   char *shm_name = nullptr;
6947   int fd1;
6948   if (__kmp_shm_available) {
6949     shm_name = __kmp_str_format("/%s", name);
6950     fd1 = shm_open(shm_name, O_RDONLY, 0666);
6951     if (fd1 != -1) { // File opened successfully
6952       char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6953       if (data1 != MAP_FAILED) {
6954         value = __kmp_str_format("%s", data1); // read value from SHM
6955         munmap(data1, SHM_SIZE);
6956       }
6957       close(fd1);
6958     }
6959   } else if (__kmp_tmp_available) { // try /tmp
6960     fd1 = open(temp_reg_status_file_name, O_RDONLY);
6961     if (fd1 != -1) { // File opened successfully
6962       char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6963       if (data1 != MAP_FAILED) {
6964         value = __kmp_str_format("%s", data1); // read value from /tmp
6965         munmap(data1, SHM_SIZE);
6966       }
6967       close(fd1);
6968     }
6969   } else { // fall back to envirable
6970     value = __kmp_env_get(name);
6971   }
6972 #else
6973   value = __kmp_env_get(name);
6974 #endif
6975
6976   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6977   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6978   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6979 //  Ok, this is our variable. Delete it.
6980 #if defined(KMP_USE_SHM)
6981     if (__kmp_shm_available) {
6982       shm_unlink(shm_name); // this removes file in /dev/shm
6983     } else if (__kmp_tmp_available) {
6984       unlink(temp_reg_status_file_name); // this removes the temp file
6985     } else {
6986       __kmp_env_unset(name);
6987     }
6988 #else
6989     __kmp_env_unset(name);
6990 #endif
6991   }
6992
6993 #if defined(KMP_USE_SHM)
6994   if (shm_name)
6995     KMP_INTERNAL_FREE(shm_name);
6996   if (temp_reg_status_file_name)
6997     KMP_INTERNAL_FREE(temp_reg_status_file_name);
6998 #endif
6999
7000   KMP_INTERNAL_FREE(__kmp_registration_str);
7001   KMP_INTERNAL_FREE(value);
7002   KMP_INTERNAL_FREE(name);
7003
7004   __kmp_registration_flag = 0;
7005   __kmp_registration_str = NULL;
7006
7007 } // __kmp_unregister_library
7008
7009 // End of Library registration stuff.
7010 // -----------------------------------------------------------------------------
7011
7012 #if KMP_MIC_SUPPORTED
7013
7014 static void __kmp_check_mic_type() {
7015   kmp_cpuid_t cpuid_state = {0};
7016   kmp_cpuid_t *cs_p = &cpuid_state;
7017   __kmp_x86_cpuid(1, 0, cs_p);
7018   // We don't support mic1 at the moment
7019   if ((cs_p->eax & 0xff0) == 0xB10) {
7020     __kmp_mic_type = mic2;
7021   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7022     __kmp_mic_type = mic3;
7023   } else {
7024     __kmp_mic_type = non_mic;
7025   }
7026 }
7027
7028 #endif /* KMP_MIC_SUPPORTED */
7029
7030 #if KMP_HAVE_UMWAIT
7031 static void __kmp_user_level_mwait_init() {
7032   struct kmp_cpuid buf;
7033   __kmp_x86_cpuid(7, 0, &buf);
7034   __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7035   __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7036   __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7037   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7038                 __kmp_umwait_enabled));
7039 }
7040 #elif KMP_HAVE_MWAIT
7041 #ifndef AT_INTELPHIUSERMWAIT
7042 // Spurious, non-existent value that should always fail to return anything.
7043 // Will be replaced with the correct value when we know that.
7044 #define AT_INTELPHIUSERMWAIT 10000
7045 #endif
7046 // getauxval() function is available in RHEL7 and SLES12. If a system with an
7047 // earlier OS is used to build the RTL, we'll use the following internal
7048 // function when the entry is not found.
7049 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7050 unsigned long getauxval(unsigned long) { return 0; }
7051
7052 static void __kmp_user_level_mwait_init() {
7053   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7054   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7055   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7056   // KMP_USER_LEVEL_MWAIT was set to TRUE.
7057   if (__kmp_mic_type == mic3) {
7058     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7059     if ((res & 0x1) || __kmp_user_level_mwait) {
7060       __kmp_mwait_enabled = TRUE;
7061       if (__kmp_user_level_mwait) {
7062         KMP_INFORM(EnvMwaitWarn);
7063       }
7064     } else {
7065       __kmp_mwait_enabled = FALSE;
7066     }
7067   }
7068   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7069                 "__kmp_mwait_enabled = %d\n",
7070                 __kmp_mic_type, __kmp_mwait_enabled));
7071 }
7072 #endif /* KMP_HAVE_UMWAIT */
7073
7074 static void __kmp_do_serial_initialize(void) {
7075   int i, gtid;
7076   size_t size;
7077
7078   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7079
7080   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7081   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7082   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7083   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7084   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7085
7086 #if OMPT_SUPPORT
7087   ompt_pre_init();
7088 #endif
7089 #if OMPD_SUPPORT
7090   __kmp_env_dump();
7091   ompd_init();
7092 #endif
7093
7094   __kmp_validate_locks();
7095
7096 #if ENABLE_LIBOMPTARGET
7097   /* Initialize functions from libomptarget */
7098   __kmp_init_omptarget();
7099 #endif
7100
7101   /* Initialize internal memory allocator */
7102   __kmp_init_allocator();
7103
7104   /* Register the library startup via an environment variable or via mapped
7105      shared memory file and check to see whether another copy of the library is
7106      already registered. Since forked child process is often terminated, we
7107      postpone the registration till middle initialization in the child */
7108   if (__kmp_need_register_serial)
7109     __kmp_register_library_startup();
7110
7111   /* TODO reinitialization of library */
7112   if (TCR_4(__kmp_global.g.g_done)) {
7113     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7114   }
7115
7116   __kmp_global.g.g_abort = 0;
7117   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7118
7119 /* initialize the locks */
7120 #if KMP_USE_ADAPTIVE_LOCKS
7121 #if KMP_DEBUG_ADAPTIVE_LOCKS
7122   __kmp_init_speculative_stats();
7123 #endif
7124 #endif
7125 #if KMP_STATS_ENABLED
7126   __kmp_stats_init();
7127 #endif
7128   __kmp_init_lock(&__kmp_global_lock);
7129   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7130   __kmp_init_lock(&__kmp_debug_lock);
7131   __kmp_init_atomic_lock(&__kmp_atomic_lock);
7132   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7133   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7134   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7135   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7136   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7137   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7138   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7139   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7140   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7141   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7142   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7143   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7144   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7145   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7146 #if KMP_USE_MONITOR
7147   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7148 #endif
7149   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7150
7151   /* conduct initialization and initial setup of configuration */
7152
7153   __kmp_runtime_initialize();
7154
7155 #if KMP_MIC_SUPPORTED
7156   __kmp_check_mic_type();
7157 #endif
7158
7159 // Some global variable initialization moved here from kmp_env_initialize()
7160 #ifdef KMP_DEBUG
7161   kmp_diag = 0;
7162 #endif
7163   __kmp_abort_delay = 0;
7164
7165   // From __kmp_init_dflt_team_nth()
7166   /* assume the entire machine will be used */
7167   __kmp_dflt_team_nth_ub = __kmp_xproc;
7168   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7169     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7170   }
7171   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7172     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7173   }
7174   __kmp_max_nth = __kmp_sys_max_nth;
7175   __kmp_cg_max_nth = __kmp_sys_max_nth;
7176   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7177   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7178     __kmp_teams_max_nth = __kmp_sys_max_nth;
7179   }
7180
7181   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7182   // part
7183   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7184 #if KMP_USE_MONITOR
7185   __kmp_monitor_wakeups =
7186       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7187   __kmp_bt_intervals =
7188       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7189 #endif
7190   // From "KMP_LIBRARY" part of __kmp_env_initialize()
7191   __kmp_library = library_throughput;
7192   // From KMP_SCHEDULE initialization
7193   __kmp_static = kmp_sch_static_balanced;
7194 // AC: do not use analytical here, because it is non-monotonous
7195 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7196 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7197 // need to repeat assignment
7198 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7199 // bit control and barrier method control parts
7200 #if KMP_FAST_REDUCTION_BARRIER
7201 #define kmp_reduction_barrier_gather_bb ((int)1)
7202 #define kmp_reduction_barrier_release_bb ((int)1)
7203 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7204 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7205 #endif // KMP_FAST_REDUCTION_BARRIER
7206   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7207     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7208     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7209     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7210     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7211 #if KMP_FAST_REDUCTION_BARRIER
7212     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7213       // lin_64 ): hyper,1
7214       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7215       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7216       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7217       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7218     }
7219 #endif // KMP_FAST_REDUCTION_BARRIER
7220   }
7221 #if KMP_FAST_REDUCTION_BARRIER
7222 #undef kmp_reduction_barrier_release_pat
7223 #undef kmp_reduction_barrier_gather_pat
7224 #undef kmp_reduction_barrier_release_bb
7225 #undef kmp_reduction_barrier_gather_bb
7226 #endif // KMP_FAST_REDUCTION_BARRIER
7227 #if KMP_MIC_SUPPORTED
7228   if (__kmp_mic_type == mic2) { // KNC
7229     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7230     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7231     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7232         1; // forkjoin release
7233     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7234     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7235   }
7236 #if KMP_FAST_REDUCTION_BARRIER
7237   if (__kmp_mic_type == mic2) { // KNC
7238     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7239     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7240   }
7241 #endif // KMP_FAST_REDUCTION_BARRIER
7242 #endif // KMP_MIC_SUPPORTED
7243
7244 // From KMP_CHECKS initialization
7245 #ifdef KMP_DEBUG
7246   __kmp_env_checks = TRUE; /* development versions have the extra checks */
7247 #else
7248   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7249 #endif
7250
7251   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7252   __kmp_foreign_tp = TRUE;
7253
7254   __kmp_global.g.g_dynamic = FALSE;
7255   __kmp_global.g.g_dynamic_mode = dynamic_default;
7256
7257   __kmp_init_nesting_mode();
7258
7259   __kmp_env_initialize(NULL);
7260
7261 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7262   __kmp_user_level_mwait_init();
7263 #endif
7264 // Print all messages in message catalog for testing purposes.
7265 #ifdef KMP_DEBUG
7266   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7267   if (__kmp_str_match_true(val)) {
7268     kmp_str_buf_t buffer;
7269     __kmp_str_buf_init(&buffer);
7270     __kmp_i18n_dump_catalog(&buffer);
7271     __kmp_printf("%s", buffer.str);
7272     __kmp_str_buf_free(&buffer);
7273   }
7274   __kmp_env_free(&val);
7275 #endif
7276
7277   __kmp_threads_capacity =
7278       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7279   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7280   __kmp_tp_capacity = __kmp_default_tp_capacity(
7281       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7282
7283   // If the library is shut down properly, both pools must be NULL. Just in
7284   // case, set them to NULL -- some memory may leak, but subsequent code will
7285   // work even if pools are not freed.
7286   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7287   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7288   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7289   __kmp_thread_pool = NULL;
7290   __kmp_thread_pool_insert_pt = NULL;
7291   __kmp_team_pool = NULL;
7292
7293   /* Allocate all of the variable sized records */
7294   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7295    * expandable */
7296   /* Since allocation is cache-aligned, just add extra padding at the end */
7297   size =
7298       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7299       CACHE_LINE;
7300   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7301   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7302                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
7303
7304   /* init thread counts */
7305   KMP_DEBUG_ASSERT(__kmp_all_nth ==
7306                    0); // Asserts fail if the library is reinitializing and
7307   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7308   __kmp_all_nth = 0;
7309   __kmp_nth = 0;
7310
7311   /* setup the uber master thread and hierarchy */
7312   gtid = __kmp_register_root(TRUE);
7313   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
7314   KMP_ASSERT(KMP_UBER_GTID(gtid));
7315   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7316
7317   KMP_MB(); /* Flush all pending memory write invalidates.  */
7318
7319   __kmp_common_initialize();
7320
7321 #if KMP_OS_UNIX
7322   /* invoke the child fork handler */
7323   __kmp_register_atfork();
7324 #endif
7325
7326 #if !KMP_DYNAMIC_LIB ||                                                        \
7327     ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7328   {
7329     /* Invoke the exit handler when the program finishes, only for static
7330        library and macOS* dynamic. For other dynamic libraries, we already
7331        have _fini and DllMain. */
7332     int rc = atexit(__kmp_internal_end_atexit);
7333     if (rc != 0) {
7334       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7335                   __kmp_msg_null);
7336     }
7337   }
7338 #endif
7339
7340 #if KMP_HANDLE_SIGNALS
7341 #if KMP_OS_UNIX
7342   /* NOTE: make sure that this is called before the user installs their own
7343      signal handlers so that the user handlers are called first. this way they
7344      can return false, not call our handler, avoid terminating the library, and
7345      continue execution where they left off. */
7346   __kmp_install_signals(FALSE);
7347 #endif /* KMP_OS_UNIX */
7348 #if KMP_OS_WINDOWS
7349   __kmp_install_signals(TRUE);
7350 #endif /* KMP_OS_WINDOWS */
7351 #endif
7352
7353   /* we have finished the serial initialization */
7354   __kmp_init_counter++;
7355
7356   __kmp_init_serial = TRUE;
7357
7358   if (__kmp_version) {
7359     __kmp_print_version_1();
7360   }
7361
7362   if (__kmp_settings) {
7363     __kmp_env_print();
7364   }
7365
7366   if (__kmp_display_env || __kmp_display_env_verbose) {
7367     __kmp_env_print_2();
7368   }
7369
7370 #if OMPT_SUPPORT
7371   ompt_post_init();
7372 #endif
7373
7374   KMP_MB();
7375
7376   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7377 }
7378
7379 void __kmp_serial_initialize(void) {
7380   if (__kmp_init_serial) {
7381     return;
7382   }
7383   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7384   if (__kmp_init_serial) {
7385     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7386     return;
7387   }
7388   __kmp_do_serial_initialize();
7389   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7390 }
7391
7392 static void __kmp_do_middle_initialize(void) {
7393   int i, j;
7394   int prev_dflt_team_nth;
7395
7396   if (!__kmp_init_serial) {
7397     __kmp_do_serial_initialize();
7398   }
7399
7400   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7401
7402   if (UNLIKELY(!__kmp_need_register_serial)) {
7403     // We are in a forked child process. The registration was skipped during
7404     // serial initialization in __kmp_atfork_child handler. Do it here.
7405     __kmp_register_library_startup();
7406   }
7407
7408   // Save the previous value for the __kmp_dflt_team_nth so that
7409   // we can avoid some reinitialization if it hasn't changed.
7410   prev_dflt_team_nth = __kmp_dflt_team_nth;
7411
7412 #if KMP_AFFINITY_SUPPORTED
7413   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7414   // number of cores on the machine.
7415   __kmp_affinity_initialize(__kmp_affinity);
7416
7417 #endif /* KMP_AFFINITY_SUPPORTED */
7418
7419   KMP_ASSERT(__kmp_xproc > 0);
7420   if (__kmp_avail_proc == 0) {
7421     __kmp_avail_proc = __kmp_xproc;
7422   }
7423
7424   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7425   // correct them now
7426   j = 0;
7427   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7428     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7429         __kmp_avail_proc;
7430     j++;
7431   }
7432
7433   if (__kmp_dflt_team_nth == 0) {
7434 #ifdef KMP_DFLT_NTH_CORES
7435     // Default #threads = #cores
7436     __kmp_dflt_team_nth = __kmp_ncores;
7437     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7438                   "__kmp_ncores (%d)\n",
7439                   __kmp_dflt_team_nth));
7440 #else
7441     // Default #threads = #available OS procs
7442     __kmp_dflt_team_nth = __kmp_avail_proc;
7443     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7444                   "__kmp_avail_proc(%d)\n",
7445                   __kmp_dflt_team_nth));
7446 #endif /* KMP_DFLT_NTH_CORES */
7447   }
7448
7449   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7450     __kmp_dflt_team_nth = KMP_MIN_NTH;
7451   }
7452   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7453     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7454   }
7455
7456   if (__kmp_nesting_mode > 0)
7457     __kmp_set_nesting_mode_threads();
7458
7459   // There's no harm in continuing if the following check fails,
7460   // but it indicates an error in the previous logic.
7461   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7462
7463   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7464     // Run through the __kmp_threads array and set the num threads icv for each
7465     // root thread that is currently registered with the RTL (which has not
7466     // already explicitly set its nthreads-var with a call to
7467     // omp_set_num_threads()).
7468     for (i = 0; i < __kmp_threads_capacity; i++) {
7469       kmp_info_t *thread = __kmp_threads[i];
7470       if (thread == NULL)
7471         continue;
7472       if (thread->th.th_current_task->td_icvs.nproc != 0)
7473         continue;
7474
7475       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7476     }
7477   }
7478   KA_TRACE(
7479       20,
7480       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7481        __kmp_dflt_team_nth));
7482
7483 #ifdef KMP_ADJUST_BLOCKTIME
7484   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7485   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7486     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7487     if (__kmp_nth > __kmp_avail_proc) {
7488       __kmp_zero_bt = TRUE;
7489     }
7490   }
7491 #endif /* KMP_ADJUST_BLOCKTIME */
7492
7493   /* we have finished middle initialization */
7494   TCW_SYNC_4(__kmp_init_middle, TRUE);
7495
7496   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7497 }
7498
7499 void __kmp_middle_initialize(void) {
7500   if (__kmp_init_middle) {
7501     return;
7502   }
7503   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7504   if (__kmp_init_middle) {
7505     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7506     return;
7507   }
7508   __kmp_do_middle_initialize();
7509   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7510 }
7511
7512 void __kmp_parallel_initialize(void) {
7513   int gtid = __kmp_entry_gtid(); // this might be a new root
7514
7515   /* synchronize parallel initialization (for sibling) */
7516   if (TCR_4(__kmp_init_parallel))
7517     return;
7518   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7519   if (TCR_4(__kmp_init_parallel)) {
7520     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7521     return;
7522   }
7523
7524   /* TODO reinitialization after we have already shut down */
7525   if (TCR_4(__kmp_global.g.g_done)) {
7526     KA_TRACE(
7527         10,
7528         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7529     __kmp_infinite_loop();
7530   }
7531
7532   /* jc: The lock __kmp_initz_lock is already held, so calling
7533      __kmp_serial_initialize would cause a deadlock.  So we call
7534      __kmp_do_serial_initialize directly. */
7535   if (!__kmp_init_middle) {
7536     __kmp_do_middle_initialize();
7537   }
7538   __kmp_assign_root_init_mask();
7539   __kmp_resume_if_hard_paused();
7540
7541   /* begin initialization */
7542   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7543   KMP_ASSERT(KMP_UBER_GTID(gtid));
7544
7545 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7546   // Save the FP control regs.
7547   // Worker threads will set theirs to these values at thread startup.
7548   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7549   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7550   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7551 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7552
7553 #if KMP_OS_UNIX
7554 #if KMP_HANDLE_SIGNALS
7555   /*  must be after __kmp_serial_initialize  */
7556   __kmp_install_signals(TRUE);
7557 #endif
7558 #endif
7559
7560   __kmp_suspend_initialize();
7561
7562 #if defined(USE_LOAD_BALANCE)
7563   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7564     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7565   }
7566 #else
7567   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7568     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7569   }
7570 #endif
7571
7572   if (__kmp_version) {
7573     __kmp_print_version_2();
7574   }
7575
7576   /* we have finished parallel initialization */
7577   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7578
7579   KMP_MB();
7580   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7581
7582   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7583 }
7584
7585 void __kmp_hidden_helper_initialize() {
7586   if (TCR_4(__kmp_init_hidden_helper))
7587     return;
7588
7589   // __kmp_parallel_initialize is required before we initialize hidden helper
7590   if (!TCR_4(__kmp_init_parallel))
7591     __kmp_parallel_initialize();
7592
7593   // Double check. Note that this double check should not be placed before
7594   // __kmp_parallel_initialize as it will cause dead lock.
7595   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7596   if (TCR_4(__kmp_init_hidden_helper)) {
7597     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7598     return;
7599   }
7600
7601 #if KMP_AFFINITY_SUPPORTED
7602   // Initialize hidden helper affinity settings.
7603   // The above __kmp_parallel_initialize() will initialize
7604   // regular affinity (and topology) if not already done.
7605   if (!__kmp_hh_affinity.flags.initialized)
7606     __kmp_affinity_initialize(__kmp_hh_affinity);
7607 #endif
7608
7609   // Set the count of hidden helper tasks to be executed to zero
7610   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7611
7612   // Set the global variable indicating that we're initializing hidden helper
7613   // team/threads
7614   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7615
7616   // Platform independent initialization
7617   __kmp_do_initialize_hidden_helper_threads();
7618
7619   // Wait here for the finish of initialization of hidden helper teams
7620   __kmp_hidden_helper_threads_initz_wait();
7621
7622   // We have finished hidden helper initialization
7623   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7624
7625   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7626 }
7627
7628 /* ------------------------------------------------------------------------ */
7629
7630 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7631                                    kmp_team_t *team) {
7632   kmp_disp_t *dispatch;
7633
7634   KMP_MB();
7635
7636   /* none of the threads have encountered any constructs, yet. */
7637   this_thr->th.th_local.this_construct = 0;
7638 #if KMP_CACHE_MANAGE
7639   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7640 #endif /* KMP_CACHE_MANAGE */
7641   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7642   KMP_DEBUG_ASSERT(dispatch);
7643   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7644   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7645   // this_thr->th.th_info.ds.ds_tid ] );
7646
7647   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7648   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7649   if (__kmp_env_consistency_check)
7650     __kmp_push_parallel(gtid, team->t.t_ident);
7651
7652   KMP_MB(); /* Flush all pending memory write invalidates.  */
7653 }
7654
7655 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7656                                   kmp_team_t *team) {
7657   if (__kmp_env_consistency_check)
7658     __kmp_pop_parallel(gtid, team->t.t_ident);
7659
7660   __kmp_finish_implicit_task(this_thr);
7661 }
7662
7663 int __kmp_invoke_task_func(int gtid) {
7664   int rc;
7665   int tid = __kmp_tid_from_gtid(gtid);
7666   kmp_info_t *this_thr = __kmp_threads[gtid];
7667   kmp_team_t *team = this_thr->th.th_team;
7668
7669   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7670 #if USE_ITT_BUILD
7671   if (__itt_stack_caller_create_ptr) {
7672     // inform ittnotify about entering user's code
7673     if (team->t.t_stack_id != NULL) {
7674       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7675     } else {
7676       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7677       __kmp_itt_stack_callee_enter(
7678           (__itt_caller)team->t.t_parent->t.t_stack_id);
7679     }
7680   }
7681 #endif /* USE_ITT_BUILD */
7682 #if INCLUDE_SSC_MARKS
7683   SSC_MARK_INVOKING();
7684 #endif
7685
7686 #if OMPT_SUPPORT
7687   void *dummy;
7688   void **exit_frame_p;
7689   ompt_data_t *my_task_data;
7690   ompt_data_t *my_parallel_data;
7691   int ompt_team_size;
7692
7693   if (ompt_enabled.enabled) {
7694     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7695                          .ompt_task_info.frame.exit_frame.ptr);
7696   } else {
7697     exit_frame_p = &dummy;
7698   }
7699
7700   my_task_data =
7701       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7702   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7703   if (ompt_enabled.ompt_callback_implicit_task) {
7704     ompt_team_size = team->t.t_nproc;
7705     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7706         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7707         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7708     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7709   }
7710 #endif
7711
7712 #if KMP_STATS_ENABLED
7713   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7714   if (previous_state == stats_state_e::TEAMS_REGION) {
7715     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7716   } else {
7717     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7718   }
7719   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7720 #endif
7721
7722   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7723                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7724 #if OMPT_SUPPORT
7725                               ,
7726                               exit_frame_p
7727 #endif
7728   );
7729 #if OMPT_SUPPORT
7730   *exit_frame_p = NULL;
7731   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7732 #endif
7733
7734 #if KMP_STATS_ENABLED
7735   if (previous_state == stats_state_e::TEAMS_REGION) {
7736     KMP_SET_THREAD_STATE(previous_state);
7737   }
7738   KMP_POP_PARTITIONED_TIMER();
7739 #endif
7740
7741 #if USE_ITT_BUILD
7742   if (__itt_stack_caller_create_ptr) {
7743     // inform ittnotify about leaving user's code
7744     if (team->t.t_stack_id != NULL) {
7745       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7746     } else {
7747       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7748       __kmp_itt_stack_callee_leave(
7749           (__itt_caller)team->t.t_parent->t.t_stack_id);
7750     }
7751   }
7752 #endif /* USE_ITT_BUILD */
7753   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7754
7755   return rc;
7756 }
7757
7758 void __kmp_teams_master(int gtid) {
7759   // This routine is called by all primary threads in teams construct
7760   kmp_info_t *thr = __kmp_threads[gtid];
7761   kmp_team_t *team = thr->th.th_team;
7762   ident_t *loc = team->t.t_ident;
7763   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7764   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7765   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7766   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7767                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7768
7769   // This thread is a new CG root.  Set up the proper variables.
7770   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7771   tmp->cg_root = thr; // Make thr the CG root
7772   // Init to thread limit stored when league primary threads were forked
7773   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7774   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7775   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7776                  " cg_nthreads to 1\n",
7777                  thr, tmp));
7778   tmp->up = thr->th.th_cg_roots;
7779   thr->th.th_cg_roots = tmp;
7780
7781 // Launch league of teams now, but not let workers execute
7782 // (they hang on fork barrier until next parallel)
7783 #if INCLUDE_SSC_MARKS
7784   SSC_MARK_FORKING();
7785 #endif
7786   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7787                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7788                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7789 #if INCLUDE_SSC_MARKS
7790   SSC_MARK_JOINING();
7791 #endif
7792   // If the team size was reduced from the limit, set it to the new size
7793   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7794     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7795   // AC: last parameter "1" eliminates join barrier which won't work because
7796   // worker threads are in a fork barrier waiting for more parallel regions
7797   __kmp_join_call(loc, gtid
7798 #if OMPT_SUPPORT
7799                   ,
7800                   fork_context_intel
7801 #endif
7802                   ,
7803                   1);
7804 }
7805
7806 int __kmp_invoke_teams_master(int gtid) {
7807   kmp_info_t *this_thr = __kmp_threads[gtid];
7808   kmp_team_t *team = this_thr->th.th_team;
7809 #if KMP_DEBUG
7810   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7811     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7812                      (void *)__kmp_teams_master);
7813 #endif
7814   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7815 #if OMPT_SUPPORT
7816   int tid = __kmp_tid_from_gtid(gtid);
7817   ompt_data_t *task_data =
7818       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7819   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7820   if (ompt_enabled.ompt_callback_implicit_task) {
7821     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7822         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7823         ompt_task_initial);
7824     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7825   }
7826 #endif
7827   __kmp_teams_master(gtid);
7828 #if OMPT_SUPPORT
7829   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7830 #endif
7831   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7832   return 1;
7833 }
7834
7835 /* this sets the requested number of threads for the next parallel region
7836    encountered by this team. since this should be enclosed in the forkjoin
7837    critical section it should avoid race conditions with asymmetrical nested
7838    parallelism */
7839
7840 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7841   kmp_info_t *thr = __kmp_threads[gtid];
7842
7843   if (num_threads > 0)
7844     thr->th.th_set_nproc = num_threads;
7845 }
7846
7847 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7848                                     int num_threads) {
7849   KMP_DEBUG_ASSERT(thr);
7850   // Remember the number of threads for inner parallel regions
7851   if (!TCR_4(__kmp_init_middle))
7852     __kmp_middle_initialize(); // get internal globals calculated
7853   __kmp_assign_root_init_mask();
7854   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7855   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7856
7857   if (num_threads == 0) {
7858     if (__kmp_teams_thread_limit > 0) {
7859       num_threads = __kmp_teams_thread_limit;
7860     } else {
7861       num_threads = __kmp_avail_proc / num_teams;
7862     }
7863     // adjust num_threads w/o warning as it is not user setting
7864     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7865     // no thread_limit clause specified -  do not change thread-limit-var ICV
7866     if (num_threads > __kmp_dflt_team_nth) {
7867       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7868     }
7869     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7870       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7871     } // prevent team size to exceed thread-limit-var
7872     if (num_teams * num_threads > __kmp_teams_max_nth) {
7873       num_threads = __kmp_teams_max_nth / num_teams;
7874     }
7875     if (num_threads == 0) {
7876       num_threads = 1;
7877     }
7878   } else {
7879     if (num_threads < 0) {
7880       __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7881                 __kmp_msg_null);
7882       num_threads = 1;
7883     }
7884     // This thread will be the primary thread of the league primary threads
7885     // Store new thread limit; old limit is saved in th_cg_roots list
7886     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7887     // num_threads = min(num_threads, nthreads-var)
7888     if (num_threads > __kmp_dflt_team_nth) {
7889       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7890     }
7891     if (num_teams * num_threads > __kmp_teams_max_nth) {
7892       int new_threads = __kmp_teams_max_nth / num_teams;
7893       if (new_threads == 0) {
7894         new_threads = 1;
7895       }
7896       if (new_threads != num_threads) {
7897         if (!__kmp_reserve_warn) { // user asked for too many threads
7898           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7899           __kmp_msg(kmp_ms_warning,
7900                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7901                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7902         }
7903       }
7904       num_threads = new_threads;
7905     }
7906   }
7907   thr->th.th_teams_size.nth = num_threads;
7908 }
7909
7910 /* this sets the requested number of teams for the teams region and/or
7911    the number of threads for the next parallel region encountered  */
7912 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7913                           int num_threads) {
7914   kmp_info_t *thr = __kmp_threads[gtid];
7915   if (num_teams < 0) {
7916     // OpenMP specification requires requested values to be positive,
7917     // but people can send us any value, so we'd better check
7918     __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7919               __kmp_msg_null);
7920     num_teams = 1;
7921   }
7922   if (num_teams == 0) {
7923     if (__kmp_nteams > 0) {
7924       num_teams = __kmp_nteams;
7925     } else {
7926       num_teams = 1; // default number of teams is 1.
7927     }
7928   }
7929   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7930     if (!__kmp_reserve_warn) {
7931       __kmp_reserve_warn = 1;
7932       __kmp_msg(kmp_ms_warning,
7933                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7934                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7935     }
7936     num_teams = __kmp_teams_max_nth;
7937   }
7938   // Set number of teams (number of threads in the outer "parallel" of the
7939   // teams)
7940   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7941
7942   __kmp_push_thread_limit(thr, num_teams, num_threads);
7943 }
7944
7945 /* This sets the requested number of teams for the teams region and/or
7946    the number of threads for the next parallel region encountered  */
7947 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7948                              int num_teams_ub, int num_threads) {
7949   kmp_info_t *thr = __kmp_threads[gtid];
7950   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7951   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7952   KMP_DEBUG_ASSERT(num_threads >= 0);
7953
7954   if (num_teams_lb > num_teams_ub) {
7955     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7956                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7957   }
7958
7959   int num_teams = 1; // defalt number of teams is 1.
7960
7961   if (num_teams_lb == 0 && num_teams_ub > 0)
7962     num_teams_lb = num_teams_ub;
7963
7964   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7965     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7966     if (num_teams > __kmp_teams_max_nth) {
7967       if (!__kmp_reserve_warn) {
7968         __kmp_reserve_warn = 1;
7969         __kmp_msg(kmp_ms_warning,
7970                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7971                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7972       }
7973       num_teams = __kmp_teams_max_nth;
7974     }
7975   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7976     num_teams = num_teams_ub;
7977   } else { // num_teams_lb <= num_teams <= num_teams_ub
7978     if (num_threads <= 0) {
7979       if (num_teams_ub > __kmp_teams_max_nth) {
7980         num_teams = num_teams_lb;
7981       } else {
7982         num_teams = num_teams_ub;
7983       }
7984     } else {
7985       num_teams = (num_threads > __kmp_teams_max_nth)
7986                       ? num_teams
7987                       : __kmp_teams_max_nth / num_threads;
7988       if (num_teams < num_teams_lb) {
7989         num_teams = num_teams_lb;
7990       } else if (num_teams > num_teams_ub) {
7991         num_teams = num_teams_ub;
7992       }
7993     }
7994   }
7995   // Set number of teams (number of threads in the outer "parallel" of the
7996   // teams)
7997   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7998
7999   __kmp_push_thread_limit(thr, num_teams, num_threads);
8000 }
8001
8002 // Set the proc_bind var to use in the following parallel region.
8003 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8004   kmp_info_t *thr = __kmp_threads[gtid];
8005   thr->th.th_set_proc_bind = proc_bind;
8006 }
8007
8008 /* Launch the worker threads into the microtask. */
8009
8010 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8011   kmp_info_t *this_thr = __kmp_threads[gtid];
8012
8013 #ifdef KMP_DEBUG
8014   int f;
8015 #endif /* KMP_DEBUG */
8016
8017   KMP_DEBUG_ASSERT(team);
8018   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8019   KMP_ASSERT(KMP_MASTER_GTID(gtid));
8020   KMP_MB(); /* Flush all pending memory write invalidates.  */
8021
8022   team->t.t_construct = 0; /* no single directives seen yet */
8023   team->t.t_ordered.dt.t_value =
8024       0; /* thread 0 enters the ordered section first */
8025
8026   /* Reset the identifiers on the dispatch buffer */
8027   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8028   if (team->t.t_max_nproc > 1) {
8029     int i;
8030     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8031       team->t.t_disp_buffer[i].buffer_index = i;
8032       team->t.t_disp_buffer[i].doacross_buf_idx = i;
8033     }
8034   } else {
8035     team->t.t_disp_buffer[0].buffer_index = 0;
8036     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8037   }
8038
8039   KMP_MB(); /* Flush all pending memory write invalidates.  */
8040   KMP_ASSERT(this_thr->th.th_team == team);
8041
8042 #ifdef KMP_DEBUG
8043   for (f = 0; f < team->t.t_nproc; f++) {
8044     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8045                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8046   }
8047 #endif /* KMP_DEBUG */
8048
8049   /* release the worker threads so they may begin working */
8050   __kmp_fork_barrier(gtid, 0);
8051 }
8052
8053 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8054   kmp_info_t *this_thr = __kmp_threads[gtid];
8055
8056   KMP_DEBUG_ASSERT(team);
8057   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8058   KMP_ASSERT(KMP_MASTER_GTID(gtid));
8059   KMP_MB(); /* Flush all pending memory write invalidates.  */
8060
8061   /* Join barrier after fork */
8062
8063 #ifdef KMP_DEBUG
8064   if (__kmp_threads[gtid] &&
8065       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8066     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8067                  __kmp_threads[gtid]);
8068     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8069                  "team->t.t_nproc=%d\n",
8070                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8071                  team->t.t_nproc);
8072     __kmp_print_structure();
8073   }
8074   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8075                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8076 #endif /* KMP_DEBUG */
8077
8078   __kmp_join_barrier(gtid); /* wait for everyone */
8079 #if OMPT_SUPPORT
8080   if (ompt_enabled.enabled &&
8081       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8082     int ds_tid = this_thr->th.th_info.ds.ds_tid;
8083     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8084     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8085 #if OMPT_OPTIONAL
8086     void *codeptr = NULL;
8087     if (KMP_MASTER_TID(ds_tid) &&
8088         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8089          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8090       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8091
8092     if (ompt_enabled.ompt_callback_sync_region_wait) {
8093       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8094           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8095           codeptr);
8096     }
8097     if (ompt_enabled.ompt_callback_sync_region) {
8098       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8099           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8100           codeptr);
8101     }
8102 #endif
8103     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8104       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8105           ompt_scope_end, NULL, task_data, 0, ds_tid,
8106           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8107     }
8108   }
8109 #endif
8110
8111   KMP_MB(); /* Flush all pending memory write invalidates.  */
8112   KMP_ASSERT(this_thr->th.th_team == team);
8113 }
8114
8115 /* ------------------------------------------------------------------------ */
8116
8117 #ifdef USE_LOAD_BALANCE
8118
8119 // Return the worker threads actively spinning in the hot team, if we
8120 // are at the outermost level of parallelism.  Otherwise, return 0.
8121 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8122   int i;
8123   int retval;
8124   kmp_team_t *hot_team;
8125
8126   if (root->r.r_active) {
8127     return 0;
8128   }
8129   hot_team = root->r.r_hot_team;
8130   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8131     return hot_team->t.t_nproc - 1; // Don't count primary thread
8132   }
8133
8134   // Skip the primary thread - it is accounted for elsewhere.
8135   retval = 0;
8136   for (i = 1; i < hot_team->t.t_nproc; i++) {
8137     if (hot_team->t.t_threads[i]->th.th_active) {
8138       retval++;
8139     }
8140   }
8141   return retval;
8142 }
8143
8144 // Perform an automatic adjustment to the number of
8145 // threads used by the next parallel region.
8146 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8147   int retval;
8148   int pool_active;
8149   int hot_team_active;
8150   int team_curr_active;
8151   int system_active;
8152
8153   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8154                 set_nproc));
8155   KMP_DEBUG_ASSERT(root);
8156   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8157                        ->th.th_current_task->td_icvs.dynamic == TRUE);
8158   KMP_DEBUG_ASSERT(set_nproc > 1);
8159
8160   if (set_nproc == 1) {
8161     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8162     return 1;
8163   }
8164
8165   // Threads that are active in the thread pool, active in the hot team for this
8166   // particular root (if we are at the outer par level), and the currently
8167   // executing thread (to become the primary thread) are available to add to the
8168   // new team, but are currently contributing to the system load, and must be
8169   // accounted for.
8170   pool_active = __kmp_thread_pool_active_nth;
8171   hot_team_active = __kmp_active_hot_team_nproc(root);
8172   team_curr_active = pool_active + hot_team_active + 1;
8173
8174   // Check the system load.
8175   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8176   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8177                 "hot team active = %d\n",
8178                 system_active, pool_active, hot_team_active));
8179
8180   if (system_active < 0) {
8181     // There was an error reading the necessary info from /proc, so use the
8182     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8183     // = dynamic_thread_limit, we shouldn't wind up getting back here.
8184     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8185     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8186
8187     // Make this call behave like the thread limit algorithm.
8188     retval = __kmp_avail_proc - __kmp_nth +
8189              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8190     if (retval > set_nproc) {
8191       retval = set_nproc;
8192     }
8193     if (retval < KMP_MIN_NTH) {
8194       retval = KMP_MIN_NTH;
8195     }
8196
8197     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8198                   retval));
8199     return retval;
8200   }
8201
8202   // There is a slight delay in the load balance algorithm in detecting new
8203   // running procs. The real system load at this instant should be at least as
8204   // large as the #active omp thread that are available to add to the team.
8205   if (system_active < team_curr_active) {
8206     system_active = team_curr_active;
8207   }
8208   retval = __kmp_avail_proc - system_active + team_curr_active;
8209   if (retval > set_nproc) {
8210     retval = set_nproc;
8211   }
8212   if (retval < KMP_MIN_NTH) {
8213     retval = KMP_MIN_NTH;
8214   }
8215
8216   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8217   return retval;
8218 } // __kmp_load_balance_nproc()
8219
8220 #endif /* USE_LOAD_BALANCE */
8221
8222 /* ------------------------------------------------------------------------ */
8223
8224 /* NOTE: this is called with the __kmp_init_lock held */
8225 void __kmp_cleanup(void) {
8226   int f;
8227
8228   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8229
8230   if (TCR_4(__kmp_init_parallel)) {
8231 #if KMP_HANDLE_SIGNALS
8232     __kmp_remove_signals();
8233 #endif
8234     TCW_4(__kmp_init_parallel, FALSE);
8235   }
8236
8237   if (TCR_4(__kmp_init_middle)) {
8238 #if KMP_AFFINITY_SUPPORTED
8239     __kmp_affinity_uninitialize();
8240 #endif /* KMP_AFFINITY_SUPPORTED */
8241     __kmp_cleanup_hierarchy();
8242     TCW_4(__kmp_init_middle, FALSE);
8243   }
8244
8245   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8246
8247   if (__kmp_init_serial) {
8248     __kmp_runtime_destroy();
8249     __kmp_init_serial = FALSE;
8250   }
8251
8252   __kmp_cleanup_threadprivate_caches();
8253
8254   for (f = 0; f < __kmp_threads_capacity; f++) {
8255     if (__kmp_root[f] != NULL) {
8256       __kmp_free(__kmp_root[f]);
8257       __kmp_root[f] = NULL;
8258     }
8259   }
8260   __kmp_free(__kmp_threads);
8261   // __kmp_threads and __kmp_root were allocated at once, as single block, so
8262   // there is no need in freeing __kmp_root.
8263   __kmp_threads = NULL;
8264   __kmp_root = NULL;
8265   __kmp_threads_capacity = 0;
8266
8267   // Free old __kmp_threads arrays if they exist.
8268   kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8269   while (ptr) {
8270     kmp_old_threads_list_t *next = ptr->next;
8271     __kmp_free(ptr->threads);
8272     __kmp_free(ptr);
8273     ptr = next;
8274   }
8275
8276 #if KMP_USE_DYNAMIC_LOCK
8277   __kmp_cleanup_indirect_user_locks();
8278 #else
8279   __kmp_cleanup_user_locks();
8280 #endif
8281 #if OMPD_SUPPORT
8282   if (ompd_state) {
8283     __kmp_free(ompd_env_block);
8284     ompd_env_block = NULL;
8285     ompd_env_block_size = 0;
8286   }
8287 #endif
8288
8289 #if KMP_AFFINITY_SUPPORTED
8290   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8291   __kmp_cpuinfo_file = NULL;
8292 #endif /* KMP_AFFINITY_SUPPORTED */
8293
8294 #if KMP_USE_ADAPTIVE_LOCKS
8295 #if KMP_DEBUG_ADAPTIVE_LOCKS
8296   __kmp_print_speculative_stats();
8297 #endif
8298 #endif
8299   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8300   __kmp_nested_nth.nth = NULL;
8301   __kmp_nested_nth.size = 0;
8302   __kmp_nested_nth.used = 0;
8303   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8304   __kmp_nested_proc_bind.bind_types = NULL;
8305   __kmp_nested_proc_bind.size = 0;
8306   __kmp_nested_proc_bind.used = 0;
8307   if (__kmp_affinity_format) {
8308     KMP_INTERNAL_FREE(__kmp_affinity_format);
8309     __kmp_affinity_format = NULL;
8310   }
8311
8312   __kmp_i18n_catclose();
8313
8314 #if KMP_USE_HIER_SCHED
8315   __kmp_hier_scheds.deallocate();
8316 #endif
8317
8318 #if KMP_STATS_ENABLED
8319   __kmp_stats_fini();
8320 #endif
8321
8322   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8323 }
8324
8325 /* ------------------------------------------------------------------------ */
8326
8327 int __kmp_ignore_mppbeg(void) {
8328   char *env;
8329
8330   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8331     if (__kmp_str_match_false(env))
8332       return FALSE;
8333   }
8334   // By default __kmpc_begin() is no-op.
8335   return TRUE;
8336 }
8337
8338 int __kmp_ignore_mppend(void) {
8339   char *env;
8340
8341   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8342     if (__kmp_str_match_false(env))
8343       return FALSE;
8344   }
8345   // By default __kmpc_end() is no-op.
8346   return TRUE;
8347 }
8348
8349 void __kmp_internal_begin(void) {
8350   int gtid;
8351   kmp_root_t *root;
8352
8353   /* this is a very important step as it will register new sibling threads
8354      and assign these new uber threads a new gtid */
8355   gtid = __kmp_entry_gtid();
8356   root = __kmp_threads[gtid]->th.th_root;
8357   KMP_ASSERT(KMP_UBER_GTID(gtid));
8358
8359   if (root->r.r_begin)
8360     return;
8361   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8362   if (root->r.r_begin) {
8363     __kmp_release_lock(&root->r.r_begin_lock, gtid);
8364     return;
8365   }
8366
8367   root->r.r_begin = TRUE;
8368
8369   __kmp_release_lock(&root->r.r_begin_lock, gtid);
8370 }
8371
8372 /* ------------------------------------------------------------------------ */
8373
8374 void __kmp_user_set_library(enum library_type arg) {
8375   int gtid;
8376   kmp_root_t *root;
8377   kmp_info_t *thread;
8378
8379   /* first, make sure we are initialized so we can get our gtid */
8380
8381   gtid = __kmp_entry_gtid();
8382   thread = __kmp_threads[gtid];
8383
8384   root = thread->th.th_root;
8385
8386   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8387                 library_serial));
8388   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8389                                   thread */
8390     KMP_WARNING(SetLibraryIncorrectCall);
8391     return;
8392   }
8393
8394   switch (arg) {
8395   case library_serial:
8396     thread->th.th_set_nproc = 0;
8397     set__nproc(thread, 1);
8398     break;
8399   case library_turnaround:
8400     thread->th.th_set_nproc = 0;
8401     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8402                                            : __kmp_dflt_team_nth_ub);
8403     break;
8404   case library_throughput:
8405     thread->th.th_set_nproc = 0;
8406     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8407                                            : __kmp_dflt_team_nth_ub);
8408     break;
8409   default:
8410     KMP_FATAL(UnknownLibraryType, arg);
8411   }
8412
8413   __kmp_aux_set_library(arg);
8414 }
8415
8416 void __kmp_aux_set_stacksize(size_t arg) {
8417   if (!__kmp_init_serial)
8418     __kmp_serial_initialize();
8419
8420 #if KMP_OS_DARWIN
8421   if (arg & (0x1000 - 1)) {
8422     arg &= ~(0x1000 - 1);
8423     if (arg + 0x1000) /* check for overflow if we round up */
8424       arg += 0x1000;
8425   }
8426 #endif
8427   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8428
8429   /* only change the default stacksize before the first parallel region */
8430   if (!TCR_4(__kmp_init_parallel)) {
8431     size_t value = arg; /* argument is in bytes */
8432
8433     if (value < __kmp_sys_min_stksize)
8434       value = __kmp_sys_min_stksize;
8435     else if (value > KMP_MAX_STKSIZE)
8436       value = KMP_MAX_STKSIZE;
8437
8438     __kmp_stksize = value;
8439
8440     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8441   }
8442
8443   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8444 }
8445
8446 /* set the behaviour of the runtime library */
8447 /* TODO this can cause some odd behaviour with sibling parallelism... */
8448 void __kmp_aux_set_library(enum library_type arg) {
8449   __kmp_library = arg;
8450
8451   switch (__kmp_library) {
8452   case library_serial: {
8453     KMP_INFORM(LibraryIsSerial);
8454   } break;
8455   case library_turnaround:
8456     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8457       __kmp_use_yield = 2; // only yield when oversubscribed
8458     break;
8459   case library_throughput:
8460     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8461       __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8462     break;
8463   default:
8464     KMP_FATAL(UnknownLibraryType, arg);
8465   }
8466 }
8467
8468 /* Getting team information common for all team API */
8469 // Returns NULL if not in teams construct
8470 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8471   kmp_info_t *thr = __kmp_entry_thread();
8472   teams_serialized = 0;
8473   if (thr->th.th_teams_microtask) {
8474     kmp_team_t *team = thr->th.th_team;
8475     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8476     int ii = team->t.t_level;
8477     teams_serialized = team->t.t_serialized;
8478     int level = tlevel + 1;
8479     KMP_DEBUG_ASSERT(ii >= tlevel);
8480     while (ii > level) {
8481       for (teams_serialized = team->t.t_serialized;
8482            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8483       }
8484       if (team->t.t_serialized && (!teams_serialized)) {
8485         team = team->t.t_parent;
8486         continue;
8487       }
8488       if (ii > level) {
8489         team = team->t.t_parent;
8490         ii--;
8491       }
8492     }
8493     return team;
8494   }
8495   return NULL;
8496 }
8497
8498 int __kmp_aux_get_team_num() {
8499   int serialized;
8500   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8501   if (team) {
8502     if (serialized > 1) {
8503       return 0; // teams region is serialized ( 1 team of 1 thread ).
8504     } else {
8505       return team->t.t_master_tid;
8506     }
8507   }
8508   return 0;
8509 }
8510
8511 int __kmp_aux_get_num_teams() {
8512   int serialized;
8513   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8514   if (team) {
8515     if (serialized > 1) {
8516       return 1;
8517     } else {
8518       return team->t.t_parent->t.t_nproc;
8519     }
8520   }
8521   return 1;
8522 }
8523
8524 /* ------------------------------------------------------------------------ */
8525
8526 /*
8527  * Affinity Format Parser
8528  *
8529  * Field is in form of: %[[[0].]size]type
8530  * % and type are required (%% means print a literal '%')
8531  * type is either single char or long name surrounded by {},
8532  * e.g., N or {num_threads}
8533  * 0 => leading zeros
8534  * . => right justified when size is specified
8535  * by default output is left justified
8536  * size is the *minimum* field length
8537  * All other characters are printed as is
8538  *
8539  * Available field types:
8540  * L {thread_level}      - omp_get_level()
8541  * n {thread_num}        - omp_get_thread_num()
8542  * h {host}              - name of host machine
8543  * P {process_id}        - process id (integer)
8544  * T {thread_identifier} - native thread identifier (integer)
8545  * N {num_threads}       - omp_get_num_threads()
8546  * A {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
8547  * a {thread_affinity}   - comma separated list of integers or integer ranges
8548  *                         (values of affinity mask)
8549  *
8550  * Implementation-specific field types can be added
8551  * If a type is unknown, print "undefined"
8552  */
8553
8554 // Structure holding the short name, long name, and corresponding data type
8555 // for snprintf.  A table of these will represent the entire valid keyword
8556 // field types.
8557 typedef struct kmp_affinity_format_field_t {
8558   char short_name; // from spec e.g., L -> thread level
8559   const char *long_name; // from spec thread_level -> thread level
8560   char field_format; // data type for snprintf (typically 'd' or 's'
8561   // for integer or string)
8562 } kmp_affinity_format_field_t;
8563
8564 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8565 #if KMP_AFFINITY_SUPPORTED
8566     {'A', "thread_affinity", 's'},
8567 #endif
8568     {'t', "team_num", 'd'},
8569     {'T', "num_teams", 'd'},
8570     {'L', "nesting_level", 'd'},
8571     {'n', "thread_num", 'd'},
8572     {'N', "num_threads", 'd'},
8573     {'a', "ancestor_tnum", 'd'},
8574     {'H', "host", 's'},
8575     {'P', "process_id", 'd'},
8576     {'i', "native_thread_id", 'd'}};
8577
8578 // Return the number of characters it takes to hold field
8579 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8580                                             const char **ptr,
8581                                             kmp_str_buf_t *field_buffer) {
8582   int rc, format_index, field_value;
8583   const char *width_left, *width_right;
8584   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8585   static const int FORMAT_SIZE = 20;
8586   char format[FORMAT_SIZE] = {0};
8587   char absolute_short_name = 0;
8588
8589   KMP_DEBUG_ASSERT(gtid >= 0);
8590   KMP_DEBUG_ASSERT(th);
8591   KMP_DEBUG_ASSERT(**ptr == '%');
8592   KMP_DEBUG_ASSERT(field_buffer);
8593
8594   __kmp_str_buf_clear(field_buffer);
8595
8596   // Skip the initial %
8597   (*ptr)++;
8598
8599   // Check for %% first
8600   if (**ptr == '%') {
8601     __kmp_str_buf_cat(field_buffer, "%", 1);
8602     (*ptr)++; // skip over the second %
8603     return 1;
8604   }
8605
8606   // Parse field modifiers if they are present
8607   pad_zeros = false;
8608   if (**ptr == '0') {
8609     pad_zeros = true;
8610     (*ptr)++; // skip over 0
8611   }
8612   right_justify = false;
8613   if (**ptr == '.') {
8614     right_justify = true;
8615     (*ptr)++; // skip over .
8616   }
8617   // Parse width of field: [width_left, width_right)
8618   width_left = width_right = NULL;
8619   if (**ptr >= '0' && **ptr <= '9') {
8620     width_left = *ptr;
8621     SKIP_DIGITS(*ptr);
8622     width_right = *ptr;
8623   }
8624
8625   // Create the format for KMP_SNPRINTF based on flags parsed above
8626   format_index = 0;
8627   format[format_index++] = '%';
8628   if (!right_justify)
8629     format[format_index++] = '-';
8630   if (pad_zeros)
8631     format[format_index++] = '0';
8632   if (width_left && width_right) {
8633     int i = 0;
8634     // Only allow 8 digit number widths.
8635     // This also prevents overflowing format variable
8636     while (i < 8 && width_left < width_right) {
8637       format[format_index++] = *width_left;
8638       width_left++;
8639       i++;
8640     }
8641   }
8642
8643   // Parse a name (long or short)
8644   // Canonicalize the name into absolute_short_name
8645   found_valid_name = false;
8646   parse_long_name = (**ptr == '{');
8647   if (parse_long_name)
8648     (*ptr)++; // skip initial left brace
8649   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8650                              sizeof(__kmp_affinity_format_table[0]);
8651        ++i) {
8652     char short_name = __kmp_affinity_format_table[i].short_name;
8653     const char *long_name = __kmp_affinity_format_table[i].long_name;
8654     char field_format = __kmp_affinity_format_table[i].field_format;
8655     if (parse_long_name) {
8656       size_t length = KMP_STRLEN(long_name);
8657       if (strncmp(*ptr, long_name, length) == 0) {
8658         found_valid_name = true;
8659         (*ptr) += length; // skip the long name
8660       }
8661     } else if (**ptr == short_name) {
8662       found_valid_name = true;
8663       (*ptr)++; // skip the short name
8664     }
8665     if (found_valid_name) {
8666       format[format_index++] = field_format;
8667       format[format_index++] = '\0';
8668       absolute_short_name = short_name;
8669       break;
8670     }
8671   }
8672   if (parse_long_name) {
8673     if (**ptr != '}') {
8674       absolute_short_name = 0;
8675     } else {
8676       (*ptr)++; // skip over the right brace
8677     }
8678   }
8679
8680   // Attempt to fill the buffer with the requested
8681   // value using snprintf within __kmp_str_buf_print()
8682   switch (absolute_short_name) {
8683   case 't':
8684     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8685     break;
8686   case 'T':
8687     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8688     break;
8689   case 'L':
8690     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8691     break;
8692   case 'n':
8693     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8694     break;
8695   case 'H': {
8696     static const int BUFFER_SIZE = 256;
8697     char buf[BUFFER_SIZE];
8698     __kmp_expand_host_name(buf, BUFFER_SIZE);
8699     rc = __kmp_str_buf_print(field_buffer, format, buf);
8700   } break;
8701   case 'P':
8702     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8703     break;
8704   case 'i':
8705     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8706     break;
8707   case 'N':
8708     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8709     break;
8710   case 'a':
8711     field_value =
8712         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8713     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8714     break;
8715 #if KMP_AFFINITY_SUPPORTED
8716   case 'A': {
8717     kmp_str_buf_t buf;
8718     __kmp_str_buf_init(&buf);
8719     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8720     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8721     __kmp_str_buf_free(&buf);
8722   } break;
8723 #endif
8724   default:
8725     // According to spec, If an implementation does not have info for field
8726     // type, then "undefined" is printed
8727     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8728     // Skip the field
8729     if (parse_long_name) {
8730       SKIP_TOKEN(*ptr);
8731       if (**ptr == '}')
8732         (*ptr)++;
8733     } else {
8734       (*ptr)++;
8735     }
8736   }
8737
8738   KMP_ASSERT(format_index <= FORMAT_SIZE);
8739   return rc;
8740 }
8741
8742 /*
8743  * Return number of characters needed to hold the affinity string
8744  * (not including null byte character)
8745  * The resultant string is printed to buffer, which the caller can then
8746  * handle afterwards
8747  */
8748 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8749                                   kmp_str_buf_t *buffer) {
8750   const char *parse_ptr;
8751   size_t retval;
8752   const kmp_info_t *th;
8753   kmp_str_buf_t field;
8754
8755   KMP_DEBUG_ASSERT(buffer);
8756   KMP_DEBUG_ASSERT(gtid >= 0);
8757
8758   __kmp_str_buf_init(&field);
8759   __kmp_str_buf_clear(buffer);
8760
8761   th = __kmp_threads[gtid];
8762   retval = 0;
8763
8764   // If format is NULL or zero-length string, then we use
8765   // affinity-format-var ICV
8766   parse_ptr = format;
8767   if (parse_ptr == NULL || *parse_ptr == '\0') {
8768     parse_ptr = __kmp_affinity_format;
8769   }
8770   KMP_DEBUG_ASSERT(parse_ptr);
8771
8772   while (*parse_ptr != '\0') {
8773     // Parse a field
8774     if (*parse_ptr == '%') {
8775       // Put field in the buffer
8776       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8777       __kmp_str_buf_catbuf(buffer, &field);
8778       retval += rc;
8779     } else {
8780       // Put literal character in buffer
8781       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8782       retval++;
8783       parse_ptr++;
8784     }
8785   }
8786   __kmp_str_buf_free(&field);
8787   return retval;
8788 }
8789
8790 // Displays the affinity string to stdout
8791 void __kmp_aux_display_affinity(int gtid, const char *format) {
8792   kmp_str_buf_t buf;
8793   __kmp_str_buf_init(&buf);
8794   __kmp_aux_capture_affinity(gtid, format, &buf);
8795   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8796   __kmp_str_buf_free(&buf);
8797 }
8798
8799 /* ------------------------------------------------------------------------ */
8800 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8801   int blocktime = arg; /* argument is in microseconds */
8802 #if KMP_USE_MONITOR
8803   int bt_intervals;
8804 #endif
8805   kmp_int8 bt_set;
8806
8807   __kmp_save_internal_controls(thread);
8808
8809   /* Normalize and set blocktime for the teams */
8810   if (blocktime < KMP_MIN_BLOCKTIME)
8811     blocktime = KMP_MIN_BLOCKTIME;
8812   else if (blocktime > KMP_MAX_BLOCKTIME)
8813     blocktime = KMP_MAX_BLOCKTIME;
8814
8815   set__blocktime_team(thread->th.th_team, tid, blocktime);
8816   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8817
8818 #if KMP_USE_MONITOR
8819   /* Calculate and set blocktime intervals for the teams */
8820   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8821
8822   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8823   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8824 #endif
8825
8826   /* Set whether blocktime has been set to "TRUE" */
8827   bt_set = TRUE;
8828
8829   set__bt_set_team(thread->th.th_team, tid, bt_set);
8830   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8831 #if KMP_USE_MONITOR
8832   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8833                 "bt_intervals=%d, monitor_updates=%d\n",
8834                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8835                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8836                 __kmp_monitor_wakeups));
8837 #else
8838   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8839                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8840                 thread->th.th_team->t.t_id, tid, blocktime));
8841 #endif
8842 }
8843
8844 void __kmp_aux_set_defaults(char const *str, size_t len) {
8845   if (!__kmp_init_serial) {
8846     __kmp_serial_initialize();
8847   }
8848   __kmp_env_initialize(str);
8849
8850   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8851     __kmp_env_print();
8852   }
8853 } // __kmp_aux_set_defaults
8854
8855 /* ------------------------------------------------------------------------ */
8856 /* internal fast reduction routines */
8857
8858 PACKED_REDUCTION_METHOD_T
8859 __kmp_determine_reduction_method(
8860     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8861     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8862     kmp_critical_name *lck) {
8863
8864   // Default reduction method: critical construct ( lck != NULL, like in current
8865   // PAROPT )
8866   // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8867   // can be selected by RTL
8868   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8869   // can be selected by RTL
8870   // Finally, it's up to OpenMP RTL to make a decision on which method to select
8871   // among generated by PAROPT.
8872
8873   PACKED_REDUCTION_METHOD_T retval;
8874
8875   int team_size;
8876
8877   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8878
8879 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8880   (loc &&                                                                      \
8881    ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8882 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8883
8884   retval = critical_reduce_block;
8885
8886   // another choice of getting a team size (with 1 dynamic deference) is slower
8887   team_size = __kmp_get_team_num_threads(global_tid);
8888   if (team_size == 1) {
8889
8890     retval = empty_reduce_block;
8891
8892   } else {
8893
8894     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8895
8896 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8897     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE
8898
8899 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8900     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8901
8902     int teamsize_cutoff = 4;
8903
8904 #if KMP_MIC_SUPPORTED
8905     if (__kmp_mic_type != non_mic) {
8906       teamsize_cutoff = 8;
8907     }
8908 #endif
8909     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8910     if (tree_available) {
8911       if (team_size <= teamsize_cutoff) {
8912         if (atomic_available) {
8913           retval = atomic_reduce_block;
8914         }
8915       } else {
8916         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8917       }
8918     } else if (atomic_available) {
8919       retval = atomic_reduce_block;
8920     }
8921 #else
8922 #error "Unknown or unsupported OS"
8923 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8924        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8925
8926 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8927
8928 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8929     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8930
8931     // basic tuning
8932
8933     if (atomic_available) {
8934       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8935         retval = atomic_reduce_block;
8936       }
8937     } // otherwise: use critical section
8938
8939 #elif KMP_OS_DARWIN
8940
8941     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8942     if (atomic_available && (num_vars <= 3)) {
8943       retval = atomic_reduce_block;
8944     } else if (tree_available) {
8945       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8946           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8947         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8948       }
8949     } // otherwise: use critical section
8950
8951 #else
8952 #error "Unknown or unsupported OS"
8953 #endif
8954
8955 #else
8956 #error "Unknown or unsupported architecture"
8957 #endif
8958   }
8959
8960   // KMP_FORCE_REDUCTION
8961
8962   // If the team is serialized (team_size == 1), ignore the forced reduction
8963   // method and stay with the unsynchronized method (empty_reduce_block)
8964   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8965       team_size != 1) {
8966
8967     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8968
8969     int atomic_available, tree_available;
8970
8971     switch ((forced_retval = __kmp_force_reduction_method)) {
8972     case critical_reduce_block:
8973       KMP_ASSERT(lck); // lck should be != 0
8974       break;
8975
8976     case atomic_reduce_block:
8977       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8978       if (!atomic_available) {
8979         KMP_WARNING(RedMethodNotSupported, "atomic");
8980         forced_retval = critical_reduce_block;
8981       }
8982       break;
8983
8984     case tree_reduce_block:
8985       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8986       if (!tree_available) {
8987         KMP_WARNING(RedMethodNotSupported, "tree");
8988         forced_retval = critical_reduce_block;
8989       } else {
8990 #if KMP_FAST_REDUCTION_BARRIER
8991         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8992 #endif
8993       }
8994       break;
8995
8996     default:
8997       KMP_ASSERT(0); // "unsupported method specified"
8998     }
8999
9000     retval = forced_retval;
9001   }
9002
9003   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9004
9005 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
9006 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9007
9008   return (retval);
9009 }
9010 // this function is for testing set/get/determine reduce method
9011 kmp_int32 __kmp_get_reduce_method(void) {
9012   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9013 }
9014
9015 // Soft pause sets up threads to ignore blocktime and just go to sleep.
9016 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
9017 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9018
9019 // Hard pause shuts down the runtime completely.  Resume happens naturally when
9020 // OpenMP is used subsequently.
9021 void __kmp_hard_pause() {
9022   __kmp_pause_status = kmp_hard_paused;
9023   __kmp_internal_end_thread(-1);
9024 }
9025
9026 // Soft resume sets __kmp_pause_status, and wakes up all threads.
9027 void __kmp_resume_if_soft_paused() {
9028   if (__kmp_pause_status == kmp_soft_paused) {
9029     __kmp_pause_status = kmp_not_paused;
9030
9031     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9032       kmp_info_t *thread = __kmp_threads[gtid];
9033       if (thread) { // Wake it if sleeping
9034         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9035                          thread);
9036         if (fl.is_sleeping())
9037           fl.resume(gtid);
9038         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9039           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9040         } else { // thread holds the lock and may sleep soon
9041           do { // until either the thread sleeps, or we can get the lock
9042             if (fl.is_sleeping()) {
9043               fl.resume(gtid);
9044               break;
9045             } else if (__kmp_try_suspend_mx(thread)) {
9046               __kmp_unlock_suspend_mx(thread);
9047               break;
9048             }
9049           } while (1);
9050         }
9051       }
9052     }
9053   }
9054 }
9055
9056 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9057 // TODO: add warning messages
9058 int __kmp_pause_resource(kmp_pause_status_t level) {
9059   if (level == kmp_not_paused) { // requesting resume
9060     if (__kmp_pause_status == kmp_not_paused) {
9061       // error message about runtime not being paused, so can't resume
9062       return 1;
9063     } else {
9064       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9065                        __kmp_pause_status == kmp_hard_paused);
9066       __kmp_pause_status = kmp_not_paused;
9067       return 0;
9068     }
9069   } else if (level == kmp_soft_paused) { // requesting soft pause
9070     if (__kmp_pause_status != kmp_not_paused) {
9071       // error message about already being paused
9072       return 1;
9073     } else {
9074       __kmp_soft_pause();
9075       return 0;
9076     }
9077   } else if (level == kmp_hard_paused) { // requesting hard pause
9078     if (__kmp_pause_status != kmp_not_paused) {
9079       // error message about already being paused
9080       return 1;
9081     } else {
9082       __kmp_hard_pause();
9083       return 0;
9084     }
9085   } else {
9086     // error message about invalid level
9087     return 1;
9088   }
9089 }
9090
9091 void __kmp_omp_display_env(int verbose) {
9092   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9093   if (__kmp_init_serial == 0)
9094     __kmp_do_serial_initialize();
9095   __kmp_display_env_impl(!verbose, verbose);
9096   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9097 }
9098
9099 // The team size is changing, so distributed barrier must be modified
9100 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9101                                int new_nthreads) {
9102   KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9103                    bp_dist_bar);
9104   kmp_info_t **other_threads = team->t.t_threads;
9105
9106   // We want all the workers to stop waiting on the barrier while we adjust the
9107   // size of the team.
9108   for (int f = 1; f < old_nthreads; ++f) {
9109     KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9110     // Ignore threads that are already inactive or not present in the team
9111     if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9112       // teams construct causes thread_limit to get passed in, and some of
9113       // those could be inactive; just ignore them
9114       continue;
9115     }
9116     // If thread is transitioning still to in_use state, wait for it
9117     if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9118       while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9119         KMP_CPU_PAUSE();
9120     }
9121     // The thread should be in_use now
9122     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9123     // Transition to unused state
9124     team->t.t_threads[f]->th.th_used_in_team.store(2);
9125     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9126   }
9127   // Release all the workers
9128   team->t.b->go_release();
9129
9130   KMP_MFENCE();
9131
9132   // Workers should see transition status 2 and move to 0; but may need to be
9133   // woken up first
9134   int count = old_nthreads - 1;
9135   while (count > 0) {
9136     count = old_nthreads - 1;
9137     for (int f = 1; f < old_nthreads; ++f) {
9138       if (other_threads[f]->th.th_used_in_team.load() != 0) {
9139         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9140           kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9141               void *, other_threads[f]->th.th_sleep_loc);
9142           __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9143         }
9144       } else {
9145         KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9146         count--;
9147       }
9148     }
9149   }
9150   // Now update the barrier size
9151   team->t.b->update_num_threads(new_nthreads);
9152   team->t.b->go_reset();
9153 }
9154
9155 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9156   // Add the threads back to the team
9157   KMP_DEBUG_ASSERT(team);
9158   // Threads were paused and pointed at th_used_in_team temporarily during a
9159   // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9160   // the thread that it should transition itself back into the team. Then, if
9161   // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9162   // to wake it up.
9163   for (int f = 1; f < new_nthreads; ++f) {
9164     KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9165     KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9166                                 3);
9167     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9168       __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9169                       (kmp_flag_32<false, false> *)NULL);
9170     }
9171   }
9172   // The threads should be transitioning to the team; when they are done, they
9173   // should have set th_used_in_team to 1. This loop forces master to wait until
9174   // all threads have moved into the team and are waiting in the barrier.
9175   int count = new_nthreads - 1;
9176   while (count > 0) {
9177     count = new_nthreads - 1;
9178     for (int f = 1; f < new_nthreads; ++f) {
9179       if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9180         count--;
9181       }
9182     }
9183   }
9184 }
9185
9186 // Globals and functions for hidden helper task
9187 kmp_info_t **__kmp_hidden_helper_threads;
9188 kmp_info_t *__kmp_hidden_helper_main_thread;
9189 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9190 #if KMP_OS_LINUX
9191 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9192 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9193 #else
9194 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9195 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9196 #endif
9197
9198 namespace {
9199 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9200
9201 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9202   // This is an explicit synchronization on all hidden helper threads in case
9203   // that when a regular thread pushes a hidden helper task to one hidden
9204   // helper thread, the thread has not been awaken once since they're released
9205   // by the main thread after creating the team.
9206   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9207   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9208          __kmp_hidden_helper_threads_num)
9209     ;
9210
9211   // If main thread, then wait for signal
9212   if (__kmpc_master(nullptr, *gtid)) {
9213     // First, unset the initial state and release the initial thread
9214     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9215     __kmp_hidden_helper_initz_release();
9216     __kmp_hidden_helper_main_thread_wait();
9217     // Now wake up all worker threads
9218     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9219       __kmp_hidden_helper_worker_thread_signal();
9220     }
9221   }
9222 }
9223 } // namespace
9224
9225 void __kmp_hidden_helper_threads_initz_routine() {
9226   // Create a new root for hidden helper team/threads
9227   const int gtid = __kmp_register_root(TRUE);
9228   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9229   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9230   __kmp_hidden_helper_main_thread->th.th_set_nproc =
9231       __kmp_hidden_helper_threads_num;
9232
9233   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9234
9235   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9236
9237   // Set the initialization flag to FALSE
9238   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9239
9240   __kmp_hidden_helper_threads_deinitz_release();
9241 }
9242
9243 /* Nesting Mode:
9244    Set via KMP_NESTING_MODE, which takes an integer.
9245    Note: we skip duplicate topology levels, and skip levels with only
9246       one entity.
9247    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9248    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9249       in the topology, and initializes the number of threads at each of those
9250       levels to the number of entities at each level, respectively, below the
9251       entity at the parent level.
9252    KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9253       but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9254       the user to turn nesting on explicitly. This is an even more experimental
9255       option to this experimental feature, and may change or go away in the
9256       future.
9257 */
9258
9259 // Allocate space to store nesting levels
9260 void __kmp_init_nesting_mode() {
9261   int levels = KMP_HW_LAST;
9262   __kmp_nesting_mode_nlevels = levels;
9263   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9264   for (int i = 0; i < levels; ++i)
9265     __kmp_nesting_nth_level[i] = 0;
9266   if (__kmp_nested_nth.size < levels) {
9267     __kmp_nested_nth.nth =
9268         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9269     __kmp_nested_nth.size = levels;
9270   }
9271 }
9272
9273 // Set # threads for top levels of nesting; must be called after topology set
9274 void __kmp_set_nesting_mode_threads() {
9275   kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9276
9277   if (__kmp_nesting_mode == 1)
9278     __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9279   else if (__kmp_nesting_mode > 1)
9280     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9281
9282   if (__kmp_topology) { // use topology info
9283     int loc, hw_level;
9284     for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9285                                 loc < __kmp_nesting_mode_nlevels;
9286          loc++, hw_level++) {
9287       __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9288       if (__kmp_nesting_nth_level[loc] == 1)
9289         loc--;
9290     }
9291     // Make sure all cores are used
9292     if (__kmp_nesting_mode > 1 && loc > 1) {
9293       int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9294       int num_cores = __kmp_topology->get_count(core_level);
9295       int upper_levels = 1;
9296       for (int level = 0; level < loc - 1; ++level)
9297         upper_levels *= __kmp_nesting_nth_level[level];
9298       if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9299         __kmp_nesting_nth_level[loc - 1] =
9300             num_cores / __kmp_nesting_nth_level[loc - 2];
9301     }
9302     __kmp_nesting_mode_nlevels = loc;
9303     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9304   } else { // no topology info available; provide a reasonable guesstimation
9305     if (__kmp_avail_proc >= 4) {
9306       __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9307       __kmp_nesting_nth_level[1] = 2;
9308       __kmp_nesting_mode_nlevels = 2;
9309     } else {
9310       __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9311       __kmp_nesting_mode_nlevels = 1;
9312     }
9313     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9314   }
9315   for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9316     __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9317   }
9318   set__nproc(thread, __kmp_nesting_nth_level[0]);
9319   if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9320     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9321   if (get__max_active_levels(thread) > 1) {
9322     // if max levels was set, set nesting mode levels to same
9323     __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9324   }
9325   if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9326     set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9327 }
9328
9329 // Empty symbols to export (see exports_so.txt) when feature is disabled
9330 extern "C" {
9331 #if !KMP_STATS_ENABLED
9332 void __kmp_reset_stats() {}
9333 #endif
9334 #if !USE_DEBUGGER
9335 int __kmp_omp_debug_struct_info = FALSE;
9336 int __kmp_debugging = FALSE;
9337 #endif
9338 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9339 void __kmp_itt_fini_ittlib() {}
9340 void __kmp_itt_init_ittlib() {}
9341 #endif
9342 }
9343
9344 // end of file