openmp/runtime/src/kmp_runtime.cpp

   1 /*
   2  * kmp_runtime.cpp -- KPTS runtime support library
   3  */
   4
   5 //===----------------------------------------------------------------------===//
   6 //
   7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   8 // See https://llvm.org/LICENSE.txt for license information.
   9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #include "kmp.h"
  14 #include "kmp_affinity.h"
  15 #include "kmp_atomic.h"
  16 #include "kmp_environment.h"
  17 #include "kmp_error.h"
  18 #include "kmp_i18n.h"
  19 #include "kmp_io.h"
  20 #include "kmp_itt.h"
  21 #include "kmp_settings.h"
  22 #include "kmp_stats.h"
  23 #include "kmp_str.h"
  24 #include "kmp_wait_release.h"
  25 #include "kmp_wrapper_getpid.h"
  26 #include "kmp_dispatch.h"
  27 #include "kmp_utils.h"
  28 #if KMP_USE_HIER_SCHED
  29 #include "kmp_dispatch_hier.h"
  30 #endif
  31
  32 #if OMPT_SUPPORT
  33 #include "ompt-specific.h"
  34 #endif
  35 #if OMPD_SUPPORT
  36 #include "ompd-specific.h"
  37 #endif
  38
  39 #if OMP_PROFILING_SUPPORT
  40 #include "llvm/Support/TimeProfiler.h"
  41 static char *ProfileTraceFile = nullptr;
  42 #endif
  43
  44 /* these are temporary issues to be dealt with */
  45 #define KMP_USE_PRCTL 0
  46
  47 #if KMP_OS_WINDOWS
  48 #include <process.h>
  49 #endif
  50
  51 #ifndef KMP_USE_SHM
  52 // Windows and WASI do not need these include files as they don't use shared
  53 // memory.
  54 #else
  55 #include <sys/mman.h>
  56 #include <sys/stat.h>
  57 #include <fcntl.h>
  58 #define SHM_SIZE 1024
  59 #endif
  60
  61 #if defined(KMP_GOMP_COMPAT)
  62 char const __kmp_version_alt_comp[] =
  63     KMP_VERSION_PREFIX "alternative compiler support: yes";
  64 #endif /* defined(KMP_GOMP_COMPAT) */
  65
  66 char const __kmp_version_omp_api[] =
  67     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
  68
  69 #ifdef KMP_DEBUG
  70 char const __kmp_version_lock[] =
  71     KMP_VERSION_PREFIX "lock type: run time selectable";
  72 #endif /* KMP_DEBUG */
  73
  74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
  75
  76 /* ------------------------------------------------------------------------ */
  77
  78 #if KMP_USE_MONITOR
  79 kmp_info_t __kmp_monitor;
  80 #endif
  81
  82 /* Forward declarations */
  83
  84 void __kmp_cleanup(void);
  85
  86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
  87                                   int gtid);
  88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
  89                                   kmp_internal_control_t *new_icvs,
  90                                   ident_t *loc);
  91 #if KMP_AFFINITY_SUPPORTED
  92 static void __kmp_partition_places(kmp_team_t *team,
  93                                    int update_master_only = 0);
  94 #endif
  95 static void __kmp_do_serial_initialize(void);
  96 void __kmp_fork_barrier(int gtid, int tid);
  97 void __kmp_join_barrier(int gtid);
  98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
  99                           kmp_internal_control_t *new_icvs, ident_t *loc);
 100
 101 #ifdef USE_LOAD_BALANCE
 102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
 103 #endif
 104
 105 static int __kmp_expand_threads(int nNeed);
 106 #if KMP_OS_WINDOWS
 107 static int __kmp_unregister_root_other_thread(int gtid);
 108 #endif
 109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
 110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
 111
 112 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
 113                                int new_nthreads);
 114 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
 115
 116 /* Calculate the identifier of the current thread */
 117 /* fast (and somewhat portable) way to get unique identifier of executing
 118    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
 119 int __kmp_get_global_thread_id() {
 120   int i;
 121   kmp_info_t **other_threads;
 122   size_t stack_data;
 123   char *stack_addr;
 124   size_t stack_size;
 125   char *stack_base;
 126
 127   KA_TRACE(
 128       1000,
 129       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
 130        __kmp_nth, __kmp_all_nth));
 131
 132   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
 133      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
 134      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
 135      __kmp_init_gtid for this to work. */
 136
 137   if (!TCR_4(__kmp_init_gtid))
 138     return KMP_GTID_DNE;
 139
 140 #ifdef KMP_TDATA_GTID
 141   if (TCR_4(__kmp_gtid_mode) >= 3) {
 142     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
 143     return __kmp_gtid;
 144   }
 145 #endif
 146   if (TCR_4(__kmp_gtid_mode) >= 2) {
 147     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
 148     return __kmp_gtid_get_specific();
 149   }
 150   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
 151
 152   stack_addr = (char *)&stack_data;
 153   other_threads = __kmp_threads;
 154
 155   /* ATT: The code below is a source of potential bugs due to unsynchronized
 156      access to __kmp_threads array. For example:
 157      1. Current thread loads other_threads[i] to thr and checks it, it is
 158         non-NULL.
 159      2. Current thread is suspended by OS.
 160      3. Another thread unregisters and finishes (debug versions of free()
 161         may fill memory with something like 0xEF).
 162      4. Current thread is resumed.
 163      5. Current thread reads junk from *thr.
 164      TODO: Fix it.  --ln  */
 165
 166   for (i = 0; i < __kmp_threads_capacity; i++) {
 167
 168     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
 169     if (!thr)
 170       continue;
 171
 172     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
 173     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
 174
 175     /* stack grows down -- search through all of the active threads */
 176
 177     if (stack_addr <= stack_base) {
 178       size_t stack_diff = stack_base - stack_addr;
 179
 180       if (stack_diff <= stack_size) {
 181         /* The only way we can be closer than the allocated */
 182         /* stack size is if we are running on this thread. */
 183         // __kmp_gtid_get_specific can return negative value because this
 184         // function can be called by thread destructor. However, before the
 185         // thread destructor is called, the value of the corresponding
 186         // thread-specific data will be reset to NULL.
 187         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
 188                          __kmp_gtid_get_specific() == i);
 189         return i;
 190       }
 191     }
 192   }
 193
 194   /* get specific to try and determine our gtid */
 195   KA_TRACE(1000,
 196            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
 197             "thread, using TLS\n"));
 198   i = __kmp_gtid_get_specific();
 199
 200   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
 201
 202   /* if we havn't been assigned a gtid, then return code */
 203   if (i < 0)
 204     return i;
 205
 206   // other_threads[i] can be nullptr at this point because the corresponding
 207   // thread could have already been destructed. It can happen when this function
 208   // is called in end library routine.
 209   if (!TCR_SYNC_PTR(other_threads[i]))
 210     return i;
 211
 212   /* dynamically updated stack window for uber threads to avoid get_specific
 213      call */
 214   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
 215     KMP_FATAL(StackOverflow, i);
 216   }
 217
 218   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
 219   if (stack_addr > stack_base) {
 220     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
 221     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
 222             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
 223                 stack_base);
 224   } else {
 225     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
 226             stack_base - stack_addr);
 227   }
 228
 229   /* Reprint stack bounds for ubermaster since they have been refined */
 230   if (__kmp_storage_map) {
 231     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
 232     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
 233     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
 234                                  other_threads[i]->th.th_info.ds.ds_stacksize,
 235                                  "th_%d stack (refinement)", i);
 236   }
 237   return i;
 238 }
 239
 240 int __kmp_get_global_thread_id_reg() {
 241   int gtid;
 242
 243   if (!__kmp_init_serial) {
 244     gtid = KMP_GTID_DNE;
 245   } else
 246 #ifdef KMP_TDATA_GTID
 247       if (TCR_4(__kmp_gtid_mode) >= 3) {
 248     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
 249     gtid = __kmp_gtid;
 250   } else
 251 #endif
 252       if (TCR_4(__kmp_gtid_mode) >= 2) {
 253     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
 254     gtid = __kmp_gtid_get_specific();
 255   } else {
 256     KA_TRACE(1000,
 257              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
 258     gtid = __kmp_get_global_thread_id();
 259   }
 260
 261   /* we must be a new uber master sibling thread */
 262   if (gtid == KMP_GTID_DNE) {
 263     KA_TRACE(10,
 264              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
 265               "Registering a new gtid.\n"));
 266     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
 267     if (!__kmp_init_serial) {
 268       __kmp_do_serial_initialize();
 269       gtid = __kmp_gtid_get_specific();
 270     } else {
 271       gtid = __kmp_register_root(FALSE);
 272     }
 273     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
 274     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
 275   }
 276
 277   KMP_DEBUG_ASSERT(gtid >= 0);
 278
 279   return gtid;
 280 }
 281
 282 /* caller must hold forkjoin_lock */
 283 void __kmp_check_stack_overlap(kmp_info_t *th) {
 284   int f;
 285   char *stack_beg = NULL;
 286   char *stack_end = NULL;
 287   int gtid;
 288
 289   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
 290   if (__kmp_storage_map) {
 291     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
 292     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
 293
 294     gtid = __kmp_gtid_from_thread(th);
 295
 296     if (gtid == KMP_GTID_MONITOR) {
 297       __kmp_print_storage_map_gtid(
 298           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
 299           "th_%s stack (%s)", "mon",
 300           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
 301     } else {
 302       __kmp_print_storage_map_gtid(
 303           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
 304           "th_%d stack (%s)", gtid,
 305           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
 306     }
 307   }
 308
 309   /* No point in checking ubermaster threads since they use refinement and
 310    * cannot overlap */
 311   gtid = __kmp_gtid_from_thread(th);
 312   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
 313     KA_TRACE(10,
 314              ("__kmp_check_stack_overlap: performing extensive checking\n"));
 315     if (stack_beg == NULL) {
 316       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
 317       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
 318     }
 319
 320     for (f = 0; f < __kmp_threads_capacity; f++) {
 321       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
 322
 323       if (f_th && f_th != th) {
 324         char *other_stack_end =
 325             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
 326         char *other_stack_beg =
 327             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
 328         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
 329             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
 330
 331           /* Print the other stack values before the abort */
 332           if (__kmp_storage_map)
 333             __kmp_print_storage_map_gtid(
 334                 -1, other_stack_beg, other_stack_end,
 335                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
 336                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
 337
 338           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
 339                       __kmp_msg_null);
 340         }
 341       }
 342     }
 343   }
 344   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
 345 }
 346
 347 /* ------------------------------------------------------------------------ */
 348
 349 void __kmp_infinite_loop(void) {
 350   static int done = FALSE;
 351
 352   while (!done) {
 353     KMP_YIELD(TRUE);
 354   }
 355 }
 356
 357 #define MAX_MESSAGE 512
 358
 359 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
 360                                   char const *format, ...) {
 361   char buffer[MAX_MESSAGE];
 362   va_list ap;
 363
 364   va_start(ap, format);
 365   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
 366                p2, (unsigned long)size, format);
 367   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
 368   __kmp_vprintf(kmp_err, buffer, ap);
 369 #if KMP_PRINT_DATA_PLACEMENT
 370   int node;
 371   if (gtid >= 0) {
 372     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
 373       if (__kmp_storage_map_verbose) {
 374         node = __kmp_get_host_node(p1);
 375         if (node < 0) /* doesn't work, so don't try this next time */
 376           __kmp_storage_map_verbose = FALSE;
 377         else {
 378           char *last;
 379           int lastNode;
 380           int localProc = __kmp_get_cpu_from_gtid(gtid);
 381
 382           const int page_size = KMP_GET_PAGE_SIZE();
 383
 384           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
 385           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
 386           if (localProc >= 0)
 387             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
 388                                  localProc >> 1);
 389           else
 390             __kmp_printf_no_lock("  GTID %d\n", gtid);
 391 #if KMP_USE_PRCTL
 392           /* The more elaborate format is disabled for now because of the prctl
 393            * hanging bug. */
 394           do {
 395             last = p1;
 396             lastNode = node;
 397             /* This loop collates adjacent pages with the same host node. */
 398             do {
 399               (char *)p1 += page_size;
 400             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
 401             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
 402                                  lastNode);
 403           } while (p1 <= p2);
 404 #else
 405           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
 406                                (char *)p1 + (page_size - 1),
 407                                __kmp_get_host_node(p1));
 408           if (p1 < p2) {
 409             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
 410                                  (char *)p2 + (page_size - 1),
 411                                  __kmp_get_host_node(p2));
 412           }
 413 #endif
 414         }
 415       }
 416     } else
 417       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
 418   }
 419 #endif /* KMP_PRINT_DATA_PLACEMENT */
 420   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
 421
 422   va_end(ap);
 423 }
 424
 425 void __kmp_warn(char const *format, ...) {
 426   char buffer[MAX_MESSAGE];
 427   va_list ap;
 428
 429   if (__kmp_generate_warnings == kmp_warnings_off) {
 430     return;
 431   }
 432
 433   va_start(ap, format);
 434
 435   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
 436   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
 437   __kmp_vprintf(kmp_err, buffer, ap);
 438   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
 439
 440   va_end(ap);
 441 }
 442
 443 void __kmp_abort_process() {
 444   // Later threads may stall here, but that's ok because abort() will kill them.
 445   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
 446
 447   if (__kmp_debug_buf) {
 448     __kmp_dump_debug_buffer();
 449   }
 450
 451 #if KMP_OS_WINDOWS
 452   // Let other threads know of abnormal termination and prevent deadlock
 453   // if abort happened during library initialization or shutdown
 454   __kmp_global.g.g_abort = SIGABRT;
 455
 456   /* On Windows* OS by default abort() causes pop-up error box, which stalls
 457      nightly testing. Unfortunately, we cannot reliably suppress pop-up error
 458      boxes. _set_abort_behavior() works well, but this function is not
 459      available in VS7 (this is not problem for DLL, but it is a problem for
 460      static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
 461      help, at least in some versions of MS C RTL.
 462
 463      It seems following sequence is the only way to simulate abort() and
 464      avoid pop-up error box. */
 465   raise(SIGABRT);
 466   _exit(3); // Just in case, if signal ignored, exit anyway.
 467 #else
 468   __kmp_unregister_library();
 469   abort();
 470 #endif
 471
 472   __kmp_infinite_loop();
 473   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
 474
 475 } // __kmp_abort_process
 476
 477 void __kmp_abort_thread(void) {
 478   // TODO: Eliminate g_abort global variable and this function.
 479   // In case of abort just call abort(), it will kill all the threads.
 480   __kmp_infinite_loop();
 481 } // __kmp_abort_thread
 482
 483 /* Print out the storage map for the major kmp_info_t thread data structures
 484    that are allocated together. */
 485
 486 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
 487   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
 488                                gtid);
 489
 490   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
 491                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
 492
 493   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
 494                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
 495
 496   __kmp_print_storage_map_gtid(
 497       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
 498       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
 499
 500   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
 501                                &thr->th.th_bar[bs_plain_barrier + 1],
 502                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
 503                                gtid);
 504
 505   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
 506                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
 507                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
 508                                gtid);
 509
 510 #if KMP_FAST_REDUCTION_BARRIER
 511   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
 512                                &thr->th.th_bar[bs_reduction_barrier + 1],
 513                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
 514                                gtid);
 515 #endif // KMP_FAST_REDUCTION_BARRIER
 516 }
 517
 518 /* Print out the storage map for the major kmp_team_t team data structures
 519    that are allocated together. */
 520
 521 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
 522                                          int team_id, int num_thr) {
 523   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
 524   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
 525                                header, team_id);
 526
 527   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
 528                                &team->t.t_bar[bs_last_barrier],
 529                                sizeof(kmp_balign_team_t) * bs_last_barrier,
 530                                "%s_%d.t_bar", header, team_id);
 531
 532   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
 533                                &team->t.t_bar[bs_plain_barrier + 1],
 534                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
 535                                header, team_id);
 536
 537   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
 538                                &team->t.t_bar[bs_forkjoin_barrier + 1],
 539                                sizeof(kmp_balign_team_t),
 540                                "%s_%d.t_bar[forkjoin]", header, team_id);
 541
 542 #if KMP_FAST_REDUCTION_BARRIER
 543   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
 544                                &team->t.t_bar[bs_reduction_barrier + 1],
 545                                sizeof(kmp_balign_team_t),
 546                                "%s_%d.t_bar[reduction]", header, team_id);
 547 #endif // KMP_FAST_REDUCTION_BARRIER
 548
 549   __kmp_print_storage_map_gtid(
 550       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
 551       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
 552
 553   __kmp_print_storage_map_gtid(
 554       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
 555       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
 556
 557   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
 558                                &team->t.t_disp_buffer[num_disp_buff],
 559                                sizeof(dispatch_shared_info_t) * num_disp_buff,
 560                                "%s_%d.t_disp_buffer", header, team_id);
 561 }
 562
 563 static void __kmp_init_allocator() {
 564   __kmp_init_memkind();
 565   __kmp_init_target_mem();
 566 }
 567 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
 568
 569 /* ------------------------------------------------------------------------ */
 570
 571 #if ENABLE_LIBOMPTARGET
 572 static void __kmp_init_omptarget() {
 573   __kmp_init_target_task();
 574 }
 575 #endif
 576
 577 /* ------------------------------------------------------------------------ */
 578
 579 #if KMP_DYNAMIC_LIB
 580 #if KMP_OS_WINDOWS
 581
 582 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
 583   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
 584
 585   switch (fdwReason) {
 586
 587   case DLL_PROCESS_ATTACH:
 588     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
 589
 590     return TRUE;
 591
 592   case DLL_PROCESS_DETACH:
 593     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
 594
 595     // According to Windows* documentation for DllMain entry point:
 596     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
 597     //   lpReserved == NULL when FreeLibrary() is called,
 598     //   lpReserved != NULL when the process is terminated.
 599     // When FreeLibrary() is called, worker threads remain alive. So the
 600     // runtime's state is consistent and executing proper shutdown is OK.
 601     // When the process is terminated, worker threads have exited or been
 602     // forcefully terminated by the OS and only the shutdown thread remains.
 603     // This can leave the runtime in an inconsistent state.
 604     // Hence, only attempt proper cleanup when FreeLibrary() is called.
 605     // Otherwise, rely on OS to reclaim resources.
 606     if (lpReserved == NULL)
 607       __kmp_internal_end_library(__kmp_gtid_get_specific());
 608
 609     return TRUE;
 610
 611   case DLL_THREAD_ATTACH:
 612     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
 613
 614     /* if we want to register new siblings all the time here call
 615      * __kmp_get_gtid(); */
 616     return TRUE;
 617
 618   case DLL_THREAD_DETACH:
 619     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
 620
 621     __kmp_internal_end_thread(__kmp_gtid_get_specific());
 622     return TRUE;
 623   }
 624
 625   return TRUE;
 626 }
 627
 628 #endif /* KMP_OS_WINDOWS */
 629 #endif /* KMP_DYNAMIC_LIB */
 630
 631 /* __kmp_parallel_deo -- Wait until it's our turn. */
 632 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
 633   int gtid = *gtid_ref;
 634 #ifdef BUILD_PARALLEL_ORDERED
 635   kmp_team_t *team = __kmp_team_from_gtid(gtid);
 636 #endif /* BUILD_PARALLEL_ORDERED */
 637
 638   if (__kmp_env_consistency_check) {
 639     if (__kmp_threads[gtid]->th.th_root->r.r_active)
 640 #if KMP_USE_DYNAMIC_LOCK
 641       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
 642 #else
 643       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
 644 #endif
 645   }
 646 #ifdef BUILD_PARALLEL_ORDERED
 647   if (!team->t.t_serialized) {
 648     KMP_MB();
 649     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
 650              NULL);
 651     KMP_MB();
 652   }
 653 #endif /* BUILD_PARALLEL_ORDERED */
 654 }
 655
 656 /* __kmp_parallel_dxo -- Signal the next task. */
 657 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
 658   int gtid = *gtid_ref;
 659 #ifdef BUILD_PARALLEL_ORDERED
 660   int tid = __kmp_tid_from_gtid(gtid);
 661   kmp_team_t *team = __kmp_team_from_gtid(gtid);
 662 #endif /* BUILD_PARALLEL_ORDERED */
 663
 664   if (__kmp_env_consistency_check) {
 665     if (__kmp_threads[gtid]->th.th_root->r.r_active)
 666       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
 667   }
 668 #ifdef BUILD_PARALLEL_ORDERED
 669   if (!team->t.t_serialized) {
 670     KMP_MB(); /* Flush all pending memory write invalidates.  */
 671
 672     /* use the tid of the next thread in this team */
 673     /* TODO replace with general release procedure */
 674     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
 675
 676     KMP_MB(); /* Flush all pending memory write invalidates.  */
 677   }
 678 #endif /* BUILD_PARALLEL_ORDERED */
 679 }
 680
 681 /* ------------------------------------------------------------------------ */
 682 /* The BARRIER for a SINGLE process section is always explicit   */
 683
 684 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
 685   int status;
 686   kmp_info_t *th;
 687   kmp_team_t *team;
 688
 689   if (!TCR_4(__kmp_init_parallel))
 690     __kmp_parallel_initialize();
 691   __kmp_resume_if_soft_paused();
 692
 693   th = __kmp_threads[gtid];
 694   team = th->th.th_team;
 695   status = 0;
 696
 697   th->th.th_ident = id_ref;
 698
 699   if (team->t.t_serialized) {
 700     status = 1;
 701   } else {
 702     kmp_int32 old_this = th->th.th_local.this_construct;
 703
 704     ++th->th.th_local.this_construct;
 705     /* try to set team count to thread count--success means thread got the
 706        single block */
 707     /* TODO: Should this be acquire or release? */
 708     if (team->t.t_construct == old_this) {
 709       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
 710                                               th->th.th_local.this_construct);
 711     }
 712 #if USE_ITT_BUILD
 713     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
 714         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
 715         team->t.t_active_level == 1) {
 716       // Only report metadata by primary thread of active team at level 1
 717       __kmp_itt_metadata_single(id_ref);
 718     }
 719 #endif /* USE_ITT_BUILD */
 720   }
 721
 722   if (__kmp_env_consistency_check) {
 723     if (status && push_ws) {
 724       __kmp_push_workshare(gtid, ct_psingle, id_ref);
 725     } else {
 726       __kmp_check_workshare(gtid, ct_psingle, id_ref);
 727     }
 728   }
 729 #if USE_ITT_BUILD
 730   if (status) {
 731     __kmp_itt_single_start(gtid);
 732   }
 733 #endif /* USE_ITT_BUILD */
 734   return status;
 735 }
 736
 737 void __kmp_exit_single(int gtid) {
 738 #if USE_ITT_BUILD
 739   __kmp_itt_single_end(gtid);
 740 #endif /* USE_ITT_BUILD */
 741   if (__kmp_env_consistency_check)
 742     __kmp_pop_workshare(gtid, ct_psingle, NULL);
 743 }
 744
 745 /* determine if we can go parallel or must use a serialized parallel region and
 746  * how many threads we can use
 747  * set_nproc is the number of threads requested for the team
 748  * returns 0 if we should serialize or only use one thread,
 749  * otherwise the number of threads to use
 750  * The forkjoin lock is held by the caller. */
 751 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
 752                                  int master_tid, int set_nthreads,
 753                                  int enter_teams) {
 754   int capacity;
 755   int new_nthreads;
 756   KMP_DEBUG_ASSERT(__kmp_init_serial);
 757   KMP_DEBUG_ASSERT(root && parent_team);
 758   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
 759
 760   // If dyn-var is set, dynamically adjust the number of desired threads,
 761   // according to the method specified by dynamic_mode.
 762   new_nthreads = set_nthreads;
 763   if (!get__dynamic_2(parent_team, master_tid)) {
 764     ;
 765   }
 766 #ifdef USE_LOAD_BALANCE
 767   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
 768     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
 769     if (new_nthreads == 1) {
 770       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
 771                     "reservation to 1 thread\n",
 772                     master_tid));
 773       return 1;
 774     }
 775     if (new_nthreads < set_nthreads) {
 776       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
 777                     "reservation to %d threads\n",
 778                     master_tid, new_nthreads));
 779     }
 780   }
 781 #endif /* USE_LOAD_BALANCE */
 782   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
 783     new_nthreads = __kmp_avail_proc - __kmp_nth +
 784                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
 785     if (new_nthreads <= 1) {
 786       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
 787                     "reservation to 1 thread\n",
 788                     master_tid));
 789       return 1;
 790     }
 791     if (new_nthreads < set_nthreads) {
 792       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
 793                     "reservation to %d threads\n",
 794                     master_tid, new_nthreads));
 795     } else {
 796       new_nthreads = set_nthreads;
 797     }
 798   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
 799     if (set_nthreads > 2) {
 800       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
 801       new_nthreads = (new_nthreads % set_nthreads) + 1;
 802       if (new_nthreads == 1) {
 803         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
 804                       "reservation to 1 thread\n",
 805                       master_tid));
 806         return 1;
 807       }
 808       if (new_nthreads < set_nthreads) {
 809         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
 810                       "reservation to %d threads\n",
 811                       master_tid, new_nthreads));
 812       }
 813     }
 814   } else {
 815     KMP_ASSERT(0);
 816   }
 817
 818   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
 819   if (__kmp_nth + new_nthreads -
 820           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
 821       __kmp_max_nth) {
 822     int tl_nthreads = __kmp_max_nth - __kmp_nth +
 823                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
 824     if (tl_nthreads <= 0) {
 825       tl_nthreads = 1;
 826     }
 827
 828     // If dyn-var is false, emit a 1-time warning.
 829     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
 830       __kmp_reserve_warn = 1;
 831       __kmp_msg(kmp_ms_warning,
 832                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
 833                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
 834     }
 835     if (tl_nthreads == 1) {
 836       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
 837                     "reduced reservation to 1 thread\n",
 838                     master_tid));
 839       return 1;
 840     }
 841     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
 842                   "reservation to %d threads\n",
 843                   master_tid, tl_nthreads));
 844     new_nthreads = tl_nthreads;
 845   }
 846
 847   // Respect OMP_THREAD_LIMIT
 848   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
 849   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
 850   if (cg_nthreads + new_nthreads -
 851           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
 852       max_cg_threads) {
 853     int tl_nthreads = max_cg_threads - cg_nthreads +
 854                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
 855     if (tl_nthreads <= 0) {
 856       tl_nthreads = 1;
 857     }
 858
 859     // If dyn-var is false, emit a 1-time warning.
 860     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
 861       __kmp_reserve_warn = 1;
 862       __kmp_msg(kmp_ms_warning,
 863                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
 864                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
 865     }
 866     if (tl_nthreads == 1) {
 867       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
 868                     "reduced reservation to 1 thread\n",
 869                     master_tid));
 870       return 1;
 871     }
 872     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
 873                   "reservation to %d threads\n",
 874                   master_tid, tl_nthreads));
 875     new_nthreads = tl_nthreads;
 876   }
 877
 878   // Check if the threads array is large enough, or needs expanding.
 879   // See comment in __kmp_register_root() about the adjustment if
 880   // __kmp_threads[0] == NULL.
 881   capacity = __kmp_threads_capacity;
 882   if (TCR_PTR(__kmp_threads[0]) == NULL) {
 883     --capacity;
 884   }
 885   // If it is not for initializing the hidden helper team, we need to take
 886   // __kmp_hidden_helper_threads_num out of the capacity because it is included
 887   // in __kmp_threads_capacity.
 888   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
 889     capacity -= __kmp_hidden_helper_threads_num;
 890   }
 891   if (__kmp_nth + new_nthreads -
 892           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
 893       capacity) {
 894     // Expand the threads array.
 895     int slotsRequired = __kmp_nth + new_nthreads -
 896                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
 897                         capacity;
 898     int slotsAdded = __kmp_expand_threads(slotsRequired);
 899     if (slotsAdded < slotsRequired) {
 900       // The threads array was not expanded enough.
 901       new_nthreads -= (slotsRequired - slotsAdded);
 902       KMP_ASSERT(new_nthreads >= 1);
 903
 904       // If dyn-var is false, emit a 1-time warning.
 905       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
 906         __kmp_reserve_warn = 1;
 907         if (__kmp_tp_cached) {
 908           __kmp_msg(kmp_ms_warning,
 909                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
 910                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
 911                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
 912         } else {
 913           __kmp_msg(kmp_ms_warning,
 914                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
 915                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
 916         }
 917       }
 918     }
 919   }
 920
 921 #ifdef KMP_DEBUG
 922   if (new_nthreads == 1) {
 923     KC_TRACE(10,
 924              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
 925               "dead roots and rechecking; requested %d threads\n",
 926               __kmp_get_gtid(), set_nthreads));
 927   } else {
 928     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
 929                   " %d threads\n",
 930                   __kmp_get_gtid(), new_nthreads, set_nthreads));
 931   }
 932 #endif // KMP_DEBUG
 933   return new_nthreads;
 934 }
 935
 936 /* Allocate threads from the thread pool and assign them to the new team. We are
 937    assured that there are enough threads available, because we checked on that
 938    earlier within critical section forkjoin */
 939 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
 940                                     kmp_info_t *master_th, int master_gtid,
 941                                     int fork_teams_workers) {
 942   int i;
 943   int use_hot_team;
 944
 945   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
 946   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
 947   KMP_MB();
 948
 949   /* first, let's setup the primary thread */
 950   master_th->th.th_info.ds.ds_tid = 0;
 951   master_th->th.th_team = team;
 952   master_th->th.th_team_nproc = team->t.t_nproc;
 953   master_th->th.th_team_master = master_th;
 954   master_th->th.th_team_serialized = FALSE;
 955   master_th->th.th_dispatch = &team->t.t_dispatch[0];
 956
 957 /* make sure we are not the optimized hot team */
 958 #if KMP_NESTED_HOT_TEAMS
 959   use_hot_team = 0;
 960   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
 961   if (hot_teams) { // hot teams array is not allocated if
 962     // KMP_HOT_TEAMS_MAX_LEVEL=0
 963     int level = team->t.t_active_level - 1; // index in array of hot teams
 964     if (master_th->th.th_teams_microtask) { // are we inside the teams?
 965       if (master_th->th.th_teams_size.nteams > 1) {
 966         ++level; // level was not increased in teams construct for
 967         // team_of_masters
 968       }
 969       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
 970           master_th->th.th_teams_level == team->t.t_level) {
 971         ++level; // level was not increased in teams construct for
 972         // team_of_workers before the parallel
 973       } // team->t.t_level will be increased inside parallel
 974     }
 975     if (level < __kmp_hot_teams_max_level) {
 976       if (hot_teams[level].hot_team) {
 977         // hot team has already been allocated for given level
 978         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
 979         use_hot_team = 1; // the team is ready to use
 980       } else {
 981         use_hot_team = 0; // AC: threads are not allocated yet
 982         hot_teams[level].hot_team = team; // remember new hot team
 983         hot_teams[level].hot_team_nth = team->t.t_nproc;
 984       }
 985     } else {
 986       use_hot_team = 0;
 987     }
 988   }
 989 #else
 990   use_hot_team = team == root->r.r_hot_team;
 991 #endif
 992   if (!use_hot_team) {
 993
 994     /* install the primary thread */
 995     team->t.t_threads[0] = master_th;
 996     __kmp_initialize_info(master_th, team, 0, master_gtid);
 997
 998     /* now, install the worker threads */
 999     for (i = 1; i < team->t.t_nproc; i++) {
1000
1001       /* fork or reallocate a new thread and install it in team */
1002       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1003       team->t.t_threads[i] = thr;
1004       KMP_DEBUG_ASSERT(thr);
1005       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1006       /* align team and thread arrived states */
1007       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1008                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1009                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1010                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1011                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1012                     team->t.t_bar[bs_plain_barrier].b_arrived));
1013       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1014       thr->th.th_teams_level = master_th->th.th_teams_level;
1015       thr->th.th_teams_size = master_th->th.th_teams_size;
1016       { // Initialize threads' barrier data.
1017         int b;
1018         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1019         for (b = 0; b < bs_last_barrier; ++b) {
1020           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1021           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1022 #if USE_DEBUGGER
1023           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1024 #endif
1025         }
1026       }
1027     }
1028
1029 #if KMP_AFFINITY_SUPPORTED
1030     // Do not partition the places list for teams construct workers who
1031     // haven't actually been forked to do real work yet. This partitioning
1032     // will take place in the parallel region nested within the teams construct.
1033     if (!fork_teams_workers) {
1034       __kmp_partition_places(team);
1035     }
1036 #endif
1037
1038     if (team->t.t_nproc > 1 &&
1039         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1040       team->t.b->update_num_threads(team->t.t_nproc);
1041       __kmp_add_threads_to_team(team, team->t.t_nproc);
1042     }
1043   }
1044
1045   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1046     for (i = 0; i < team->t.t_nproc; i++) {
1047       kmp_info_t *thr = team->t.t_threads[i];
1048       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1049           thr->th.th_prev_level != team->t.t_level) {
1050         team->t.t_display_affinity = 1;
1051         break;
1052       }
1053     }
1054   }
1055
1056   KMP_MB();
1057 }
1058
1059 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1060 // Propagate any changes to the floating point control registers out to the team
1061 // We try to avoid unnecessary writes to the relevant cache line in the team
1062 // structure, so we don't make changes unless they are needed.
1063 inline static void propagateFPControl(kmp_team_t *team) {
1064   if (__kmp_inherit_fp_control) {
1065     kmp_int16 x87_fpu_control_word;
1066     kmp_uint32 mxcsr;
1067
1068     // Get primary thread's values of FPU control flags (both X87 and vector)
1069     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1070     __kmp_store_mxcsr(&mxcsr);
1071     mxcsr &= KMP_X86_MXCSR_MASK;
1072
1073     // There is no point looking at t_fp_control_saved here.
1074     // If it is TRUE, we still have to update the values if they are different
1075     // from those we now have. If it is FALSE we didn't save anything yet, but
1076     // our objective is the same. We have to ensure that the values in the team
1077     // are the same as those we have.
1078     // So, this code achieves what we need whether or not t_fp_control_saved is
1079     // true. By checking whether the value needs updating we avoid unnecessary
1080     // writes that would put the cache-line into a written state, causing all
1081     // threads in the team to have to read it again.
1082     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1083     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1084     // Although we don't use this value, other code in the runtime wants to know
1085     // whether it should restore them. So we must ensure it is correct.
1086     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1087   } else {
1088     // Similarly here. Don't write to this cache-line in the team structure
1089     // unless we have to.
1090     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1091   }
1092 }
1093
1094 // Do the opposite, setting the hardware registers to the updated values from
1095 // the team.
1096 inline static void updateHWFPControl(kmp_team_t *team) {
1097   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1098     // Only reset the fp control regs if they have been changed in the team.
1099     // the parallel region that we are exiting.
1100     kmp_int16 x87_fpu_control_word;
1101     kmp_uint32 mxcsr;
1102     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1103     __kmp_store_mxcsr(&mxcsr);
1104     mxcsr &= KMP_X86_MXCSR_MASK;
1105
1106     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1107       __kmp_clear_x87_fpu_status_word();
1108       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1109     }
1110
1111     if (team->t.t_mxcsr != mxcsr) {
1112       __kmp_load_mxcsr(&team->t.t_mxcsr);
1113     }
1114   }
1115 }
1116 #else
1117 #define propagateFPControl(x) ((void)0)
1118 #define updateHWFPControl(x) ((void)0)
1119 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1120
1121 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1122                                      int realloc); // forward declaration
1123
1124 /* Run a parallel region that has been serialized, so runs only in a team of the
1125    single primary thread. */
1126 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1127   kmp_info_t *this_thr;
1128   kmp_team_t *serial_team;
1129
1130   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1131
1132   /* Skip all this code for autopar serialized loops since it results in
1133      unacceptable overhead */
1134   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1135     return;
1136
1137   if (!TCR_4(__kmp_init_parallel))
1138     __kmp_parallel_initialize();
1139   __kmp_resume_if_soft_paused();
1140
1141   this_thr = __kmp_threads[global_tid];
1142   serial_team = this_thr->th.th_serial_team;
1143
1144   /* utilize the serialized team held by this thread */
1145   KMP_DEBUG_ASSERT(serial_team);
1146   KMP_MB();
1147
1148   if (__kmp_tasking_mode != tskm_immediate_exec) {
1149     KMP_DEBUG_ASSERT(
1150         this_thr->th.th_task_team ==
1151         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1152     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1153                      NULL);
1154     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1155                   "team %p, new task_team = NULL\n",
1156                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1157     this_thr->th.th_task_team = NULL;
1158   }
1159
1160   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1161   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1162     proc_bind = proc_bind_false;
1163   } else if (proc_bind == proc_bind_default) {
1164     // No proc_bind clause was specified, so use the current value
1165     // of proc-bind-var for this parallel region.
1166     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1167   }
1168   // Reset for next parallel region
1169   this_thr->th.th_set_proc_bind = proc_bind_default;
1170
1171   // Reset num_threads for next parallel region
1172   this_thr->th.th_set_nproc = 0;
1173
1174 #if OMPT_SUPPORT
1175   ompt_data_t ompt_parallel_data = ompt_data_none;
1176   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1177   if (ompt_enabled.enabled &&
1178       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1179
1180     ompt_task_info_t *parent_task_info;
1181     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1182
1183     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1184     if (ompt_enabled.ompt_callback_parallel_begin) {
1185       int team_size = 1;
1186
1187       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1188           &(parent_task_info->task_data), &(parent_task_info->frame),
1189           &ompt_parallel_data, team_size,
1190           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1191     }
1192   }
1193 #endif // OMPT_SUPPORT
1194
1195   if (this_thr->th.th_team != serial_team) {
1196     // Nested level will be an index in the nested nthreads array
1197     int level = this_thr->th.th_team->t.t_level;
1198
1199     if (serial_team->t.t_serialized) {
1200       /* this serial team was already used
1201          TODO increase performance by making this locks more specific */
1202       kmp_team_t *new_team;
1203
1204       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1205
1206       new_team =
1207           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1208 #if OMPT_SUPPORT
1209                               ompt_parallel_data,
1210 #endif
1211                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1212                               0 USE_NESTED_HOT_ARG(NULL));
1213       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1214       KMP_ASSERT(new_team);
1215
1216       /* setup new serialized team and install it */
1217       new_team->t.t_threads[0] = this_thr;
1218       new_team->t.t_parent = this_thr->th.th_team;
1219       serial_team = new_team;
1220       this_thr->th.th_serial_team = serial_team;
1221
1222       KF_TRACE(
1223           10,
1224           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1225            global_tid, serial_team));
1226
1227       /* TODO the above breaks the requirement that if we run out of resources,
1228          then we can still guarantee that serialized teams are ok, since we may
1229          need to allocate a new one */
1230     } else {
1231       KF_TRACE(
1232           10,
1233           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1234            global_tid, serial_team));
1235     }
1236
1237     /* we have to initialize this serial team */
1238     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1239     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1240     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1241     serial_team->t.t_ident = loc;
1242     serial_team->t.t_serialized = 1;
1243     serial_team->t.t_nproc = 1;
1244     serial_team->t.t_parent = this_thr->th.th_team;
1245     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1246     this_thr->th.th_team = serial_team;
1247     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1248
1249     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1250                   this_thr->th.th_current_task));
1251     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1252     this_thr->th.th_current_task->td_flags.executing = 0;
1253
1254     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1255
1256     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1257        implicit task for each serialized task represented by
1258        team->t.t_serialized? */
1259     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1260               &this_thr->th.th_current_task->td_parent->td_icvs);
1261
1262     // Thread value exists in the nested nthreads array for the next nested
1263     // level
1264     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1265       this_thr->th.th_current_task->td_icvs.nproc =
1266           __kmp_nested_nth.nth[level + 1];
1267     }
1268
1269     if (__kmp_nested_proc_bind.used &&
1270         (level + 1 < __kmp_nested_proc_bind.used)) {
1271       this_thr->th.th_current_task->td_icvs.proc_bind =
1272           __kmp_nested_proc_bind.bind_types[level + 1];
1273     }
1274
1275 #if USE_DEBUGGER
1276     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1277 #endif
1278     this_thr->th.th_info.ds.ds_tid = 0;
1279
1280     /* set thread cache values */
1281     this_thr->th.th_team_nproc = 1;
1282     this_thr->th.th_team_master = this_thr;
1283     this_thr->th.th_team_serialized = 1;
1284
1285     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1286     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1287     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1288
1289     propagateFPControl(serial_team);
1290
1291     /* check if we need to allocate dispatch buffers stack */
1292     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1293     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1294       serial_team->t.t_dispatch->th_disp_buffer =
1295           (dispatch_private_info_t *)__kmp_allocate(
1296               sizeof(dispatch_private_info_t));
1297     }
1298     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1299
1300     KMP_MB();
1301
1302   } else {
1303     /* this serialized team is already being used,
1304      * that's fine, just add another nested level */
1305     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1306     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1307     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1308     ++serial_team->t.t_serialized;
1309     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1310
1311     // Nested level will be an index in the nested nthreads array
1312     int level = this_thr->th.th_team->t.t_level;
1313     // Thread value exists in the nested nthreads array for the next nested
1314     // level
1315     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1316       this_thr->th.th_current_task->td_icvs.nproc =
1317           __kmp_nested_nth.nth[level + 1];
1318     }
1319     serial_team->t.t_level++;
1320     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1321                   "of serial team %p to %d\n",
1322                   global_tid, serial_team, serial_team->t.t_level));
1323
1324     /* allocate/push dispatch buffers stack */
1325     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1326     {
1327       dispatch_private_info_t *disp_buffer =
1328           (dispatch_private_info_t *)__kmp_allocate(
1329               sizeof(dispatch_private_info_t));
1330       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1331       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1332     }
1333     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1334
1335     KMP_MB();
1336   }
1337   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1338
1339   // Perform the display affinity functionality for
1340   // serialized parallel regions
1341   if (__kmp_display_affinity) {
1342     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1343         this_thr->th.th_prev_num_threads != 1) {
1344       // NULL means use the affinity-format-var ICV
1345       __kmp_aux_display_affinity(global_tid, NULL);
1346       this_thr->th.th_prev_level = serial_team->t.t_level;
1347       this_thr->th.th_prev_num_threads = 1;
1348     }
1349   }
1350
1351   if (__kmp_env_consistency_check)
1352     __kmp_push_parallel(global_tid, NULL);
1353 #if OMPT_SUPPORT
1354   serial_team->t.ompt_team_info.master_return_address = codeptr;
1355   if (ompt_enabled.enabled &&
1356       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1357     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1358         OMPT_GET_FRAME_ADDRESS(0);
1359
1360     ompt_lw_taskteam_t lw_taskteam;
1361     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1362                             &ompt_parallel_data, codeptr);
1363
1364     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1365     // don't use lw_taskteam after linking. content was swaped
1366
1367     /* OMPT implicit task begin */
1368     if (ompt_enabled.ompt_callback_implicit_task) {
1369       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1370           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1371           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1372           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1373       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1374           __kmp_tid_from_gtid(global_tid);
1375     }
1376
1377     /* OMPT state */
1378     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1379     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1380         OMPT_GET_FRAME_ADDRESS(0);
1381   }
1382 #endif
1383 }
1384
1385 // Test if this fork is for a team closely nested in a teams construct
1386 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1387                                           microtask_t microtask, int level,
1388                                           int teams_level, kmp_va_list ap) {
1389   return (master_th->th.th_teams_microtask && ap &&
1390           microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1391 }
1392
1393 // Test if this fork is for the teams construct, i.e. to form the outer league
1394 // of teams
1395 static inline bool __kmp_is_entering_teams(int active_level, int level,
1396                                            int teams_level, kmp_va_list ap) {
1397   return ((ap == NULL && active_level == 0) ||
1398           (ap && teams_level > 0 && teams_level == level));
1399 }
1400
1401 // AC: This is start of parallel that is nested inside teams construct.
1402 // The team is actual (hot), all workers are ready at the fork barrier.
1403 // No lock needed to initialize the team a bit, then free workers.
1404 static inline int
1405 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1406                     kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1407                     enum fork_context_e call_context, microtask_t microtask,
1408                     launch_t invoker, int master_set_numthreads, int level,
1409 #if OMPT_SUPPORT
1410                     ompt_data_t ompt_parallel_data, void *return_address,
1411 #endif
1412                     kmp_va_list ap) {
1413   void **argv;
1414   int i;
1415
1416   parent_team->t.t_ident = loc;
1417   __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1418   parent_team->t.t_argc = argc;
1419   argv = (void **)parent_team->t.t_argv;
1420   for (i = argc - 1; i >= 0; --i) {
1421     *argv++ = va_arg(kmp_va_deref(ap), void *);
1422   }
1423   // Increment our nested depth levels, but not increase the serialization
1424   if (parent_team == master_th->th.th_serial_team) {
1425     // AC: we are in serialized parallel
1426     __kmpc_serialized_parallel(loc, gtid);
1427     KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1428
1429     if (call_context == fork_context_gnu) {
1430       // AC: need to decrement t_serialized for enquiry functions to work
1431       // correctly, will restore at join time
1432       parent_team->t.t_serialized--;
1433       return TRUE;
1434     }
1435
1436 #if OMPD_SUPPORT
1437     parent_team->t.t_pkfn = microtask;
1438 #endif
1439
1440 #if OMPT_SUPPORT
1441     void *dummy;
1442     void **exit_frame_p;
1443     ompt_data_t *implicit_task_data;
1444     ompt_lw_taskteam_t lw_taskteam;
1445
1446     if (ompt_enabled.enabled) {
1447       __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1448                               &ompt_parallel_data, return_address);
1449       exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1450
1451       __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1452       // Don't use lw_taskteam after linking. Content was swapped.
1453
1454       /* OMPT implicit task begin */
1455       implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1456       if (ompt_enabled.ompt_callback_implicit_task) {
1457         OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1458         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1459             ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1460             1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1461       }
1462
1463       /* OMPT state */
1464       master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1465     } else {
1466       exit_frame_p = &dummy;
1467     }
1468 #endif
1469
1470     // AC: need to decrement t_serialized for enquiry functions to work
1471     // correctly, will restore at join time
1472     parent_team->t.t_serialized--;
1473
1474     {
1475       KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1476       KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1477       __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1478 #if OMPT_SUPPORT
1479                              ,
1480                              exit_frame_p
1481 #endif
1482                              );
1483     }
1484
1485 #if OMPT_SUPPORT
1486     if (ompt_enabled.enabled) {
1487       *exit_frame_p = NULL;
1488       OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1489       if (ompt_enabled.ompt_callback_implicit_task) {
1490         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1491             ompt_scope_end, NULL, implicit_task_data, 1,
1492             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1493       }
1494       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1495       __ompt_lw_taskteam_unlink(master_th);
1496       if (ompt_enabled.ompt_callback_parallel_end) {
1497         ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1498             &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1499             OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1500       }
1501       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1502     }
1503 #endif
1504     return TRUE;
1505   }
1506
1507   parent_team->t.t_pkfn = microtask;
1508   parent_team->t.t_invoke = invoker;
1509   KMP_ATOMIC_INC(&root->r.r_in_parallel);
1510   parent_team->t.t_active_level++;
1511   parent_team->t.t_level++;
1512   parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1513
1514   // If the threads allocated to the team are less than the thread limit, update
1515   // the thread limit here. th_teams_size.nth is specific to this team nested
1516   // in a teams construct, the team is fully created, and we're about to do
1517   // the actual fork. Best to do this here so that the subsequent uses below
1518   // and in the join have the correct value.
1519   master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1520
1521 #if OMPT_SUPPORT
1522   if (ompt_enabled.enabled) {
1523     ompt_lw_taskteam_t lw_taskteam;
1524     __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1525                             return_address);
1526     __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1527   }
1528 #endif
1529
1530   /* Change number of threads in the team if requested */
1531   if (master_set_numthreads) { // The parallel has num_threads clause
1532     if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1533       // AC: only can reduce number of threads dynamically, can't increase
1534       kmp_info_t **other_threads = parent_team->t.t_threads;
1535       // NOTE: if using distributed barrier, we need to run this code block
1536       // even when the team size appears not to have changed from the max.
1537       int old_proc = master_th->th.th_teams_size.nth;
1538       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1539         __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1540         __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1541       }
1542       parent_team->t.t_nproc = master_set_numthreads;
1543       for (i = 0; i < master_set_numthreads; ++i) {
1544         other_threads[i]->th.th_team_nproc = master_set_numthreads;
1545       }
1546     }
1547     // Keep extra threads hot in the team for possible next parallels
1548     master_th->th.th_set_nproc = 0;
1549   }
1550
1551 #if USE_DEBUGGER
1552   if (__kmp_debugging) { // Let debugger override number of threads.
1553     int nth = __kmp_omp_num_threads(loc);
1554     if (nth > 0) { // 0 means debugger doesn't want to change num threads
1555       master_set_numthreads = nth;
1556     }
1557   }
1558 #endif
1559
1560   // Figure out the proc_bind policy for the nested parallel within teams
1561   kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1562   // proc_bind_default means don't update
1563   kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1564   if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1565     proc_bind = proc_bind_false;
1566   } else {
1567     // No proc_bind clause specified; use current proc-bind-var
1568     if (proc_bind == proc_bind_default) {
1569       proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1570     }
1571     /* else: The proc_bind policy was specified explicitly on parallel clause.
1572        This overrides proc-bind-var for this parallel region, but does not
1573        change proc-bind-var. */
1574     // Figure the value of proc-bind-var for the child threads.
1575     if ((level + 1 < __kmp_nested_proc_bind.used) &&
1576         (__kmp_nested_proc_bind.bind_types[level + 1] !=
1577          master_th->th.th_current_task->td_icvs.proc_bind)) {
1578       proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1579     }
1580   }
1581   KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1582   // Need to change the bind-var ICV to correct value for each implicit task
1583   if (proc_bind_icv != proc_bind_default &&
1584       master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1585     kmp_info_t **other_threads = parent_team->t.t_threads;
1586     for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1587       other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1588     }
1589   }
1590   // Reset for next parallel region
1591   master_th->th.th_set_proc_bind = proc_bind_default;
1592
1593 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1594   if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1595        KMP_ITT_DEBUG) &&
1596       __kmp_forkjoin_frames_mode == 3 &&
1597       parent_team->t.t_active_level == 1 // only report frames at level 1
1598       && master_th->th.th_teams_size.nteams == 1) {
1599     kmp_uint64 tmp_time = __itt_get_timestamp();
1600     master_th->th.th_frame_time = tmp_time;
1601     parent_team->t.t_region_time = tmp_time;
1602   }
1603   if (__itt_stack_caller_create_ptr) {
1604     KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1605     // create new stack stitching id before entering fork barrier
1606     parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1607   }
1608 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1609 #if KMP_AFFINITY_SUPPORTED
1610   __kmp_partition_places(parent_team);
1611 #endif
1612
1613   KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1614                 "master_th=%p, gtid=%d\n",
1615                 root, parent_team, master_th, gtid));
1616   __kmp_internal_fork(loc, gtid, parent_team);
1617   KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1618                 "master_th=%p, gtid=%d\n",
1619                 root, parent_team, master_th, gtid));
1620
1621   if (call_context == fork_context_gnu)
1622     return TRUE;
1623
1624   /* Invoke microtask for PRIMARY thread */
1625   KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1626                 parent_team->t.t_id, parent_team->t.t_pkfn));
1627
1628   if (!parent_team->t.t_invoke(gtid)) {
1629     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1630   }
1631   KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1632                 parent_team->t.t_id, parent_team->t.t_pkfn));
1633   KMP_MB(); /* Flush all pending memory write invalidates.  */
1634
1635   KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1636
1637   return TRUE;
1638 }
1639
1640 // Create a serialized parallel region
1641 static inline int
1642 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1643                        kmp_int32 argc, microtask_t microtask, launch_t invoker,
1644                        kmp_info_t *master_th, kmp_team_t *parent_team,
1645 #if OMPT_SUPPORT
1646                        ompt_data_t *ompt_parallel_data, void **return_address,
1647                        ompt_data_t **parent_task_data,
1648 #endif
1649                        kmp_va_list ap) {
1650   kmp_team_t *team;
1651   int i;
1652   void **argv;
1653
1654 /* josh todo: hypothetical question: what do we do for OS X*? */
1655 #if KMP_OS_LINUX &&                                                            \
1656     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1657   SimpleVLA<void *> args(argc);
1658 #else
1659   void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1660 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1661           KMP_ARCH_AARCH64) */
1662
1663   KA_TRACE(
1664       20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1665
1666   __kmpc_serialized_parallel(loc, gtid);
1667
1668 #if OMPD_SUPPORT
1669   master_th->th.th_serial_team->t.t_pkfn = microtask;
1670 #endif
1671
1672   if (call_context == fork_context_intel) {
1673     /* TODO this sucks, use the compiler itself to pass args! :) */
1674     master_th->th.th_serial_team->t.t_ident = loc;
1675     if (!ap) {
1676       // revert change made in __kmpc_serialized_parallel()
1677       master_th->th.th_serial_team->t.t_level--;
1678 // Get args from parent team for teams construct
1679
1680 #if OMPT_SUPPORT
1681       void *dummy;
1682       void **exit_frame_p;
1683       ompt_task_info_t *task_info;
1684       ompt_lw_taskteam_t lw_taskteam;
1685
1686       if (ompt_enabled.enabled) {
1687         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1688                                 ompt_parallel_data, *return_address);
1689
1690         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1691         // don't use lw_taskteam after linking. content was swaped
1692         task_info = OMPT_CUR_TASK_INFO(master_th);
1693         exit_frame_p = &(task_info->frame.exit_frame.ptr);
1694         if (ompt_enabled.ompt_callback_implicit_task) {
1695           OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1696           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1697               ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1698               &(task_info->task_data), 1,
1699               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1700         }
1701
1702         /* OMPT state */
1703         master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1704       } else {
1705         exit_frame_p = &dummy;
1706       }
1707 #endif
1708
1709       {
1710         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1711         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1712         __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1713 #if OMPT_SUPPORT
1714                                ,
1715                                exit_frame_p
1716 #endif
1717                                );
1718       }
1719
1720 #if OMPT_SUPPORT
1721       if (ompt_enabled.enabled) {
1722         *exit_frame_p = NULL;
1723         if (ompt_enabled.ompt_callback_implicit_task) {
1724           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1725               ompt_scope_end, NULL, &(task_info->task_data), 1,
1726               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1727         }
1728         *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1729         __ompt_lw_taskteam_unlink(master_th);
1730         if (ompt_enabled.ompt_callback_parallel_end) {
1731           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1732               ompt_parallel_data, *parent_task_data,
1733               OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1734         }
1735         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1736       }
1737 #endif
1738     } else if (microtask == (microtask_t)__kmp_teams_master) {
1739       KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1740       team = master_th->th.th_team;
1741       // team->t.t_pkfn = microtask;
1742       team->t.t_invoke = invoker;
1743       __kmp_alloc_argv_entries(argc, team, TRUE);
1744       team->t.t_argc = argc;
1745       argv = (void **)team->t.t_argv;
1746       if (ap) {
1747         for (i = argc - 1; i >= 0; --i)
1748           *argv++ = va_arg(kmp_va_deref(ap), void *);
1749       } else {
1750         for (i = 0; i < argc; ++i)
1751           // Get args from parent team for teams construct
1752           argv[i] = parent_team->t.t_argv[i];
1753       }
1754       // AC: revert change made in __kmpc_serialized_parallel()
1755       //     because initial code in teams should have level=0
1756       team->t.t_level--;
1757       // AC: call special invoker for outer "parallel" of teams construct
1758       invoker(gtid);
1759 #if OMPT_SUPPORT
1760       if (ompt_enabled.enabled) {
1761         ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1762         if (ompt_enabled.ompt_callback_implicit_task) {
1763           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1764               ompt_scope_end, NULL, &(task_info->task_data), 0,
1765               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1766         }
1767         if (ompt_enabled.ompt_callback_parallel_end) {
1768           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1769               ompt_parallel_data, *parent_task_data,
1770               OMPT_INVOKER(call_context) | ompt_parallel_league,
1771               *return_address);
1772         }
1773         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1774       }
1775 #endif
1776     } else {
1777       argv = args;
1778       for (i = argc - 1; i >= 0; --i)
1779         *argv++ = va_arg(kmp_va_deref(ap), void *);
1780       KMP_MB();
1781
1782 #if OMPT_SUPPORT
1783       void *dummy;
1784       void **exit_frame_p;
1785       ompt_task_info_t *task_info;
1786       ompt_lw_taskteam_t lw_taskteam;
1787       ompt_data_t *implicit_task_data;
1788
1789       if (ompt_enabled.enabled) {
1790         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1791                                 ompt_parallel_data, *return_address);
1792         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1793         // don't use lw_taskteam after linking. content was swaped
1794         task_info = OMPT_CUR_TASK_INFO(master_th);
1795         exit_frame_p = &(task_info->frame.exit_frame.ptr);
1796
1797         /* OMPT implicit task begin */
1798         implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1799         if (ompt_enabled.ompt_callback_implicit_task) {
1800           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1801               ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1802               implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1803               ompt_task_implicit);
1804           OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1805         }
1806
1807         /* OMPT state */
1808         master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1809       } else {
1810         exit_frame_p = &dummy;
1811       }
1812 #endif
1813
1814       {
1815         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1816         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1817         __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1818 #if OMPT_SUPPORT
1819                                ,
1820                                exit_frame_p
1821 #endif
1822                                );
1823       }
1824
1825 #if OMPT_SUPPORT
1826       if (ompt_enabled.enabled) {
1827         *exit_frame_p = NULL;
1828         if (ompt_enabled.ompt_callback_implicit_task) {
1829           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1830               ompt_scope_end, NULL, &(task_info->task_data), 1,
1831               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1832         }
1833
1834         *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1835         __ompt_lw_taskteam_unlink(master_th);
1836         if (ompt_enabled.ompt_callback_parallel_end) {
1837           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1838               ompt_parallel_data, *parent_task_data,
1839               OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1840         }
1841         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1842       }
1843 #endif
1844     }
1845   } else if (call_context == fork_context_gnu) {
1846 #if OMPT_SUPPORT
1847     if (ompt_enabled.enabled) {
1848       ompt_lw_taskteam_t lwt;
1849       __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1850                               *return_address);
1851
1852       lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1853       __ompt_lw_taskteam_link(&lwt, master_th, 1);
1854     }
1855 // don't use lw_taskteam after linking. content was swaped
1856 #endif
1857
1858     // we were called from GNU native code
1859     KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1860     return FALSE;
1861   } else {
1862     KMP_ASSERT2(call_context < fork_context_last,
1863                 "__kmp_serial_fork_call: unknown fork_context parameter");
1864   }
1865
1866   KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1867   KMP_MB();
1868   return FALSE;
1869 }
1870
1871 /* most of the work for a fork */
1872 /* return true if we really went parallel, false if serialized */
1873 int __kmp_fork_call(ident_t *loc, int gtid,
1874                     enum fork_context_e call_context, // Intel, GNU, ...
1875                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1876                     kmp_va_list ap) {
1877   void **argv;
1878   int i;
1879   int master_tid;
1880   int master_this_cons;
1881   kmp_team_t *team;
1882   kmp_team_t *parent_team;
1883   kmp_info_t *master_th;
1884   kmp_root_t *root;
1885   int nthreads;
1886   int master_active;
1887   int master_set_numthreads;
1888   int task_thread_limit = 0;
1889   int level;
1890   int active_level;
1891   int teams_level;
1892 #if KMP_NESTED_HOT_TEAMS
1893   kmp_hot_team_ptr_t **p_hot_teams;
1894 #endif
1895   { // KMP_TIME_BLOCK
1896     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1897     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1898
1899     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1900     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1901       /* Some systems prefer the stack for the root thread(s) to start with */
1902       /* some gap from the parent stack to prevent false sharing. */
1903       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1904       /* These 2 lines below are so this does not get optimized out */
1905       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1906         __kmp_stkpadding += (short)((kmp_int64)dummy);
1907     }
1908
1909     /* initialize if needed */
1910     KMP_DEBUG_ASSERT(
1911         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1912     if (!TCR_4(__kmp_init_parallel))
1913       __kmp_parallel_initialize();
1914     __kmp_resume_if_soft_paused();
1915
1916     /* setup current data */
1917     // AC: potentially unsafe, not in sync with library shutdown,
1918     // __kmp_threads can be freed
1919     master_th = __kmp_threads[gtid];
1920
1921     parent_team = master_th->th.th_team;
1922     master_tid = master_th->th.th_info.ds.ds_tid;
1923     master_this_cons = master_th->th.th_local.this_construct;
1924     root = master_th->th.th_root;
1925     master_active = root->r.r_active;
1926     master_set_numthreads = master_th->th.th_set_nproc;
1927     task_thread_limit =
1928         master_th->th.th_current_task->td_icvs.task_thread_limit;
1929
1930 #if OMPT_SUPPORT
1931     ompt_data_t ompt_parallel_data = ompt_data_none;
1932     ompt_data_t *parent_task_data;
1933     ompt_frame_t *ompt_frame;
1934     void *return_address = NULL;
1935
1936     if (ompt_enabled.enabled) {
1937       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1938                                     NULL, NULL);
1939       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1940     }
1941 #endif
1942
1943     // Assign affinity to root thread if it hasn't happened yet
1944     __kmp_assign_root_init_mask();
1945
1946     // Nested level will be an index in the nested nthreads array
1947     level = parent_team->t.t_level;
1948     // used to launch non-serial teams even if nested is not allowed
1949     active_level = parent_team->t.t_active_level;
1950     // needed to check nesting inside the teams
1951     teams_level = master_th->th.th_teams_level;
1952 #if KMP_NESTED_HOT_TEAMS
1953     p_hot_teams = &master_th->th.th_hot_teams;
1954     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1955       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1956           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1957       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1958       // it is either actual or not needed (when active_level > 0)
1959       (*p_hot_teams)[0].hot_team_nth = 1;
1960     }
1961 #endif
1962
1963 #if OMPT_SUPPORT
1964     if (ompt_enabled.enabled) {
1965       if (ompt_enabled.ompt_callback_parallel_begin) {
1966         int team_size = master_set_numthreads
1967                             ? master_set_numthreads
1968                             : get__nproc_2(parent_team, master_tid);
1969         int flags = OMPT_INVOKER(call_context) |
1970                     ((microtask == (microtask_t)__kmp_teams_master)
1971                          ? ompt_parallel_league
1972                          : ompt_parallel_team);
1973         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1974             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1975             return_address);
1976       }
1977       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1978     }
1979 #endif
1980
1981     master_th->th.th_ident = loc;
1982
1983     // Parallel closely nested in teams construct:
1984     if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1985       return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1986                                  call_context, microtask, invoker,
1987                                  master_set_numthreads, level,
1988 #if OMPT_SUPPORT
1989                                  ompt_parallel_data, return_address,
1990 #endif
1991                                  ap);
1992     } // End parallel closely nested in teams construct
1993
1994 #if KMP_DEBUG
1995     if (__kmp_tasking_mode != tskm_immediate_exec) {
1996       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1997                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1998     }
1999 #endif
2000
2001     // Need this to happen before we determine the number of threads, not while
2002     // we are allocating the team
2003     //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2004
2005     // Determine the number of threads
2006     int enter_teams =
2007         __kmp_is_entering_teams(active_level, level, teams_level, ap);
2008     if ((!enter_teams &&
2009          (parent_team->t.t_active_level >=
2010           master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2011         (__kmp_library == library_serial)) {
2012       KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2013       nthreads = 1;
2014     } else {
2015       nthreads = master_set_numthreads
2016                      ? master_set_numthreads
2017                      // TODO: get nproc directly from current task
2018                      : get__nproc_2(parent_team, master_tid);
2019       // Use the thread_limit set for the current target task if exists, else go
2020       // with the deduced nthreads
2021       nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2022                      ? task_thread_limit
2023                      : nthreads;
2024       // Check if we need to take forkjoin lock? (no need for serialized
2025       // parallel out of teams construct).
2026       if (nthreads > 1) {
2027         /* determine how many new threads we can use */
2028         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2029         /* AC: If we execute teams from parallel region (on host), then teams
2030            should be created but each can only have 1 thread if nesting is
2031            disabled. If teams called from serial region, then teams and their
2032            threads should be created regardless of the nesting setting. */
2033         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2034                                          nthreads, enter_teams);
2035         if (nthreads == 1) {
2036           // Free lock for single thread execution here; for multi-thread
2037           // execution it will be freed later after team of threads created
2038           // and initialized
2039           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2040         }
2041       }
2042     }
2043     KMP_DEBUG_ASSERT(nthreads > 0);
2044
2045     // If we temporarily changed the set number of threads then restore it now
2046     master_th->th.th_set_nproc = 0;
2047
2048     if (nthreads == 1) {
2049       return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2050                                     invoker, master_th, parent_team,
2051 #if OMPT_SUPPORT
2052                                     &ompt_parallel_data, &return_address,
2053                                     &parent_task_data,
2054 #endif
2055                                     ap);
2056     } // if (nthreads == 1)
2057
2058     // GEH: only modify the executing flag in the case when not serialized
2059     //      serialized case is handled in kmpc_serialized_parallel
2060     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2061                   "curtask=%p, curtask_max_aclevel=%d\n",
2062                   parent_team->t.t_active_level, master_th,
2063                   master_th->th.th_current_task,
2064                   master_th->th.th_current_task->td_icvs.max_active_levels));
2065     // TODO: GEH - cannot do this assertion because root thread not set up as
2066     // executing
2067     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2068     master_th->th.th_current_task->td_flags.executing = 0;
2069
2070     if (!master_th->th.th_teams_microtask || level > teams_level) {
2071       /* Increment our nested depth level */
2072       KMP_ATOMIC_INC(&root->r.r_in_parallel);
2073     }
2074
2075     // See if we need to make a copy of the ICVs.
2076     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2077     if ((level + 1 < __kmp_nested_nth.used) &&
2078         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2079       nthreads_icv = __kmp_nested_nth.nth[level + 1];
2080     } else {
2081       nthreads_icv = 0; // don't update
2082     }
2083
2084     // Figure out the proc_bind_policy for the new team.
2085     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2086     // proc_bind_default means don't update
2087     kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2088     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2089       proc_bind = proc_bind_false;
2090     } else {
2091       // No proc_bind clause specified; use current proc-bind-var for this
2092       // parallel region
2093       if (proc_bind == proc_bind_default) {
2094         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2095       }
2096       // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2097       if (master_th->th.th_teams_microtask &&
2098           microtask == (microtask_t)__kmp_teams_master) {
2099         proc_bind = __kmp_teams_proc_bind;
2100       }
2101       /* else: The proc_bind policy was specified explicitly on parallel clause.
2102          This overrides proc-bind-var for this parallel region, but does not
2103          change proc-bind-var. */
2104       // Figure the value of proc-bind-var for the child threads.
2105       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2106           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2107            master_th->th.th_current_task->td_icvs.proc_bind)) {
2108         // Do not modify the proc bind icv for the two teams construct forks
2109         // They just let the proc bind icv pass through
2110         if (!master_th->th.th_teams_microtask ||
2111             !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2112           proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2113       }
2114     }
2115
2116     // Reset for next parallel region
2117     master_th->th.th_set_proc_bind = proc_bind_default;
2118
2119     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2120       kmp_internal_control_t new_icvs;
2121       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2122       new_icvs.next = NULL;
2123       if (nthreads_icv > 0) {
2124         new_icvs.nproc = nthreads_icv;
2125       }
2126       if (proc_bind_icv != proc_bind_default) {
2127         new_icvs.proc_bind = proc_bind_icv;
2128       }
2129
2130       /* allocate a new parallel team */
2131       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2132       team = __kmp_allocate_team(root, nthreads, nthreads,
2133 #if OMPT_SUPPORT
2134                                  ompt_parallel_data,
2135 #endif
2136                                  proc_bind, &new_icvs,
2137                                  argc USE_NESTED_HOT_ARG(master_th));
2138       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2139         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2140     } else {
2141       /* allocate a new parallel team */
2142       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2143       team = __kmp_allocate_team(root, nthreads, nthreads,
2144 #if OMPT_SUPPORT
2145                                  ompt_parallel_data,
2146 #endif
2147                                  proc_bind,
2148                                  &master_th->th.th_current_task->td_icvs,
2149                                  argc USE_NESTED_HOT_ARG(master_th));
2150       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2151         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2152                   &master_th->th.th_current_task->td_icvs);
2153     }
2154     KF_TRACE(
2155         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2156
2157     /* setup the new team */
2158     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2159     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2160     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2161     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2162     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2163 #if OMPT_SUPPORT
2164     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2165                           return_address);
2166 #endif
2167     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2168     // TODO: parent_team->t.t_level == INT_MAX ???
2169     if (!master_th->th.th_teams_microtask || level > teams_level) {
2170       int new_level = parent_team->t.t_level + 1;
2171       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2172       new_level = parent_team->t.t_active_level + 1;
2173       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2174     } else {
2175       // AC: Do not increase parallel level at start of the teams construct
2176       int new_level = parent_team->t.t_level;
2177       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2178       new_level = parent_team->t.t_active_level;
2179       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2180     }
2181     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2182     // set primary thread's schedule as new run-time schedule
2183     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2184
2185     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2186     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2187
2188     // Update the floating point rounding in the team if required.
2189     propagateFPControl(team);
2190 #if OMPD_SUPPORT
2191     if (ompd_state & OMPD_ENABLE_BP)
2192       ompd_bp_parallel_begin();
2193 #endif
2194
2195     if (__kmp_tasking_mode != tskm_immediate_exec) {
2196       // Set primary thread's task team to team's task team. Unless this is hot
2197       // team, it should be NULL.
2198       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2199                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2200       KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2201                     "%p, new task_team %p / team %p\n",
2202                     __kmp_gtid_from_thread(master_th),
2203                     master_th->th.th_task_team, parent_team,
2204                     team->t.t_task_team[master_th->th.th_task_state], team));
2205
2206       if (active_level || master_th->th.th_task_team) {
2207         // Take a memo of primary thread's task_state
2208         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2209         if (master_th->th.th_task_state_top >=
2210             master_th->th.th_task_state_stack_sz) { // increase size
2211           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2212           kmp_uint8 *old_stack, *new_stack;
2213           kmp_uint32 i;
2214           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2215           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2216             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2217           }
2218           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2219                ++i) { // zero-init rest of stack
2220             new_stack[i] = 0;
2221           }
2222           old_stack = master_th->th.th_task_state_memo_stack;
2223           master_th->th.th_task_state_memo_stack = new_stack;
2224           master_th->th.th_task_state_stack_sz = new_size;
2225           __kmp_free(old_stack);
2226         }
2227         // Store primary thread's task_state on stack
2228         master_th->th
2229             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2230             master_th->th.th_task_state;
2231         master_th->th.th_task_state_top++;
2232 #if KMP_NESTED_HOT_TEAMS
2233         if (master_th->th.th_hot_teams &&
2234             active_level < __kmp_hot_teams_max_level &&
2235             team == master_th->th.th_hot_teams[active_level].hot_team) {
2236           // Restore primary thread's nested state if nested hot team
2237           master_th->th.th_task_state =
2238               master_th->th
2239                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2240         } else {
2241 #endif
2242           master_th->th.th_task_state = 0;
2243 #if KMP_NESTED_HOT_TEAMS
2244         }
2245 #endif
2246       }
2247 #if !KMP_NESTED_HOT_TEAMS
2248       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2249                        (team == root->r.r_hot_team));
2250 #endif
2251     }
2252
2253     KA_TRACE(
2254         20,
2255         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2256          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2257          team->t.t_nproc));
2258     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2259                      (team->t.t_master_tid == 0 &&
2260                       (team->t.t_parent == root->r.r_root_team ||
2261                        team->t.t_parent->t.t_serialized)));
2262     KMP_MB();
2263
2264     /* now, setup the arguments */
2265     argv = (void **)team->t.t_argv;
2266     if (ap) {
2267       for (i = argc - 1; i >= 0; --i) {
2268         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2269         KMP_CHECK_UPDATE(*argv, new_argv);
2270         argv++;
2271       }
2272     } else {
2273       for (i = 0; i < argc; ++i) {
2274         // Get args from parent team for teams construct
2275         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2276       }
2277     }
2278
2279     /* now actually fork the threads */
2280     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2281     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2282       root->r.r_active = TRUE;
2283
2284     __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2285     __kmp_setup_icv_copy(team, nthreads,
2286                          &master_th->th.th_current_task->td_icvs, loc);
2287
2288 #if OMPT_SUPPORT
2289     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2290 #endif
2291
2292     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2293
2294 #if USE_ITT_BUILD
2295     if (team->t.t_active_level == 1 // only report frames at level 1
2296         && !master_th->th.th_teams_microtask) { // not in teams construct
2297 #if USE_ITT_NOTIFY
2298       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2299           (__kmp_forkjoin_frames_mode == 3 ||
2300            __kmp_forkjoin_frames_mode == 1)) {
2301         kmp_uint64 tmp_time = 0;
2302         if (__itt_get_timestamp_ptr)
2303           tmp_time = __itt_get_timestamp();
2304         // Internal fork - report frame begin
2305         master_th->th.th_frame_time = tmp_time;
2306         if (__kmp_forkjoin_frames_mode == 3)
2307           team->t.t_region_time = tmp_time;
2308       } else
2309 // only one notification scheme (either "submit" or "forking/joined", not both)
2310 #endif /* USE_ITT_NOTIFY */
2311         if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2312             __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2313           // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2314           __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2315         }
2316     }
2317 #endif /* USE_ITT_BUILD */
2318
2319     /* now go on and do the work */
2320     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2321     KMP_MB();
2322     KF_TRACE(10,
2323              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2324               root, team, master_th, gtid));
2325
2326 #if USE_ITT_BUILD
2327     if (__itt_stack_caller_create_ptr) {
2328       // create new stack stitching id before entering fork barrier
2329       if (!enter_teams) {
2330         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2331         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2332       } else if (parent_team->t.t_serialized) {
2333         // keep stack stitching id in the serialized parent_team;
2334         // current team will be used for parallel inside the teams;
2335         // if parent_team is active, then it already keeps stack stitching id
2336         // for the league of teams
2337         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2338         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2339       }
2340     }
2341 #endif /* USE_ITT_BUILD */
2342
2343     // AC: skip __kmp_internal_fork at teams construct, let only primary
2344     // threads execute
2345     if (ap) {
2346       __kmp_internal_fork(loc, gtid, team);
2347       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2348                     "master_th=%p, gtid=%d\n",
2349                     root, team, master_th, gtid));
2350     }
2351
2352     if (call_context == fork_context_gnu) {
2353       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2354       return TRUE;
2355     }
2356
2357     /* Invoke microtask for PRIMARY thread */
2358     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2359                   team->t.t_id, team->t.t_pkfn));
2360   } // END of timer KMP_fork_call block
2361
2362 #if KMP_STATS_ENABLED
2363   // If beginning a teams construct, then change thread state
2364   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2365   if (!ap) {
2366     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2367   }
2368 #endif
2369
2370   if (!team->t.t_invoke(gtid)) {
2371     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2372   }
2373
2374 #if KMP_STATS_ENABLED
2375   // If was beginning of a teams construct, then reset thread state
2376   if (!ap) {
2377     KMP_SET_THREAD_STATE(previous_state);
2378   }
2379 #endif
2380
2381   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2382                 team->t.t_id, team->t.t_pkfn));
2383   KMP_MB(); /* Flush all pending memory write invalidates.  */
2384
2385   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2386 #if OMPT_SUPPORT
2387   if (ompt_enabled.enabled) {
2388     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2389   }
2390 #endif
2391
2392   return TRUE;
2393 }
2394
2395 #if OMPT_SUPPORT
2396 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2397                                             kmp_team_t *team) {
2398   // restore state outside the region
2399   thread->th.ompt_thread_info.state =
2400       ((team->t.t_serialized) ? ompt_state_work_serial
2401                               : ompt_state_work_parallel);
2402 }
2403
2404 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2405                                    kmp_team_t *team, ompt_data_t *parallel_data,
2406                                    int flags, void *codeptr) {
2407   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2408   if (ompt_enabled.ompt_callback_parallel_end) {
2409     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2410         parallel_data, &(task_info->task_data), flags, codeptr);
2411   }
2412
2413   task_info->frame.enter_frame = ompt_data_none;
2414   __kmp_join_restore_state(thread, team);
2415 }
2416 #endif
2417
2418 void __kmp_join_call(ident_t *loc, int gtid
2419 #if OMPT_SUPPORT
2420                      ,
2421                      enum fork_context_e fork_context
2422 #endif
2423                      ,
2424                      int exit_teams) {
2425   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2426   kmp_team_t *team;
2427   kmp_team_t *parent_team;
2428   kmp_info_t *master_th;
2429   kmp_root_t *root;
2430   int master_active;
2431
2432   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2433
2434   /* setup current data */
2435   master_th = __kmp_threads[gtid];
2436   root = master_th->th.th_root;
2437   team = master_th->th.th_team;
2438   parent_team = team->t.t_parent;
2439
2440   master_th->th.th_ident = loc;
2441
2442 #if OMPT_SUPPORT
2443   void *team_microtask = (void *)team->t.t_pkfn;
2444   // For GOMP interface with serialized parallel, need the
2445   // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2446   // and end-parallel events.
2447   if (ompt_enabled.enabled &&
2448       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2449     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2450   }
2451 #endif
2452
2453 #if KMP_DEBUG
2454   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2455     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2456                   "th_task_team = %p\n",
2457                   __kmp_gtid_from_thread(master_th), team,
2458                   team->t.t_task_team[master_th->th.th_task_state],
2459                   master_th->th.th_task_team));
2460     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2461                      team->t.t_task_team[master_th->th.th_task_state]);
2462   }
2463 #endif
2464
2465   if (team->t.t_serialized) {
2466     if (master_th->th.th_teams_microtask) {
2467       // We are in teams construct
2468       int level = team->t.t_level;
2469       int tlevel = master_th->th.th_teams_level;
2470       if (level == tlevel) {
2471         // AC: we haven't incremented it earlier at start of teams construct,
2472         //     so do it here - at the end of teams construct
2473         team->t.t_level++;
2474       } else if (level == tlevel + 1) {
2475         // AC: we are exiting parallel inside teams, need to increment
2476         // serialization in order to restore it in the next call to
2477         // __kmpc_end_serialized_parallel
2478         team->t.t_serialized++;
2479       }
2480     }
2481     __kmpc_end_serialized_parallel(loc, gtid);
2482
2483 #if OMPT_SUPPORT
2484     if (ompt_enabled.enabled) {
2485       if (fork_context == fork_context_gnu) {
2486         __ompt_lw_taskteam_unlink(master_th);
2487       }
2488       __kmp_join_restore_state(master_th, parent_team);
2489     }
2490 #endif
2491
2492     return;
2493   }
2494
2495   master_active = team->t.t_master_active;
2496
2497   if (!exit_teams) {
2498     // AC: No barrier for internal teams at exit from teams construct.
2499     //     But there is barrier for external team (league).
2500     __kmp_internal_join(loc, gtid, team);
2501 #if USE_ITT_BUILD
2502     if (__itt_stack_caller_create_ptr) {
2503       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2504       // destroy the stack stitching id after join barrier
2505       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2506       team->t.t_stack_id = NULL;
2507     }
2508 #endif
2509   } else {
2510     master_th->th.th_task_state =
2511         0; // AC: no tasking in teams (out of any parallel)
2512 #if USE_ITT_BUILD
2513     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2514       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2515       // destroy the stack stitching id on exit from the teams construct
2516       // if parent_team is active, then the id will be destroyed later on
2517       // by master of the league of teams
2518       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2519       parent_team->t.t_stack_id = NULL;
2520     }
2521 #endif
2522   }
2523
2524   KMP_MB();
2525
2526 #if OMPT_SUPPORT
2527   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2528   void *codeptr = team->t.ompt_team_info.master_return_address;
2529 #endif
2530
2531 #if USE_ITT_BUILD
2532   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2533   if (team->t.t_active_level == 1 &&
2534       (!master_th->th.th_teams_microtask || /* not in teams construct */
2535        master_th->th.th_teams_size.nteams == 1)) {
2536     master_th->th.th_ident = loc;
2537     // only one notification scheme (either "submit" or "forking/joined", not
2538     // both)
2539     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2540         __kmp_forkjoin_frames_mode == 3)
2541       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2542                              master_th->th.th_frame_time, 0, loc,
2543                              master_th->th.th_team_nproc, 1);
2544     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2545              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2546       __kmp_itt_region_joined(gtid);
2547   } // active_level == 1
2548 #endif /* USE_ITT_BUILD */
2549
2550 #if KMP_AFFINITY_SUPPORTED
2551   if (!exit_teams) {
2552     // Restore master thread's partition.
2553     master_th->th.th_first_place = team->t.t_first_place;
2554     master_th->th.th_last_place = team->t.t_last_place;
2555   }
2556 #endif // KMP_AFFINITY_SUPPORTED
2557
2558   if (master_th->th.th_teams_microtask && !exit_teams &&
2559       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2560       team->t.t_level == master_th->th.th_teams_level + 1) {
2561 // AC: We need to leave the team structure intact at the end of parallel
2562 // inside the teams construct, so that at the next parallel same (hot) team
2563 // works, only adjust nesting levels
2564 #if OMPT_SUPPORT
2565     ompt_data_t ompt_parallel_data = ompt_data_none;
2566     if (ompt_enabled.enabled) {
2567       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2568       if (ompt_enabled.ompt_callback_implicit_task) {
2569         int ompt_team_size = team->t.t_nproc;
2570         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2571             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2572             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2573       }
2574       task_info->frame.exit_frame = ompt_data_none;
2575       task_info->task_data = ompt_data_none;
2576       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2577       __ompt_lw_taskteam_unlink(master_th);
2578     }
2579 #endif
2580     /* Decrement our nested depth level */
2581     team->t.t_level--;
2582     team->t.t_active_level--;
2583     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2584
2585     // Restore number of threads in the team if needed. This code relies on
2586     // the proper adjustment of th_teams_size.nth after the fork in
2587     // __kmp_teams_master on each teams primary thread in the case that
2588     // __kmp_reserve_threads reduced it.
2589     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2590       int old_num = master_th->th.th_team_nproc;
2591       int new_num = master_th->th.th_teams_size.nth;
2592       kmp_info_t **other_threads = team->t.t_threads;
2593       team->t.t_nproc = new_num;
2594       for (int i = 0; i < old_num; ++i) {
2595         other_threads[i]->th.th_team_nproc = new_num;
2596       }
2597       // Adjust states of non-used threads of the team
2598       for (int i = old_num; i < new_num; ++i) {
2599         // Re-initialize thread's barrier data.
2600         KMP_DEBUG_ASSERT(other_threads[i]);
2601         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2602         for (int b = 0; b < bs_last_barrier; ++b) {
2603           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2604           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2605 #if USE_DEBUGGER
2606           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2607 #endif
2608         }
2609         if (__kmp_tasking_mode != tskm_immediate_exec) {
2610           // Synchronize thread's task state
2611           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2612         }
2613       }
2614     }
2615
2616 #if OMPT_SUPPORT
2617     if (ompt_enabled.enabled) {
2618       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2619                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2620     }
2621 #endif
2622
2623     return;
2624   }
2625
2626   /* do cleanup and restore the parent team */
2627   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2628   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2629
2630   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2631
2632   /* jc: The following lock has instructions with REL and ACQ semantics,
2633      separating the parallel user code called in this parallel region
2634      from the serial user code called after this function returns. */
2635   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2636
2637   if (!master_th->th.th_teams_microtask ||
2638       team->t.t_level > master_th->th.th_teams_level) {
2639     /* Decrement our nested depth level */
2640     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2641   }
2642   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2643
2644 #if OMPT_SUPPORT
2645   if (ompt_enabled.enabled) {
2646     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2647     if (ompt_enabled.ompt_callback_implicit_task) {
2648       int flags = (team_microtask == (void *)__kmp_teams_master)
2649                       ? ompt_task_initial
2650                       : ompt_task_implicit;
2651       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2652       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2653           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2654           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2655     }
2656     task_info->frame.exit_frame = ompt_data_none;
2657     task_info->task_data = ompt_data_none;
2658   }
2659 #endif
2660
2661   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2662                 master_th, team));
2663   __kmp_pop_current_task_from_thread(master_th);
2664
2665   master_th->th.th_def_allocator = team->t.t_def_allocator;
2666
2667 #if OMPD_SUPPORT
2668   if (ompd_state & OMPD_ENABLE_BP)
2669     ompd_bp_parallel_end();
2670 #endif
2671   updateHWFPControl(team);
2672
2673   if (root->r.r_active != master_active)
2674     root->r.r_active = master_active;
2675
2676   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2677                             master_th)); // this will free worker threads
2678
2679   /* this race was fun to find. make sure the following is in the critical
2680      region otherwise assertions may fail occasionally since the old team may be
2681      reallocated and the hierarchy appears inconsistent. it is actually safe to
2682      run and won't cause any bugs, but will cause those assertion failures. it's
2683      only one deref&assign so might as well put this in the critical region */
2684   master_th->th.th_team = parent_team;
2685   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2686   master_th->th.th_team_master = parent_team->t.t_threads[0];
2687   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2688
2689   /* restore serialized team, if need be */
2690   if (parent_team->t.t_serialized &&
2691       parent_team != master_th->th.th_serial_team &&
2692       parent_team != root->r.r_root_team) {
2693     __kmp_free_team(root,
2694                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2695     master_th->th.th_serial_team = parent_team;
2696   }
2697
2698   if (__kmp_tasking_mode != tskm_immediate_exec) {
2699     if (master_th->th.th_task_state_top >
2700         0) { // Restore task state from memo stack
2701       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2702       // Remember primary thread's state if we re-use this nested hot team
2703       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2704           master_th->th.th_task_state;
2705       --master_th->th.th_task_state_top; // pop
2706       // Now restore state at this level
2707       master_th->th.th_task_state =
2708           master_th->th
2709               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2710     } else if (team != root->r.r_hot_team) {
2711       // Reset the task state of primary thread if we are not hot team because
2712       // in this case all the worker threads will be free, and their task state
2713       // will be reset. If not reset the primary's, the task state will be
2714       // inconsistent.
2715       master_th->th.th_task_state = 0;
2716     }
2717     // Copy the task team from the parent team to the primary thread
2718     master_th->th.th_task_team =
2719         parent_team->t.t_task_team[master_th->th.th_task_state];
2720     KA_TRACE(20,
2721              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2722               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2723               parent_team));
2724   }
2725
2726   // TODO: GEH - cannot do this assertion because root thread not set up as
2727   // executing
2728   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2729   master_th->th.th_current_task->td_flags.executing = 1;
2730
2731   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2732
2733 #if KMP_AFFINITY_SUPPORTED
2734   if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2735     __kmp_reset_root_init_mask(gtid);
2736   }
2737 #endif
2738 #if OMPT_SUPPORT
2739   int flags =
2740       OMPT_INVOKER(fork_context) |
2741       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2742                                                       : ompt_parallel_team);
2743   if (ompt_enabled.enabled) {
2744     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2745                     codeptr);
2746   }
2747 #endif
2748
2749   KMP_MB();
2750   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2751 }
2752
2753 /* Check whether we should push an internal control record onto the
2754    serial team stack.  If so, do it.  */
2755 void __kmp_save_internal_controls(kmp_info_t *thread) {
2756
2757   if (thread->th.th_team != thread->th.th_serial_team) {
2758     return;
2759   }
2760   if (thread->th.th_team->t.t_serialized > 1) {
2761     int push = 0;
2762
2763     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2764       push = 1;
2765     } else {
2766       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2767           thread->th.th_team->t.t_serialized) {
2768         push = 1;
2769       }
2770     }
2771     if (push) { /* push a record on the serial team's stack */
2772       kmp_internal_control_t *control =
2773           (kmp_internal_control_t *)__kmp_allocate(
2774               sizeof(kmp_internal_control_t));
2775
2776       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2777
2778       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2779
2780       control->next = thread->th.th_team->t.t_control_stack_top;
2781       thread->th.th_team->t.t_control_stack_top = control;
2782     }
2783   }
2784 }
2785
2786 /* Changes set_nproc */
2787 void __kmp_set_num_threads(int new_nth, int gtid) {
2788   kmp_info_t *thread;
2789   kmp_root_t *root;
2790
2791   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2792   KMP_DEBUG_ASSERT(__kmp_init_serial);
2793
2794   if (new_nth < 1)
2795     new_nth = 1;
2796   else if (new_nth > __kmp_max_nth)
2797     new_nth = __kmp_max_nth;
2798
2799   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2800   thread = __kmp_threads[gtid];
2801   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2802     return; // nothing to do
2803
2804   __kmp_save_internal_controls(thread);
2805
2806   set__nproc(thread, new_nth);
2807
2808   // If this omp_set_num_threads() call will cause the hot team size to be
2809   // reduced (in the absence of a num_threads clause), then reduce it now,
2810   // rather than waiting for the next parallel region.
2811   root = thread->th.th_root;
2812   if (__kmp_init_parallel && (!root->r.r_active) &&
2813       (root->r.r_hot_team->t.t_nproc > new_nth)
2814 #if KMP_NESTED_HOT_TEAMS
2815       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2816 #endif
2817   ) {
2818     kmp_team_t *hot_team = root->r.r_hot_team;
2819     int f;
2820
2821     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2822
2823     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2824       __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2825     }
2826     // Release the extra threads we don't need any more.
2827     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2828       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2829       if (__kmp_tasking_mode != tskm_immediate_exec) {
2830         // When decreasing team size, threads no longer in the team should unref
2831         // task team.
2832         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2833       }
2834       __kmp_free_thread(hot_team->t.t_threads[f]);
2835       hot_team->t.t_threads[f] = NULL;
2836     }
2837     hot_team->t.t_nproc = new_nth;
2838 #if KMP_NESTED_HOT_TEAMS
2839     if (thread->th.th_hot_teams) {
2840       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2841       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2842     }
2843 #endif
2844
2845     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2846       hot_team->t.b->update_num_threads(new_nth);
2847       __kmp_add_threads_to_team(hot_team, new_nth);
2848     }
2849
2850     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2851
2852     // Update the t_nproc field in the threads that are still active.
2853     for (f = 0; f < new_nth; f++) {
2854       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2855       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2856     }
2857     // Special flag in case omp_set_num_threads() call
2858     hot_team->t.t_size_changed = -1;
2859   }
2860 }
2861
2862 /* Changes max_active_levels */
2863 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2864   kmp_info_t *thread;
2865
2866   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2867                 "%d = (%d)\n",
2868                 gtid, max_active_levels));
2869   KMP_DEBUG_ASSERT(__kmp_init_serial);
2870
2871   // validate max_active_levels
2872   if (max_active_levels < 0) {
2873     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2874     // We ignore this call if the user has specified a negative value.
2875     // The current setting won't be changed. The last valid setting will be
2876     // used. A warning will be issued (if warnings are allowed as controlled by
2877     // the KMP_WARNINGS env var).
2878     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2879                   "max_active_levels for thread %d = (%d)\n",
2880                   gtid, max_active_levels));
2881     return;
2882   }
2883   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2884     // it's OK, the max_active_levels is within the valid range: [ 0;
2885     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2886     // We allow a zero value. (implementation defined behavior)
2887   } else {
2888     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2889                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2890     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2891     // Current upper limit is MAX_INT. (implementation defined behavior)
2892     // If the input exceeds the upper limit, we correct the input to be the
2893     // upper limit. (implementation defined behavior)
2894     // Actually, the flow should never get here until we use MAX_INT limit.
2895   }
2896   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2897                 "max_active_levels for thread %d = (%d)\n",
2898                 gtid, max_active_levels));
2899
2900   thread = __kmp_threads[gtid];
2901
2902   __kmp_save_internal_controls(thread);
2903
2904   set__max_active_levels(thread, max_active_levels);
2905 }
2906
2907 /* Gets max_active_levels */
2908 int __kmp_get_max_active_levels(int gtid) {
2909   kmp_info_t *thread;
2910
2911   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2912   KMP_DEBUG_ASSERT(__kmp_init_serial);
2913
2914   thread = __kmp_threads[gtid];
2915   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2916   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2917                 "curtask_maxaclevel=%d\n",
2918                 gtid, thread->th.th_current_task,
2919                 thread->th.th_current_task->td_icvs.max_active_levels));
2920   return thread->th.th_current_task->td_icvs.max_active_levels;
2921 }
2922
2923 // nteams-var per-device ICV
2924 void __kmp_set_num_teams(int num_teams) {
2925   if (num_teams > 0)
2926     __kmp_nteams = num_teams;
2927 }
2928 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2929 // teams-thread-limit-var per-device ICV
2930 void __kmp_set_teams_thread_limit(int limit) {
2931   if (limit > 0)
2932     __kmp_teams_thread_limit = limit;
2933 }
2934 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2935
2936 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2937 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2938
2939 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2940 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2941   kmp_info_t *thread;
2942   kmp_sched_t orig_kind;
2943   //    kmp_team_t *team;
2944
2945   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2946                 gtid, (int)kind, chunk));
2947   KMP_DEBUG_ASSERT(__kmp_init_serial);
2948
2949   // Check if the kind parameter is valid, correct if needed.
2950   // Valid parameters should fit in one of two intervals - standard or extended:
2951   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2952   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2953   orig_kind = kind;
2954   kind = __kmp_sched_without_mods(kind);
2955
2956   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2957       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2958     // TODO: Hint needs attention in case we change the default schedule.
2959     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2960               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2961               __kmp_msg_null);
2962     kind = kmp_sched_default;
2963     chunk = 0; // ignore chunk value in case of bad kind
2964   }
2965
2966   thread = __kmp_threads[gtid];
2967
2968   __kmp_save_internal_controls(thread);
2969
2970   if (kind < kmp_sched_upper_std) {
2971     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2972       // differ static chunked vs. unchunked:  chunk should be invalid to
2973       // indicate unchunked schedule (which is the default)
2974       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2975     } else {
2976       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2977           __kmp_sch_map[kind - kmp_sched_lower - 1];
2978     }
2979   } else {
2980     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2981     //    kmp_sched_lower - 2 ];
2982     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2983         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2984                       kmp_sched_lower - 2];
2985   }
2986   __kmp_sched_apply_mods_intkind(
2987       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2988   if (kind == kmp_sched_auto || chunk < 1) {
2989     // ignore parameter chunk for schedule auto
2990     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2991   } else {
2992     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2993   }
2994 }
2995
2996 /* Gets def_sched_var ICV values */
2997 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2998   kmp_info_t *thread;
2999   enum sched_type th_type;
3000
3001   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3002   KMP_DEBUG_ASSERT(__kmp_init_serial);
3003
3004   thread = __kmp_threads[gtid];
3005
3006   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3007   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3008   case kmp_sch_static:
3009   case kmp_sch_static_greedy:
3010   case kmp_sch_static_balanced:
3011     *kind = kmp_sched_static;
3012     __kmp_sched_apply_mods_stdkind(kind, th_type);
3013     *chunk = 0; // chunk was not set, try to show this fact via zero value
3014     return;
3015   case kmp_sch_static_chunked:
3016     *kind = kmp_sched_static;
3017     break;
3018   case kmp_sch_dynamic_chunked:
3019     *kind = kmp_sched_dynamic;
3020     break;
3021   case kmp_sch_guided_chunked:
3022   case kmp_sch_guided_iterative_chunked:
3023   case kmp_sch_guided_analytical_chunked:
3024     *kind = kmp_sched_guided;
3025     break;
3026   case kmp_sch_auto:
3027     *kind = kmp_sched_auto;
3028     break;
3029   case kmp_sch_trapezoidal:
3030     *kind = kmp_sched_trapezoidal;
3031     break;
3032 #if KMP_STATIC_STEAL_ENABLED
3033   case kmp_sch_static_steal:
3034     *kind = kmp_sched_static_steal;
3035     break;
3036 #endif
3037   default:
3038     KMP_FATAL(UnknownSchedulingType, th_type);
3039   }
3040
3041   __kmp_sched_apply_mods_stdkind(kind, th_type);
3042   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3043 }
3044
3045 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3046
3047   int ii, dd;
3048   kmp_team_t *team;
3049   kmp_info_t *thr;
3050
3051   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3052   KMP_DEBUG_ASSERT(__kmp_init_serial);
3053
3054   // validate level
3055   if (level == 0)
3056     return 0;
3057   if (level < 0)
3058     return -1;
3059   thr = __kmp_threads[gtid];
3060   team = thr->th.th_team;
3061   ii = team->t.t_level;
3062   if (level > ii)
3063     return -1;
3064
3065   if (thr->th.th_teams_microtask) {
3066     // AC: we are in teams region where multiple nested teams have same level
3067     int tlevel = thr->th.th_teams_level; // the level of the teams construct
3068     if (level <=
3069         tlevel) { // otherwise usual algorithm works (will not touch the teams)
3070       KMP_DEBUG_ASSERT(ii >= tlevel);
3071       // AC: As we need to pass by the teams league, we need to artificially
3072       // increase ii
3073       if (ii == tlevel) {
3074         ii += 2; // three teams have same level
3075       } else {
3076         ii++; // two teams have same level
3077       }
3078     }
3079   }
3080
3081   if (ii == level)
3082     return __kmp_tid_from_gtid(gtid);
3083
3084   dd = team->t.t_serialized;
3085   level++;
3086   while (ii > level) {
3087     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3088     }
3089     if ((team->t.t_serialized) && (!dd)) {
3090       team = team->t.t_parent;
3091       continue;
3092     }
3093     if (ii > level) {
3094       team = team->t.t_parent;
3095       dd = team->t.t_serialized;
3096       ii--;
3097     }
3098   }
3099
3100   return (dd > 1) ? (0) : (team->t.t_master_tid);
3101 }
3102
3103 int __kmp_get_team_size(int gtid, int level) {
3104
3105   int ii, dd;
3106   kmp_team_t *team;
3107   kmp_info_t *thr;
3108
3109   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3110   KMP_DEBUG_ASSERT(__kmp_init_serial);
3111
3112   // validate level
3113   if (level == 0)
3114     return 1;
3115   if (level < 0)
3116     return -1;
3117   thr = __kmp_threads[gtid];
3118   team = thr->th.th_team;
3119   ii = team->t.t_level;
3120   if (level > ii)
3121     return -1;
3122
3123   if (thr->th.th_teams_microtask) {
3124     // AC: we are in teams region where multiple nested teams have same level
3125     int tlevel = thr->th.th_teams_level; // the level of the teams construct
3126     if (level <=
3127         tlevel) { // otherwise usual algorithm works (will not touch the teams)
3128       KMP_DEBUG_ASSERT(ii >= tlevel);
3129       // AC: As we need to pass by the teams league, we need to artificially
3130       // increase ii
3131       if (ii == tlevel) {
3132         ii += 2; // three teams have same level
3133       } else {
3134         ii++; // two teams have same level
3135       }
3136     }
3137   }
3138
3139   while (ii > level) {
3140     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3141     }
3142     if (team->t.t_serialized && (!dd)) {
3143       team = team->t.t_parent;
3144       continue;
3145     }
3146     if (ii > level) {
3147       team = team->t.t_parent;
3148       ii--;
3149     }
3150   }
3151
3152   return team->t.t_nproc;
3153 }
3154
3155 kmp_r_sched_t __kmp_get_schedule_global() {
3156   // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3157   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3158   // independently. So one can get the updated schedule here.
3159
3160   kmp_r_sched_t r_sched;
3161
3162   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3163   // __kmp_guided. __kmp_sched should keep original value, so that user can set
3164   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3165   // different roots (even in OMP 2.5)
3166   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3167   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3168   if (s == kmp_sch_static) {
3169     // replace STATIC with more detailed schedule (balanced or greedy)
3170     r_sched.r_sched_type = __kmp_static;
3171   } else if (s == kmp_sch_guided_chunked) {
3172     // replace GUIDED with more detailed schedule (iterative or analytical)
3173     r_sched.r_sched_type = __kmp_guided;
3174   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3175     r_sched.r_sched_type = __kmp_sched;
3176   }
3177   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3178
3179   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3180     // __kmp_chunk may be wrong here (if it was not ever set)
3181     r_sched.chunk = KMP_DEFAULT_CHUNK;
3182   } else {
3183     r_sched.chunk = __kmp_chunk;
3184   }
3185
3186   return r_sched;
3187 }
3188
3189 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3190    at least argc number of *t_argv entries for the requested team. */
3191 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3192
3193   KMP_DEBUG_ASSERT(team);
3194   if (!realloc || argc > team->t.t_max_argc) {
3195
3196     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3197                    "current entries=%d\n",
3198                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3199     /* if previously allocated heap space for args, free them */
3200     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3201       __kmp_free((void *)team->t.t_argv);
3202
3203     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3204       /* use unused space in the cache line for arguments */
3205       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3206       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3207                      "argv entries\n",
3208                      team->t.t_id, team->t.t_max_argc));
3209       team->t.t_argv = &team->t.t_inline_argv[0];
3210       if (__kmp_storage_map) {
3211         __kmp_print_storage_map_gtid(
3212             -1, &team->t.t_inline_argv[0],
3213             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3214             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3215             team->t.t_id);
3216       }
3217     } else {
3218       /* allocate space for arguments in the heap */
3219       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3220                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3221                                : 2 * argc;
3222       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3223                      "argv entries\n",
3224                      team->t.t_id, team->t.t_max_argc));
3225       team->t.t_argv =
3226           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3227       if (__kmp_storage_map) {
3228         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3229                                      &team->t.t_argv[team->t.t_max_argc],
3230                                      sizeof(void *) * team->t.t_max_argc,
3231                                      "team_%d.t_argv", team->t.t_id);
3232       }
3233     }
3234   }
3235 }
3236
3237 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3238   int i;
3239   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3240   team->t.t_threads =
3241       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3242   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3243       sizeof(dispatch_shared_info_t) * num_disp_buff);
3244   team->t.t_dispatch =
3245       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3246   team->t.t_implicit_task_taskdata =
3247       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3248   team->t.t_max_nproc = max_nth;
3249
3250   /* setup dispatch buffers */
3251   for (i = 0; i < num_disp_buff; ++i) {
3252     team->t.t_disp_buffer[i].buffer_index = i;
3253     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3254   }
3255 }
3256
3257 static void __kmp_free_team_arrays(kmp_team_t *team) {
3258   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3259   int i;
3260   for (i = 0; i < team->t.t_max_nproc; ++i) {
3261     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3262       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3263       team->t.t_dispatch[i].th_disp_buffer = NULL;
3264     }
3265   }
3266 #if KMP_USE_HIER_SCHED
3267   __kmp_dispatch_free_hierarchies(team);
3268 #endif
3269   __kmp_free(team->t.t_threads);
3270   __kmp_free(team->t.t_disp_buffer);
3271   __kmp_free(team->t.t_dispatch);
3272   __kmp_free(team->t.t_implicit_task_taskdata);
3273   team->t.t_threads = NULL;
3274   team->t.t_disp_buffer = NULL;
3275   team->t.t_dispatch = NULL;
3276   team->t.t_implicit_task_taskdata = 0;
3277 }
3278
3279 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3280   kmp_info_t **oldThreads = team->t.t_threads;
3281
3282   __kmp_free(team->t.t_disp_buffer);
3283   __kmp_free(team->t.t_dispatch);
3284   __kmp_free(team->t.t_implicit_task_taskdata);
3285   __kmp_allocate_team_arrays(team, max_nth);
3286
3287   KMP_MEMCPY(team->t.t_threads, oldThreads,
3288              team->t.t_nproc * sizeof(kmp_info_t *));
3289
3290   __kmp_free(oldThreads);
3291 }
3292
3293 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3294
3295   kmp_r_sched_t r_sched =
3296       __kmp_get_schedule_global(); // get current state of scheduling globals
3297
3298   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3299
3300   kmp_internal_control_t g_icvs = {
3301     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3302     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3303     // adjustment of threads (per thread)
3304     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3305     // whether blocktime is explicitly set
3306     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3307 #if KMP_USE_MONITOR
3308     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3309 // intervals
3310 #endif
3311     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3312     // next parallel region (per thread)
3313     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3314     __kmp_cg_max_nth, // int thread_limit;
3315     __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3316     // on task. This is used in the case of target thread_limit
3317     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3318     // for max_active_levels
3319     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3320     // {sched,chunk} pair
3321     __kmp_nested_proc_bind.bind_types[0],
3322     __kmp_default_device,
3323     NULL // struct kmp_internal_control *next;
3324   };
3325
3326   return g_icvs;
3327 }
3328
3329 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3330
3331   kmp_internal_control_t gx_icvs;
3332   gx_icvs.serial_nesting_level =
3333       0; // probably =team->t.t_serial like in save_inter_controls
3334   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3335   gx_icvs.next = NULL;
3336
3337   return gx_icvs;
3338 }
3339
3340 static void __kmp_initialize_root(kmp_root_t *root) {
3341   int f;
3342   kmp_team_t *root_team;
3343   kmp_team_t *hot_team;
3344   int hot_team_max_nth;
3345   kmp_r_sched_t r_sched =
3346       __kmp_get_schedule_global(); // get current state of scheduling globals
3347   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3348   KMP_DEBUG_ASSERT(root);
3349   KMP_ASSERT(!root->r.r_begin);
3350
3351   /* setup the root state structure */
3352   __kmp_init_lock(&root->r.r_begin_lock);
3353   root->r.r_begin = FALSE;
3354   root->r.r_active = FALSE;
3355   root->r.r_in_parallel = 0;
3356   root->r.r_blocktime = __kmp_dflt_blocktime;
3357 #if KMP_AFFINITY_SUPPORTED
3358   root->r.r_affinity_assigned = FALSE;
3359 #endif
3360
3361   /* setup the root team for this task */
3362   /* allocate the root team structure */
3363   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3364
3365   root_team =
3366       __kmp_allocate_team(root,
3367                           1, // new_nproc
3368                           1, // max_nproc
3369 #if OMPT_SUPPORT
3370                           ompt_data_none, // root parallel id
3371 #endif
3372                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3373                           0 // argc
3374                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3375       );
3376 #if USE_DEBUGGER
3377   // Non-NULL value should be assigned to make the debugger display the root
3378   // team.
3379   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3380 #endif
3381
3382   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3383
3384   root->r.r_root_team = root_team;
3385   root_team->t.t_control_stack_top = NULL;
3386
3387   /* initialize root team */
3388   root_team->t.t_threads[0] = NULL;
3389   root_team->t.t_nproc = 1;
3390   root_team->t.t_serialized = 1;
3391   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3392   root_team->t.t_sched.sched = r_sched.sched;
3393   KA_TRACE(
3394       20,
3395       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3396        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3397
3398   /* setup the  hot team for this task */
3399   /* allocate the hot team structure */
3400   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3401
3402   hot_team =
3403       __kmp_allocate_team(root,
3404                           1, // new_nproc
3405                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3406 #if OMPT_SUPPORT
3407                           ompt_data_none, // root parallel id
3408 #endif
3409                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3410                           0 // argc
3411                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3412       );
3413   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3414
3415   root->r.r_hot_team = hot_team;
3416   root_team->t.t_control_stack_top = NULL;
3417
3418   /* first-time initialization */
3419   hot_team->t.t_parent = root_team;
3420
3421   /* initialize hot team */
3422   hot_team_max_nth = hot_team->t.t_max_nproc;
3423   for (f = 0; f < hot_team_max_nth; ++f) {
3424     hot_team->t.t_threads[f] = NULL;
3425   }
3426   hot_team->t.t_nproc = 1;
3427   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3428   hot_team->t.t_sched.sched = r_sched.sched;
3429   hot_team->t.t_size_changed = 0;
3430 }
3431
3432 #ifdef KMP_DEBUG
3433
3434 typedef struct kmp_team_list_item {
3435   kmp_team_p const *entry;
3436   struct kmp_team_list_item *next;
3437 } kmp_team_list_item_t;
3438 typedef kmp_team_list_item_t *kmp_team_list_t;
3439
3440 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3441     kmp_team_list_t list, // List of teams.
3442     kmp_team_p const *team // Team to add.
3443 ) {
3444
3445   // List must terminate with item where both entry and next are NULL.
3446   // Team is added to the list only once.
3447   // List is sorted in ascending order by team id.
3448   // Team id is *not* a key.
3449
3450   kmp_team_list_t l;
3451
3452   KMP_DEBUG_ASSERT(list != NULL);
3453   if (team == NULL) {
3454     return;
3455   }
3456
3457   __kmp_print_structure_team_accum(list, team->t.t_parent);
3458   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3459
3460   // Search list for the team.
3461   l = list;
3462   while (l->next != NULL && l->entry != team) {
3463     l = l->next;
3464   }
3465   if (l->next != NULL) {
3466     return; // Team has been added before, exit.
3467   }
3468
3469   // Team is not found. Search list again for insertion point.
3470   l = list;
3471   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3472     l = l->next;
3473   }
3474
3475   // Insert team.
3476   {
3477     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3478         sizeof(kmp_team_list_item_t));
3479     *item = *l;
3480     l->entry = team;
3481     l->next = item;
3482   }
3483 }
3484
3485 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3486
3487 ) {
3488   __kmp_printf("%s", title);
3489   if (team != NULL) {
3490     __kmp_printf("%2x %p\n", team->t.t_id, team);
3491   } else {
3492     __kmp_printf(" - (nil)\n");
3493   }
3494 }
3495
3496 static void __kmp_print_structure_thread(char const *title,
3497                                          kmp_info_p const *thread) {
3498   __kmp_printf("%s", title);
3499   if (thread != NULL) {
3500     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3501   } else {
3502     __kmp_printf(" - (nil)\n");
3503   }
3504 }
3505
3506 void __kmp_print_structure(void) {
3507
3508   kmp_team_list_t list;
3509
3510   // Initialize list of teams.
3511   list =
3512       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3513   list->entry = NULL;
3514   list->next = NULL;
3515
3516   __kmp_printf("\n------------------------------\nGlobal Thread "
3517                "Table\n------------------------------\n");
3518   {
3519     int gtid;
3520     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3521       __kmp_printf("%2d", gtid);
3522       if (__kmp_threads != NULL) {
3523         __kmp_printf(" %p", __kmp_threads[gtid]);
3524       }
3525       if (__kmp_root != NULL) {
3526         __kmp_printf(" %p", __kmp_root[gtid]);
3527       }
3528       __kmp_printf("\n");
3529     }
3530   }
3531
3532   // Print out __kmp_threads array.
3533   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3534                "----------\n");
3535   if (__kmp_threads != NULL) {
3536     int gtid;
3537     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3538       kmp_info_t const *thread = __kmp_threads[gtid];
3539       if (thread != NULL) {
3540         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3541         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3542         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3543         __kmp_print_structure_team("    Serial Team:  ",
3544                                    thread->th.th_serial_team);
3545         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3546         __kmp_print_structure_thread("    Primary:      ",
3547                                      thread->th.th_team_master);
3548         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3549         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3550         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3551         __kmp_print_structure_thread("    Next in pool: ",
3552                                      thread->th.th_next_pool);
3553         __kmp_printf("\n");
3554         __kmp_print_structure_team_accum(list, thread->th.th_team);
3555         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3556       }
3557     }
3558   } else {
3559     __kmp_printf("Threads array is not allocated.\n");
3560   }
3561
3562   // Print out __kmp_root array.
3563   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3564                "--------\n");
3565   if (__kmp_root != NULL) {
3566     int gtid;
3567     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3568       kmp_root_t const *root = __kmp_root[gtid];
3569       if (root != NULL) {
3570         __kmp_printf("GTID %2d %p:\n", gtid, root);
3571         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3572         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3573         __kmp_print_structure_thread("    Uber Thread:  ",
3574                                      root->r.r_uber_thread);
3575         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3576         __kmp_printf("    In Parallel:  %2d\n",
3577                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3578         __kmp_printf("\n");
3579         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3580         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3581       }
3582     }
3583   } else {
3584     __kmp_printf("Ubers array is not allocated.\n");
3585   }
3586
3587   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3588                "--------\n");
3589   while (list->next != NULL) {
3590     kmp_team_p const *team = list->entry;
3591     int i;
3592     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3593     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3594     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3595     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3596     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3597     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3598     for (i = 0; i < team->t.t_nproc; ++i) {
3599       __kmp_printf("    Thread %2d:      ", i);
3600       __kmp_print_structure_thread("", team->t.t_threads[i]);
3601     }
3602     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3603     __kmp_printf("\n");
3604     list = list->next;
3605   }
3606
3607   // Print out __kmp_thread_pool and __kmp_team_pool.
3608   __kmp_printf("\n------------------------------\nPools\n----------------------"
3609                "--------\n");
3610   __kmp_print_structure_thread("Thread pool:          ",
3611                                CCAST(kmp_info_t *, __kmp_thread_pool));
3612   __kmp_print_structure_team("Team pool:            ",
3613                              CCAST(kmp_team_t *, __kmp_team_pool));
3614   __kmp_printf("\n");
3615
3616   // Free team list.
3617   while (list != NULL) {
3618     kmp_team_list_item_t *item = list;
3619     list = list->next;
3620     KMP_INTERNAL_FREE(item);
3621   }
3622 }
3623
3624 #endif
3625
3626 //---------------------------------------------------------------------------
3627 //  Stuff for per-thread fast random number generator
3628 //  Table of primes
3629 static const unsigned __kmp_primes[] = {
3630     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3631     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3632     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3633     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3634     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3635     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3636     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3637     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3638     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3639     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3640     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3641
3642 //---------------------------------------------------------------------------
3643 //  __kmp_get_random: Get a random number using a linear congruential method.
3644 unsigned short __kmp_get_random(kmp_info_t *thread) {
3645   unsigned x = thread->th.th_x;
3646   unsigned short r = (unsigned short)(x >> 16);
3647
3648   thread->th.th_x = x * thread->th.th_a + 1;
3649
3650   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3651                 thread->th.th_info.ds.ds_tid, r));
3652
3653   return r;
3654 }
3655 //--------------------------------------------------------
3656 // __kmp_init_random: Initialize a random number generator
3657 void __kmp_init_random(kmp_info_t *thread) {
3658   unsigned seed = thread->th.th_info.ds.ds_tid;
3659
3660   thread->th.th_a =
3661       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3662   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3663   KA_TRACE(30,
3664            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3665 }
3666
3667 #if KMP_OS_WINDOWS
3668 /* reclaim array entries for root threads that are already dead, returns number
3669  * reclaimed */
3670 static int __kmp_reclaim_dead_roots(void) {
3671   int i, r = 0;
3672
3673   for (i = 0; i < __kmp_threads_capacity; ++i) {
3674     if (KMP_UBER_GTID(i) &&
3675         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3676         !__kmp_root[i]
3677              ->r.r_active) { // AC: reclaim only roots died in non-active state
3678       r += __kmp_unregister_root_other_thread(i);
3679     }
3680   }
3681   return r;
3682 }
3683 #endif
3684
3685 /* This function attempts to create free entries in __kmp_threads and
3686    __kmp_root, and returns the number of free entries generated.
3687
3688    For Windows* OS static library, the first mechanism used is to reclaim array
3689    entries for root threads that are already dead.
3690
3691    On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3692    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3693    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3694    threadprivate cache array has been created. Synchronization with
3695    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3696
3697    After any dead root reclamation, if the clipping value allows array expansion
3698    to result in the generation of a total of nNeed free slots, the function does
3699    that expansion. If not, nothing is done beyond the possible initial root
3700    thread reclamation.
3701
3702    If any argument is negative, the behavior is undefined. */
3703 static int __kmp_expand_threads(int nNeed) {
3704   int added = 0;
3705   int minimumRequiredCapacity;
3706   int newCapacity;
3707   kmp_info_t **newThreads;
3708   kmp_root_t **newRoot;
3709
3710   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3711   // resizing __kmp_threads does not need additional protection if foreign
3712   // threads are present
3713
3714 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3715   /* only for Windows static library */
3716   /* reclaim array entries for root threads that are already dead */
3717   added = __kmp_reclaim_dead_roots();
3718
3719   if (nNeed) {
3720     nNeed -= added;
3721     if (nNeed < 0)
3722       nNeed = 0;
3723   }
3724 #endif
3725   if (nNeed <= 0)
3726     return added;
3727
3728   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3729   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3730   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3731   // > __kmp_max_nth in one of two ways:
3732   //
3733   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3734   //    may not be reused by another thread, so we may need to increase
3735   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3736   //
3737   // 2) New foreign root(s) are encountered.  We always register new foreign
3738   //    roots. This may cause a smaller # of threads to be allocated at
3739   //    subsequent parallel regions, but the worker threads hang around (and
3740   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3741   //
3742   // Anyway, that is the reason for moving the check to see if
3743   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3744   // instead of having it performed here. -BB
3745
3746   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3747
3748   /* compute expansion headroom to check if we can expand */
3749   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3750     /* possible expansion too small -- give up */
3751     return added;
3752   }
3753   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3754
3755   newCapacity = __kmp_threads_capacity;
3756   do {
3757     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3758                                                           : __kmp_sys_max_nth;
3759   } while (newCapacity < minimumRequiredCapacity);
3760   newThreads = (kmp_info_t **)__kmp_allocate(
3761       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3762   newRoot =
3763       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3764   KMP_MEMCPY(newThreads, __kmp_threads,
3765              __kmp_threads_capacity * sizeof(kmp_info_t *));
3766   KMP_MEMCPY(newRoot, __kmp_root,
3767              __kmp_threads_capacity * sizeof(kmp_root_t *));
3768   // Put old __kmp_threads array on a list. Any ongoing references to the old
3769   // list will be valid. This list is cleaned up at library shutdown.
3770   kmp_old_threads_list_t *node =
3771       (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3772   node->threads = __kmp_threads;
3773   node->next = __kmp_old_threads_list;
3774   __kmp_old_threads_list = node;
3775
3776   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3777   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3778   added += newCapacity - __kmp_threads_capacity;
3779   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3780
3781   if (newCapacity > __kmp_tp_capacity) {
3782     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3783     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3784       __kmp_threadprivate_resize_cache(newCapacity);
3785     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3786       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3787     }
3788     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3789   }
3790
3791   return added;
3792 }
3793
3794 /* Register the current thread as a root thread and obtain our gtid. We must
3795    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3796    thread that calls from __kmp_do_serial_initialize() */
3797 int __kmp_register_root(int initial_thread) {
3798   kmp_info_t *root_thread;
3799   kmp_root_t *root;
3800   int gtid;
3801   int capacity;
3802   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3803   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3804   KMP_MB();
3805
3806   /* 2007-03-02:
3807      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3808      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3809      work as expected -- it may return false (that means there is at least one
3810      empty slot in __kmp_threads array), but it is possible the only free slot
3811      is #0, which is reserved for initial thread and so cannot be used for this
3812      one. Following code workarounds this bug.
3813
3814      However, right solution seems to be not reserving slot #0 for initial
3815      thread because:
3816      (1) there is no magic in slot #0,
3817      (2) we cannot detect initial thread reliably (the first thread which does
3818         serial initialization may be not a real initial thread).
3819   */
3820   capacity = __kmp_threads_capacity;
3821   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3822     --capacity;
3823   }
3824
3825   // If it is not for initializing the hidden helper team, we need to take
3826   // __kmp_hidden_helper_threads_num out of the capacity because it is included
3827   // in __kmp_threads_capacity.
3828   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3829     capacity -= __kmp_hidden_helper_threads_num;
3830   }
3831
3832   /* see if there are too many threads */
3833   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3834     if (__kmp_tp_cached) {
3835       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3836                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3837                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3838     } else {
3839       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3840                   __kmp_msg_null);
3841     }
3842   }
3843
3844   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3845   // 0: initial thread, also a regular OpenMP thread.
3846   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3847   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3848   // regular OpenMP threads.
3849   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3850     // Find an available thread slot for hidden helper thread. Slots for hidden
3851     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3852     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3853                    gtid <= __kmp_hidden_helper_threads_num;
3854          gtid++)
3855       ;
3856     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3857     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3858                  "hidden helper thread: T#%d\n",
3859                  gtid));
3860   } else {
3861     /* find an available thread slot */
3862     // Don't reassign the zero slot since we need that to only be used by
3863     // initial thread. Slots for hidden helper threads should also be skipped.
3864     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3865       gtid = 0;
3866     } else {
3867       for (gtid = __kmp_hidden_helper_threads_num + 1;
3868            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3869         ;
3870     }
3871     KA_TRACE(
3872         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3873     KMP_ASSERT(gtid < __kmp_threads_capacity);
3874   }
3875
3876   /* update global accounting */
3877   __kmp_all_nth++;
3878   TCW_4(__kmp_nth, __kmp_nth + 1);
3879
3880   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3881   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3882   if (__kmp_adjust_gtid_mode) {
3883     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3884       if (TCR_4(__kmp_gtid_mode) != 2) {
3885         TCW_4(__kmp_gtid_mode, 2);
3886       }
3887     } else {
3888       if (TCR_4(__kmp_gtid_mode) != 1) {
3889         TCW_4(__kmp_gtid_mode, 1);
3890       }
3891     }
3892   }
3893
3894 #ifdef KMP_ADJUST_BLOCKTIME
3895   /* Adjust blocktime to zero if necessary            */
3896   /* Middle initialization might not have occurred yet */
3897   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3898     if (__kmp_nth > __kmp_avail_proc) {
3899       __kmp_zero_bt = TRUE;
3900     }
3901   }
3902 #endif /* KMP_ADJUST_BLOCKTIME */
3903
3904   /* setup this new hierarchy */
3905   if (!(root = __kmp_root[gtid])) {
3906     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3907     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3908   }
3909
3910 #if KMP_STATS_ENABLED
3911   // Initialize stats as soon as possible (right after gtid assignment).
3912   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3913   __kmp_stats_thread_ptr->startLife();
3914   KMP_SET_THREAD_STATE(SERIAL_REGION);
3915   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3916 #endif
3917   __kmp_initialize_root(root);
3918
3919   /* setup new root thread structure */
3920   if (root->r.r_uber_thread) {
3921     root_thread = root->r.r_uber_thread;
3922   } else {
3923     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3924     if (__kmp_storage_map) {
3925       __kmp_print_thread_storage_map(root_thread, gtid);
3926     }
3927     root_thread->th.th_info.ds.ds_gtid = gtid;
3928 #if OMPT_SUPPORT
3929     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3930 #endif
3931     root_thread->th.th_root = root;
3932     if (__kmp_env_consistency_check) {
3933       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3934     }
3935 #if USE_FAST_MEMORY
3936     __kmp_initialize_fast_memory(root_thread);
3937 #endif /* USE_FAST_MEMORY */
3938
3939 #if KMP_USE_BGET
3940     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3941     __kmp_initialize_bget(root_thread);
3942 #endif
3943     __kmp_init_random(root_thread); // Initialize random number generator
3944   }
3945
3946   /* setup the serial team held in reserve by the root thread */
3947   if (!root_thread->th.th_serial_team) {
3948     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3949     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3950     root_thread->th.th_serial_team = __kmp_allocate_team(
3951         root, 1, 1,
3952 #if OMPT_SUPPORT
3953         ompt_data_none, // root parallel id
3954 #endif
3955         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3956   }
3957   KMP_ASSERT(root_thread->th.th_serial_team);
3958   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3959                 root_thread->th.th_serial_team));
3960
3961   /* drop root_thread into place */
3962   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3963
3964   root->r.r_root_team->t.t_threads[0] = root_thread;
3965   root->r.r_hot_team->t.t_threads[0] = root_thread;
3966   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3967   // AC: the team created in reserve, not for execution (it is unused for now).
3968   root_thread->th.th_serial_team->t.t_serialized = 0;
3969   root->r.r_uber_thread = root_thread;
3970
3971   /* initialize the thread, get it ready to go */
3972   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3973   TCW_4(__kmp_init_gtid, TRUE);
3974
3975   /* prepare the primary thread for get_gtid() */
3976   __kmp_gtid_set_specific(gtid);
3977
3978 #if USE_ITT_BUILD
3979   __kmp_itt_thread_name(gtid);
3980 #endif /* USE_ITT_BUILD */
3981
3982 #ifdef KMP_TDATA_GTID
3983   __kmp_gtid = gtid;
3984 #endif
3985   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3986   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3987
3988   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3989                 "plain=%u\n",
3990                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3991                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3992                 KMP_INIT_BARRIER_STATE));
3993   { // Initialize barrier data.
3994     int b;
3995     for (b = 0; b < bs_last_barrier; ++b) {
3996       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3997 #if USE_DEBUGGER
3998       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3999 #endif
4000     }
4001   }
4002   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4003                    KMP_INIT_BARRIER_STATE);
4004
4005 #if KMP_AFFINITY_SUPPORTED
4006   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4007   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4008   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4009   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4010 #endif /* KMP_AFFINITY_SUPPORTED */
4011   root_thread->th.th_def_allocator = __kmp_def_allocator;
4012   root_thread->th.th_prev_level = 0;
4013   root_thread->th.th_prev_num_threads = 1;
4014
4015   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4016   tmp->cg_root = root_thread;
4017   tmp->cg_thread_limit = __kmp_cg_max_nth;
4018   tmp->cg_nthreads = 1;
4019   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4020                  " cg_nthreads init to 1\n",
4021                  root_thread, tmp));
4022   tmp->up = NULL;
4023   root_thread->th.th_cg_roots = tmp;
4024
4025   __kmp_root_counter++;
4026
4027 #if OMPT_SUPPORT
4028   if (!initial_thread && ompt_enabled.enabled) {
4029
4030     kmp_info_t *root_thread = ompt_get_thread();
4031
4032     ompt_set_thread_state(root_thread, ompt_state_overhead);
4033
4034     if (ompt_enabled.ompt_callback_thread_begin) {
4035       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4036           ompt_thread_initial, __ompt_get_thread_data_internal());
4037     }
4038     ompt_data_t *task_data;
4039     ompt_data_t *parallel_data;
4040     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4041                                   NULL);
4042     if (ompt_enabled.ompt_callback_implicit_task) {
4043       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4044           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4045     }
4046
4047     ompt_set_thread_state(root_thread, ompt_state_work_serial);
4048   }
4049 #endif
4050 #if OMPD_SUPPORT
4051   if (ompd_state & OMPD_ENABLE_BP)
4052     ompd_bp_thread_begin();
4053 #endif
4054
4055   KMP_MB();
4056   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4057
4058   return gtid;
4059 }
4060
4061 #if KMP_NESTED_HOT_TEAMS
4062 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4063                                 const int max_level) {
4064   int i, n, nth;
4065   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4066   if (!hot_teams || !hot_teams[level].hot_team) {
4067     return 0;
4068   }
4069   KMP_DEBUG_ASSERT(level < max_level);
4070   kmp_team_t *team = hot_teams[level].hot_team;
4071   nth = hot_teams[level].hot_team_nth;
4072   n = nth - 1; // primary thread is not freed
4073   if (level < max_level - 1) {
4074     for (i = 0; i < nth; ++i) {
4075       kmp_info_t *th = team->t.t_threads[i];
4076       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4077       if (i > 0 && th->th.th_hot_teams) {
4078         __kmp_free(th->th.th_hot_teams);
4079         th->th.th_hot_teams = NULL;
4080       }
4081     }
4082   }
4083   __kmp_free_team(root, team, NULL);
4084   return n;
4085 }
4086 #endif
4087
4088 // Resets a root thread and clear its root and hot teams.
4089 // Returns the number of __kmp_threads entries directly and indirectly freed.
4090 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4091   kmp_team_t *root_team = root->r.r_root_team;
4092   kmp_team_t *hot_team = root->r.r_hot_team;
4093   int n = hot_team->t.t_nproc;
4094   int i;
4095
4096   KMP_DEBUG_ASSERT(!root->r.r_active);
4097
4098   root->r.r_root_team = NULL;
4099   root->r.r_hot_team = NULL;
4100   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4101   // before call to __kmp_free_team().
4102   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4103 #if KMP_NESTED_HOT_TEAMS
4104   if (__kmp_hot_teams_max_level >
4105       0) { // need to free nested hot teams and their threads if any
4106     for (i = 0; i < hot_team->t.t_nproc; ++i) {
4107       kmp_info_t *th = hot_team->t.t_threads[i];
4108       if (__kmp_hot_teams_max_level > 1) {
4109         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4110       }
4111       if (th->th.th_hot_teams) {
4112         __kmp_free(th->th.th_hot_teams);
4113         th->th.th_hot_teams = NULL;
4114       }
4115     }
4116   }
4117 #endif
4118   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4119
4120   // Before we can reap the thread, we need to make certain that all other
4121   // threads in the teams that had this root as ancestor have stopped trying to
4122   // steal tasks.
4123   if (__kmp_tasking_mode != tskm_immediate_exec) {
4124     __kmp_wait_to_unref_task_teams();
4125   }
4126
4127 #if KMP_OS_WINDOWS
4128   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4129   KA_TRACE(
4130       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4131            "\n",
4132            (LPVOID) & (root->r.r_uber_thread->th),
4133            root->r.r_uber_thread->th.th_info.ds.ds_thread));
4134   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4135 #endif /* KMP_OS_WINDOWS */
4136
4137 #if OMPD_SUPPORT
4138   if (ompd_state & OMPD_ENABLE_BP)
4139     ompd_bp_thread_end();
4140 #endif
4141
4142 #if OMPT_SUPPORT
4143   ompt_data_t *task_data;
4144   ompt_data_t *parallel_data;
4145   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4146                                 NULL);
4147   if (ompt_enabled.ompt_callback_implicit_task) {
4148     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4149         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4150   }
4151   if (ompt_enabled.ompt_callback_thread_end) {
4152     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4153         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4154   }
4155 #endif
4156
4157   TCW_4(__kmp_nth,
4158         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4159   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4160   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4161                  " to %d\n",
4162                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4163                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4164   if (i == 1) {
4165     // need to free contention group structure
4166     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4167                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
4168     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4169     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4170     root->r.r_uber_thread->th.th_cg_roots = NULL;
4171   }
4172   __kmp_reap_thread(root->r.r_uber_thread, 1);
4173
4174   // We canot put root thread to __kmp_thread_pool, so we have to reap it
4175   // instead of freeing.
4176   root->r.r_uber_thread = NULL;
4177   /* mark root as no longer in use */
4178   root->r.r_begin = FALSE;
4179
4180   return n;
4181 }
4182
4183 void __kmp_unregister_root_current_thread(int gtid) {
4184   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4185   /* this lock should be ok, since unregister_root_current_thread is never
4186      called during an abort, only during a normal close. furthermore, if you
4187      have the forkjoin lock, you should never try to get the initz lock */
4188   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4189   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4190     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4191                   "exiting T#%d\n",
4192                   gtid));
4193     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4194     return;
4195   }
4196   kmp_root_t *root = __kmp_root[gtid];
4197
4198   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4199   KMP_ASSERT(KMP_UBER_GTID(gtid));
4200   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4201   KMP_ASSERT(root->r.r_active == FALSE);
4202
4203   KMP_MB();
4204
4205   kmp_info_t *thread = __kmp_threads[gtid];
4206   kmp_team_t *team = thread->th.th_team;
4207   kmp_task_team_t *task_team = thread->th.th_task_team;
4208
4209   // we need to wait for the proxy tasks before finishing the thread
4210   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4211                             task_team->tt.tt_hidden_helper_task_encountered)) {
4212 #if OMPT_SUPPORT
4213     // the runtime is shutting down so we won't report any events
4214     thread->th.ompt_thread_info.state = ompt_state_undefined;
4215 #endif
4216     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4217   }
4218
4219   __kmp_reset_root(gtid, root);
4220
4221   KMP_MB();
4222   KC_TRACE(10,
4223            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4224
4225   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4226 }
4227
4228 #if KMP_OS_WINDOWS
4229 /* __kmp_forkjoin_lock must be already held
4230    Unregisters a root thread that is not the current thread.  Returns the number
4231    of __kmp_threads entries freed as a result. */
4232 static int __kmp_unregister_root_other_thread(int gtid) {
4233   kmp_root_t *root = __kmp_root[gtid];
4234   int r;
4235
4236   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4237   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4238   KMP_ASSERT(KMP_UBER_GTID(gtid));
4239   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4240   KMP_ASSERT(root->r.r_active == FALSE);
4241
4242   r = __kmp_reset_root(gtid, root);
4243   KC_TRACE(10,
4244            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4245   return r;
4246 }
4247 #endif
4248
4249 #if KMP_DEBUG
4250 void __kmp_task_info() {
4251
4252   kmp_int32 gtid = __kmp_entry_gtid();
4253   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4254   kmp_info_t *this_thr = __kmp_threads[gtid];
4255   kmp_team_t *steam = this_thr->th.th_serial_team;
4256   kmp_team_t *team = this_thr->th.th_team;
4257
4258   __kmp_printf(
4259       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4260       "ptask=%p\n",
4261       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4262       team->t.t_implicit_task_taskdata[tid].td_parent);
4263 }
4264 #endif // KMP_DEBUG
4265
4266 /* TODO optimize with one big memclr, take out what isn't needed, split
4267    responsibility to workers as much as possible, and delay initialization of
4268    features as much as possible  */
4269 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4270                                   int tid, int gtid) {
4271   /* this_thr->th.th_info.ds.ds_gtid is setup in
4272      kmp_allocate_thread/create_worker.
4273      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4274   KMP_DEBUG_ASSERT(this_thr != NULL);
4275   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4276   KMP_DEBUG_ASSERT(team);
4277   KMP_DEBUG_ASSERT(team->t.t_threads);
4278   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4279   kmp_info_t *master = team->t.t_threads[0];
4280   KMP_DEBUG_ASSERT(master);
4281   KMP_DEBUG_ASSERT(master->th.th_root);
4282
4283   KMP_MB();
4284
4285   TCW_SYNC_PTR(this_thr->th.th_team, team);
4286
4287   this_thr->th.th_info.ds.ds_tid = tid;
4288   this_thr->th.th_set_nproc = 0;
4289   if (__kmp_tasking_mode != tskm_immediate_exec)
4290     // When tasking is possible, threads are not safe to reap until they are
4291     // done tasking; this will be set when tasking code is exited in wait
4292     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4293   else // no tasking --> always safe to reap
4294     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4295   this_thr->th.th_set_proc_bind = proc_bind_default;
4296 #if KMP_AFFINITY_SUPPORTED
4297   this_thr->th.th_new_place = this_thr->th.th_current_place;
4298 #endif
4299   this_thr->th.th_root = master->th.th_root;
4300
4301   /* setup the thread's cache of the team structure */
4302   this_thr->th.th_team_nproc = team->t.t_nproc;
4303   this_thr->th.th_team_master = master;
4304   this_thr->th.th_team_serialized = team->t.t_serialized;
4305
4306   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4307
4308   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4309                 tid, gtid, this_thr, this_thr->th.th_current_task));
4310
4311   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4312                            team, tid, TRUE);
4313
4314   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4315                 tid, gtid, this_thr, this_thr->th.th_current_task));
4316   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4317   // __kmp_initialize_team()?
4318
4319   /* TODO no worksharing in speculative threads */
4320   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4321
4322   this_thr->th.th_local.this_construct = 0;
4323
4324   if (!this_thr->th.th_pri_common) {
4325     this_thr->th.th_pri_common =
4326         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4327     if (__kmp_storage_map) {
4328       __kmp_print_storage_map_gtid(
4329           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4330           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4331     }
4332     this_thr->th.th_pri_head = NULL;
4333   }
4334
4335   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4336       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4337     // Make new thread's CG root same as primary thread's
4338     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4339     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4340     if (tmp) {
4341       // worker changes CG, need to check if old CG should be freed
4342       int i = tmp->cg_nthreads--;
4343       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4344                      " on node %p of thread %p to %d\n",
4345                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4346       if (i == 1) {
4347         __kmp_free(tmp); // last thread left CG --> free it
4348       }
4349     }
4350     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4351     // Increment new thread's CG root's counter to add the new thread
4352     this_thr->th.th_cg_roots->cg_nthreads++;
4353     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4354                    " node %p of thread %p to %d\n",
4355                    this_thr, this_thr->th.th_cg_roots,
4356                    this_thr->th.th_cg_roots->cg_root,
4357                    this_thr->th.th_cg_roots->cg_nthreads));
4358     this_thr->th.th_current_task->td_icvs.thread_limit =
4359         this_thr->th.th_cg_roots->cg_thread_limit;
4360   }
4361
4362   /* Initialize dynamic dispatch */
4363   {
4364     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4365     // Use team max_nproc since this will never change for the team.
4366     size_t disp_size =
4367         sizeof(dispatch_private_info_t) *
4368         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4369     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4370                   team->t.t_max_nproc));
4371     KMP_ASSERT(dispatch);
4372     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4373     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4374
4375     dispatch->th_disp_index = 0;
4376     dispatch->th_doacross_buf_idx = 0;
4377     if (!dispatch->th_disp_buffer) {
4378       dispatch->th_disp_buffer =
4379           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4380
4381       if (__kmp_storage_map) {
4382         __kmp_print_storage_map_gtid(
4383             gtid, &dispatch->th_disp_buffer[0],
4384             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4385                                           ? 1
4386                                           : __kmp_dispatch_num_buffers],
4387             disp_size,
4388             "th_%d.th_dispatch.th_disp_buffer "
4389             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4390             gtid, team->t.t_id, gtid);
4391       }
4392     } else {
4393       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4394     }
4395
4396     dispatch->th_dispatch_pr_current = 0;
4397     dispatch->th_dispatch_sh_current = 0;
4398
4399     dispatch->th_deo_fcn = 0; /* ORDERED     */
4400     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4401   }
4402
4403   this_thr->th.th_next_pool = NULL;
4404
4405   if (!this_thr->th.th_task_state_memo_stack) {
4406     size_t i;
4407     this_thr->th.th_task_state_memo_stack =
4408         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4409     this_thr->th.th_task_state_top = 0;
4410     this_thr->th.th_task_state_stack_sz = 4;
4411     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4412          ++i) // zero init the stack
4413       this_thr->th.th_task_state_memo_stack[i] = 0;
4414   }
4415
4416   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4417   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4418
4419   KMP_MB();
4420 }
4421
4422 /* allocate a new thread for the requesting team. this is only called from
4423    within a forkjoin critical section. we will first try to get an available
4424    thread from the thread pool. if none is available, we will fork a new one
4425    assuming we are able to create a new one. this should be assured, as the
4426    caller should check on this first. */
4427 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4428                                   int new_tid) {
4429   kmp_team_t *serial_team;
4430   kmp_info_t *new_thr;
4431   int new_gtid;
4432
4433   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4434   KMP_DEBUG_ASSERT(root && team);
4435 #if !KMP_NESTED_HOT_TEAMS
4436   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4437 #endif
4438   KMP_MB();
4439
4440   /* first, try to get one from the thread pool */
4441   if (__kmp_thread_pool) {
4442     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4443     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4444     if (new_thr == __kmp_thread_pool_insert_pt) {
4445       __kmp_thread_pool_insert_pt = NULL;
4446     }
4447     TCW_4(new_thr->th.th_in_pool, FALSE);
4448     __kmp_suspend_initialize_thread(new_thr);
4449     __kmp_lock_suspend_mx(new_thr);
4450     if (new_thr->th.th_active_in_pool == TRUE) {
4451       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4452       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4453       new_thr->th.th_active_in_pool = FALSE;
4454     }
4455     __kmp_unlock_suspend_mx(new_thr);
4456
4457     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4458                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4459     KMP_ASSERT(!new_thr->th.th_team);
4460     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4461
4462     /* setup the thread structure */
4463     __kmp_initialize_info(new_thr, team, new_tid,
4464                           new_thr->th.th_info.ds.ds_gtid);
4465     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4466
4467     TCW_4(__kmp_nth, __kmp_nth + 1);
4468
4469     new_thr->th.th_task_state = 0;
4470     new_thr->th.th_task_state_top = 0;
4471     new_thr->th.th_task_state_stack_sz = 4;
4472
4473     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4474       // Make sure pool thread has transitioned to waiting on own thread struct
4475       KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4476       // Thread activated in __kmp_allocate_team when increasing team size
4477     }
4478
4479 #ifdef KMP_ADJUST_BLOCKTIME
4480     /* Adjust blocktime back to zero if necessary */
4481     /* Middle initialization might not have occurred yet */
4482     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4483       if (__kmp_nth > __kmp_avail_proc) {
4484         __kmp_zero_bt = TRUE;
4485       }
4486     }
4487 #endif /* KMP_ADJUST_BLOCKTIME */
4488
4489 #if KMP_DEBUG
4490     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4491     // KMP_BARRIER_PARENT_FLAG.
4492     int b;
4493     kmp_balign_t *balign = new_thr->th.th_bar;
4494     for (b = 0; b < bs_last_barrier; ++b)
4495       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4496 #endif
4497
4498     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4499                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4500
4501     KMP_MB();
4502     return new_thr;
4503   }
4504
4505   /* no, well fork a new one */
4506   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4507   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4508
4509 #if KMP_USE_MONITOR
4510   // If this is the first worker thread the RTL is creating, then also
4511   // launch the monitor thread.  We try to do this as early as possible.
4512   if (!TCR_4(__kmp_init_monitor)) {
4513     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4514     if (!TCR_4(__kmp_init_monitor)) {
4515       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4516       TCW_4(__kmp_init_monitor, 1);
4517       __kmp_create_monitor(&__kmp_monitor);
4518       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4519 #if KMP_OS_WINDOWS
4520       // AC: wait until monitor has started. This is a fix for CQ232808.
4521       // The reason is that if the library is loaded/unloaded in a loop with
4522       // small (parallel) work in between, then there is high probability that
4523       // monitor thread started after the library shutdown. At shutdown it is
4524       // too late to cope with the problem, because when the primary thread is
4525       // in DllMain (process detach) the monitor has no chances to start (it is
4526       // blocked), and primary thread has no means to inform the monitor that
4527       // the library has gone, because all the memory which the monitor can
4528       // access is going to be released/reset.
4529       while (TCR_4(__kmp_init_monitor) < 2) {
4530         KMP_YIELD(TRUE);
4531       }
4532       KF_TRACE(10, ("after monitor thread has started\n"));
4533 #endif
4534     }
4535     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4536   }
4537 #endif
4538
4539   KMP_MB();
4540
4541   {
4542     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4543                              ? 1
4544                              : __kmp_hidden_helper_threads_num + 1;
4545
4546     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4547          ++new_gtid) {
4548       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4549     }
4550
4551     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4552       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4553     }
4554   }
4555
4556   /* allocate space for it. */
4557   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4558
4559   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4560
4561 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4562   // suppress race conditions detection on synchronization flags in debug mode
4563   // this helps to analyze library internals eliminating false positives
4564   __itt_suppress_mark_range(
4565       __itt_suppress_range, __itt_suppress_threading_errors,
4566       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4567   __itt_suppress_mark_range(
4568       __itt_suppress_range, __itt_suppress_threading_errors,
4569       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4570 #if KMP_OS_WINDOWS
4571   __itt_suppress_mark_range(
4572       __itt_suppress_range, __itt_suppress_threading_errors,
4573       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4574 #else
4575   __itt_suppress_mark_range(__itt_suppress_range,
4576                             __itt_suppress_threading_errors,
4577                             &new_thr->th.th_suspend_init_count,
4578                             sizeof(new_thr->th.th_suspend_init_count));
4579 #endif
4580   // TODO: check if we need to also suppress b_arrived flags
4581   __itt_suppress_mark_range(__itt_suppress_range,
4582                             __itt_suppress_threading_errors,
4583                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4584                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4585   __itt_suppress_mark_range(__itt_suppress_range,
4586                             __itt_suppress_threading_errors,
4587                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4588                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4589   __itt_suppress_mark_range(__itt_suppress_range,
4590                             __itt_suppress_threading_errors,
4591                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4592                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4593 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4594   if (__kmp_storage_map) {
4595     __kmp_print_thread_storage_map(new_thr, new_gtid);
4596   }
4597
4598   // add the reserve serialized team, initialized from the team's primary thread
4599   {
4600     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4601     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4602     new_thr->th.th_serial_team = serial_team =
4603         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4604 #if OMPT_SUPPORT
4605                                           ompt_data_none, // root parallel id
4606 #endif
4607                                           proc_bind_default, &r_icvs,
4608                                           0 USE_NESTED_HOT_ARG(NULL));
4609   }
4610   KMP_ASSERT(serial_team);
4611   serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4612   // execution (it is unused for now).
4613   serial_team->t.t_threads[0] = new_thr;
4614   KF_TRACE(10,
4615            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4616             new_thr));
4617
4618   /* setup the thread structures */
4619   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4620
4621 #if USE_FAST_MEMORY
4622   __kmp_initialize_fast_memory(new_thr);
4623 #endif /* USE_FAST_MEMORY */
4624
4625 #if KMP_USE_BGET
4626   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4627   __kmp_initialize_bget(new_thr);
4628 #endif
4629
4630   __kmp_init_random(new_thr); // Initialize random number generator
4631
4632   /* Initialize these only once when thread is grabbed for a team allocation */
4633   KA_TRACE(20,
4634            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4635             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4636
4637   int b;
4638   kmp_balign_t *balign = new_thr->th.th_bar;
4639   for (b = 0; b < bs_last_barrier; ++b) {
4640     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4641     balign[b].bb.team = NULL;
4642     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4643     balign[b].bb.use_oncore_barrier = 0;
4644   }
4645
4646   TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4647   new_thr->th.th_sleep_loc_type = flag_unset;
4648
4649   new_thr->th.th_spin_here = FALSE;
4650   new_thr->th.th_next_waiting = 0;
4651 #if KMP_OS_UNIX
4652   new_thr->th.th_blocking = false;
4653 #endif
4654
4655 #if KMP_AFFINITY_SUPPORTED
4656   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4657   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4658   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4659   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4660 #endif
4661   new_thr->th.th_def_allocator = __kmp_def_allocator;
4662   new_thr->th.th_prev_level = 0;
4663   new_thr->th.th_prev_num_threads = 1;
4664
4665   TCW_4(new_thr->th.th_in_pool, FALSE);
4666   new_thr->th.th_active_in_pool = FALSE;
4667   TCW_4(new_thr->th.th_active, TRUE);
4668
4669   /* adjust the global counters */
4670   __kmp_all_nth++;
4671   __kmp_nth++;
4672
4673   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4674   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4675   if (__kmp_adjust_gtid_mode) {
4676     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4677       if (TCR_4(__kmp_gtid_mode) != 2) {
4678         TCW_4(__kmp_gtid_mode, 2);
4679       }
4680     } else {
4681       if (TCR_4(__kmp_gtid_mode) != 1) {
4682         TCW_4(__kmp_gtid_mode, 1);
4683       }
4684     }
4685   }
4686
4687 #ifdef KMP_ADJUST_BLOCKTIME
4688   /* Adjust blocktime back to zero if necessary       */
4689   /* Middle initialization might not have occurred yet */
4690   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4691     if (__kmp_nth > __kmp_avail_proc) {
4692       __kmp_zero_bt = TRUE;
4693     }
4694   }
4695 #endif /* KMP_ADJUST_BLOCKTIME */
4696
4697 #if KMP_AFFINITY_SUPPORTED
4698   // Set the affinity and topology information for new thread
4699   __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4700 #endif
4701
4702   /* actually fork it and create the new worker thread */
4703   KF_TRACE(
4704       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4705   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4706   KF_TRACE(10,
4707            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4708
4709   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4710                 new_gtid));
4711   KMP_MB();
4712   return new_thr;
4713 }
4714
4715 /* Reinitialize team for reuse.
4716    The hot team code calls this case at every fork barrier, so EPCC barrier
4717    test are extremely sensitive to changes in it, esp. writes to the team
4718    struct, which cause a cache invalidation in all threads.
4719    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4720 static void __kmp_reinitialize_team(kmp_team_t *team,
4721                                     kmp_internal_control_t *new_icvs,
4722                                     ident_t *loc) {
4723   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4724                 team->t.t_threads[0], team));
4725   KMP_DEBUG_ASSERT(team && new_icvs);
4726   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4727   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4728
4729   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4730   // Copy ICVs to the primary thread's implicit taskdata
4731   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4732   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4733
4734   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4735                 team->t.t_threads[0], team));
4736 }
4737
4738 /* Initialize the team data structure.
4739    This assumes the t_threads and t_max_nproc are already set.
4740    Also, we don't touch the arguments */
4741 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4742                                   kmp_internal_control_t *new_icvs,
4743                                   ident_t *loc) {
4744   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4745
4746   /* verify */
4747   KMP_DEBUG_ASSERT(team);
4748   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4749   KMP_DEBUG_ASSERT(team->t.t_threads);
4750   KMP_MB();
4751
4752   team->t.t_master_tid = 0; /* not needed */
4753   /* team->t.t_master_bar;        not needed */
4754   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4755   team->t.t_nproc = new_nproc;
4756
4757   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4758   team->t.t_next_pool = NULL;
4759   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4760    * up hot team */
4761
4762   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4763   team->t.t_invoke = NULL; /* not needed */
4764
4765   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4766   team->t.t_sched.sched = new_icvs->sched.sched;
4767
4768 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4769   team->t.t_fp_control_saved = FALSE; /* not needed */
4770   team->t.t_x87_fpu_control_word = 0; /* not needed */
4771   team->t.t_mxcsr = 0; /* not needed */
4772 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4773
4774   team->t.t_construct = 0;
4775
4776   team->t.t_ordered.dt.t_value = 0;
4777   team->t.t_master_active = FALSE;
4778
4779 #ifdef KMP_DEBUG
4780   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4781 #endif
4782 #if KMP_OS_WINDOWS
4783   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4784 #endif
4785
4786   team->t.t_control_stack_top = NULL;
4787
4788   __kmp_reinitialize_team(team, new_icvs, loc);
4789
4790   KMP_MB();
4791   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4792 }
4793
4794 #if KMP_AFFINITY_SUPPORTED
4795 static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4796                                           int first, int last, int newp) {
4797   th->th.th_first_place = first;
4798   th->th.th_last_place = last;
4799   th->th.th_new_place = newp;
4800   if (newp != th->th.th_current_place) {
4801     if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4802       team->t.t_display_affinity = 1;
4803     // Copy topology information associated with the new place
4804     th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4805     th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4806   }
4807 }
4808
4809 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4810 // It calculates the worker + primary thread's partition based upon the parent
4811 // thread's partition, and binds each worker to a thread in their partition.
4812 // The primary thread's partition should already include its current binding.
4813 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4814   // Do not partition places for the hidden helper team
4815   if (KMP_HIDDEN_HELPER_TEAM(team))
4816     return;
4817   // Copy the primary thread's place partition to the team struct
4818   kmp_info_t *master_th = team->t.t_threads[0];
4819   KMP_DEBUG_ASSERT(master_th != NULL);
4820   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4821   int first_place = master_th->th.th_first_place;
4822   int last_place = master_th->th.th_last_place;
4823   int masters_place = master_th->th.th_current_place;
4824   int num_masks = __kmp_affinity.num_masks;
4825   team->t.t_first_place = first_place;
4826   team->t.t_last_place = last_place;
4827
4828   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4829                 "bound to place %d partition = [%d,%d]\n",
4830                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4831                 team->t.t_id, masters_place, first_place, last_place));
4832
4833   switch (proc_bind) {
4834
4835   case proc_bind_default:
4836     // Serial teams might have the proc_bind policy set to proc_bind_default.
4837     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4838     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4839     break;
4840
4841   case proc_bind_primary: {
4842     int f;
4843     int n_th = team->t.t_nproc;
4844     for (f = 1; f < n_th; f++) {
4845       kmp_info_t *th = team->t.t_threads[f];
4846       KMP_DEBUG_ASSERT(th != NULL);
4847       __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4848
4849       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4850                      "partition = [%d,%d]\n",
4851                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4852                      f, masters_place, first_place, last_place));
4853     }
4854   } break;
4855
4856   case proc_bind_close: {
4857     int f;
4858     int n_th = team->t.t_nproc;
4859     int n_places;
4860     if (first_place <= last_place) {
4861       n_places = last_place - first_place + 1;
4862     } else {
4863       n_places = num_masks - first_place + last_place + 1;
4864     }
4865     if (n_th <= n_places) {
4866       int place = masters_place;
4867       for (f = 1; f < n_th; f++) {
4868         kmp_info_t *th = team->t.t_threads[f];
4869         KMP_DEBUG_ASSERT(th != NULL);
4870
4871         if (place == last_place) {
4872           place = first_place;
4873         } else if (place == (num_masks - 1)) {
4874           place = 0;
4875         } else {
4876           place++;
4877         }
4878         __kmp_set_thread_place(team, th, first_place, last_place, place);
4879
4880         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4881                        "partition = [%d,%d]\n",
4882                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4883                        team->t.t_id, f, place, first_place, last_place));
4884       }
4885     } else {
4886       int S, rem, gap, s_count;
4887       S = n_th / n_places;
4888       s_count = 0;
4889       rem = n_th - (S * n_places);
4890       gap = rem > 0 ? n_places / rem : n_places;
4891       int place = masters_place;
4892       int gap_ct = gap;
4893       for (f = 0; f < n_th; f++) {
4894         kmp_info_t *th = team->t.t_threads[f];
4895         KMP_DEBUG_ASSERT(th != NULL);
4896
4897         __kmp_set_thread_place(team, th, first_place, last_place, place);
4898         s_count++;
4899
4900         if ((s_count == S) && rem && (gap_ct == gap)) {
4901           // do nothing, add an extra thread to place on next iteration
4902         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4903           // we added an extra thread to this place; move to next place
4904           if (place == last_place) {
4905             place = first_place;
4906           } else if (place == (num_masks - 1)) {
4907             place = 0;
4908           } else {
4909             place++;
4910           }
4911           s_count = 0;
4912           gap_ct = 1;
4913           rem--;
4914         } else if (s_count == S) { // place full; don't add extra
4915           if (place == last_place) {
4916             place = first_place;
4917           } else if (place == (num_masks - 1)) {
4918             place = 0;
4919           } else {
4920             place++;
4921           }
4922           gap_ct++;
4923           s_count = 0;
4924         }
4925
4926         KA_TRACE(100,
4927                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4928                   "partition = [%d,%d]\n",
4929                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4930                   th->th.th_new_place, first_place, last_place));
4931       }
4932       KMP_DEBUG_ASSERT(place == masters_place);
4933     }
4934   } break;
4935
4936   case proc_bind_spread: {
4937     int f;
4938     int n_th = team->t.t_nproc;
4939     int n_places;
4940     int thidx;
4941     if (first_place <= last_place) {
4942       n_places = last_place - first_place + 1;
4943     } else {
4944       n_places = num_masks - first_place + last_place + 1;
4945     }
4946     if (n_th <= n_places) {
4947       int place = -1;
4948
4949       if (n_places != num_masks) {
4950         int S = n_places / n_th;
4951         int s_count, rem, gap, gap_ct;
4952
4953         place = masters_place;
4954         rem = n_places - n_th * S;
4955         gap = rem ? n_th / rem : 1;
4956         gap_ct = gap;
4957         thidx = n_th;
4958         if (update_master_only == 1)
4959           thidx = 1;
4960         for (f = 0; f < thidx; f++) {
4961           kmp_info_t *th = team->t.t_threads[f];
4962           KMP_DEBUG_ASSERT(th != NULL);
4963
4964           int fplace = place, nplace = place;
4965           s_count = 1;
4966           while (s_count < S) {
4967             if (place == last_place) {
4968               place = first_place;
4969             } else if (place == (num_masks - 1)) {
4970               place = 0;
4971             } else {
4972               place++;
4973             }
4974             s_count++;
4975           }
4976           if (rem && (gap_ct == gap)) {
4977             if (place == last_place) {
4978               place = first_place;
4979             } else if (place == (num_masks - 1)) {
4980               place = 0;
4981             } else {
4982               place++;
4983             }
4984             rem--;
4985             gap_ct = 0;
4986           }
4987           __kmp_set_thread_place(team, th, fplace, place, nplace);
4988           gap_ct++;
4989
4990           if (place == last_place) {
4991             place = first_place;
4992           } else if (place == (num_masks - 1)) {
4993             place = 0;
4994           } else {
4995             place++;
4996           }
4997
4998           KA_TRACE(100,
4999                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5000                     "partition = [%d,%d], num_masks: %u\n",
5001                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5002                     f, th->th.th_new_place, th->th.th_first_place,
5003                     th->th.th_last_place, num_masks));
5004         }
5005       } else {
5006         /* Having uniform space of available computation places I can create
5007            T partitions of round(P/T) size and put threads into the first
5008            place of each partition. */
5009         double current = static_cast<double>(masters_place);
5010         double spacing =
5011             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5012         int first, last;
5013         kmp_info_t *th;
5014
5015         thidx = n_th + 1;
5016         if (update_master_only == 1)
5017           thidx = 1;
5018         for (f = 0; f < thidx; f++) {
5019           first = static_cast<int>(current);
5020           last = static_cast<int>(current + spacing) - 1;
5021           KMP_DEBUG_ASSERT(last >= first);
5022           if (first >= n_places) {
5023             if (masters_place) {
5024               first -= n_places;
5025               last -= n_places;
5026               if (first == (masters_place + 1)) {
5027                 KMP_DEBUG_ASSERT(f == n_th);
5028                 first--;
5029               }
5030               if (last == masters_place) {
5031                 KMP_DEBUG_ASSERT(f == (n_th - 1));
5032                 last--;
5033               }
5034             } else {
5035               KMP_DEBUG_ASSERT(f == n_th);
5036               first = 0;
5037               last = 0;
5038             }
5039           }
5040           if (last >= n_places) {
5041             last = (n_places - 1);
5042           }
5043           place = first;
5044           current += spacing;
5045           if (f < n_th) {
5046             KMP_DEBUG_ASSERT(0 <= first);
5047             KMP_DEBUG_ASSERT(n_places > first);
5048             KMP_DEBUG_ASSERT(0 <= last);
5049             KMP_DEBUG_ASSERT(n_places > last);
5050             KMP_DEBUG_ASSERT(last_place >= first_place);
5051             th = team->t.t_threads[f];
5052             KMP_DEBUG_ASSERT(th);
5053             __kmp_set_thread_place(team, th, first, last, place);
5054             KA_TRACE(100,
5055                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5056                       "partition = [%d,%d], spacing = %.4f\n",
5057                       __kmp_gtid_from_thread(team->t.t_threads[f]),
5058                       team->t.t_id, f, th->th.th_new_place,
5059                       th->th.th_first_place, th->th.th_last_place, spacing));
5060           }
5061         }
5062       }
5063       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5064     } else {
5065       int S, rem, gap, s_count;
5066       S = n_th / n_places;
5067       s_count = 0;
5068       rem = n_th - (S * n_places);
5069       gap = rem > 0 ? n_places / rem : n_places;
5070       int place = masters_place;
5071       int gap_ct = gap;
5072       thidx = n_th;
5073       if (update_master_only == 1)
5074         thidx = 1;
5075       for (f = 0; f < thidx; f++) {
5076         kmp_info_t *th = team->t.t_threads[f];
5077         KMP_DEBUG_ASSERT(th != NULL);
5078
5079         __kmp_set_thread_place(team, th, place, place, place);
5080         s_count++;
5081
5082         if ((s_count == S) && rem && (gap_ct == gap)) {
5083           // do nothing, add an extra thread to place on next iteration
5084         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5085           // we added an extra thread to this place; move on to next place
5086           if (place == last_place) {
5087             place = first_place;
5088           } else if (place == (num_masks - 1)) {
5089             place = 0;
5090           } else {
5091             place++;
5092           }
5093           s_count = 0;
5094           gap_ct = 1;
5095           rem--;
5096         } else if (s_count == S) { // place is full; don't add extra thread
5097           if (place == last_place) {
5098             place = first_place;
5099           } else if (place == (num_masks - 1)) {
5100             place = 0;
5101           } else {
5102             place++;
5103           }
5104           gap_ct++;
5105           s_count = 0;
5106         }
5107
5108         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5109                        "partition = [%d,%d]\n",
5110                        __kmp_gtid_from_thread(team->t.t_threads[f]),
5111                        team->t.t_id, f, th->th.th_new_place,
5112                        th->th.th_first_place, th->th.th_last_place));
5113       }
5114       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5115     }
5116   } break;
5117
5118   default:
5119     break;
5120   }
5121
5122   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5123 }
5124
5125 #endif // KMP_AFFINITY_SUPPORTED
5126
5127 /* allocate a new team data structure to use.  take one off of the free pool if
5128    available */
5129 kmp_team_t *
5130 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5131 #if OMPT_SUPPORT
5132                     ompt_data_t ompt_parallel_data,
5133 #endif
5134                     kmp_proc_bind_t new_proc_bind,
5135                     kmp_internal_control_t *new_icvs,
5136                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5137   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5138   int f;
5139   kmp_team_t *team;
5140   int use_hot_team = !root->r.r_active;
5141   int level = 0;
5142   int do_place_partition = 1;
5143
5144   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5145   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5146   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5147   KMP_MB();
5148
5149 #if KMP_NESTED_HOT_TEAMS
5150   kmp_hot_team_ptr_t *hot_teams;
5151   if (master) {
5152     team = master->th.th_team;
5153     level = team->t.t_active_level;
5154     if (master->th.th_teams_microtask) { // in teams construct?
5155       if (master->th.th_teams_size.nteams > 1 &&
5156           ( // #teams > 1
5157               team->t.t_pkfn ==
5158                   (microtask_t)__kmp_teams_master || // inner fork of the teams
5159               master->th.th_teams_level <
5160                   team->t.t_level)) { // or nested parallel inside the teams
5161         ++level; // not increment if #teams==1, or for outer fork of the teams;
5162         // increment otherwise
5163       }
5164       // Do not perform the place partition if inner fork of the teams
5165       // Wait until nested parallel region encountered inside teams construct
5166       if ((master->th.th_teams_size.nteams == 1 &&
5167            master->th.th_teams_level >= team->t.t_level) ||
5168           (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5169         do_place_partition = 0;
5170     }
5171     hot_teams = master->th.th_hot_teams;
5172     if (level < __kmp_hot_teams_max_level && hot_teams &&
5173         hot_teams[level].hot_team) {
5174       // hot team has already been allocated for given level
5175       use_hot_team = 1;
5176     } else {
5177       use_hot_team = 0;
5178     }
5179   } else {
5180     // check we won't access uninitialized hot_teams, just in case
5181     KMP_DEBUG_ASSERT(new_nproc == 1);
5182   }
5183 #endif
5184   // Optimization to use a "hot" team
5185   if (use_hot_team && new_nproc > 1) {
5186     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5187 #if KMP_NESTED_HOT_TEAMS
5188     team = hot_teams[level].hot_team;
5189 #else
5190     team = root->r.r_hot_team;
5191 #endif
5192 #if KMP_DEBUG
5193     if (__kmp_tasking_mode != tskm_immediate_exec) {
5194       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5195                     "task_team[1] = %p before reinit\n",
5196                     team->t.t_task_team[0], team->t.t_task_team[1]));
5197     }
5198 #endif
5199
5200     if (team->t.t_nproc != new_nproc &&
5201         __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5202       // Distributed barrier may need a resize
5203       int old_nthr = team->t.t_nproc;
5204       __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5205     }
5206
5207     // If not doing the place partition, then reset the team's proc bind
5208     // to indicate that partitioning of all threads still needs to take place
5209     if (do_place_partition == 0)
5210       team->t.t_proc_bind = proc_bind_default;
5211     // Has the number of threads changed?
5212     /* Let's assume the most common case is that the number of threads is
5213        unchanged, and put that case first. */
5214     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5215       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5216       // This case can mean that omp_set_num_threads() was called and the hot
5217       // team size was already reduced, so we check the special flag
5218       if (team->t.t_size_changed == -1) {
5219         team->t.t_size_changed = 1;
5220       } else {
5221         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5222       }
5223
5224       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5225       kmp_r_sched_t new_sched = new_icvs->sched;
5226       // set primary thread's schedule as new run-time schedule
5227       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5228
5229       __kmp_reinitialize_team(team, new_icvs,
5230                               root->r.r_uber_thread->th.th_ident);
5231
5232       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5233                     team->t.t_threads[0], team));
5234       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5235
5236 #if KMP_AFFINITY_SUPPORTED
5237       if ((team->t.t_size_changed == 0) &&
5238           (team->t.t_proc_bind == new_proc_bind)) {
5239         if (new_proc_bind == proc_bind_spread) {
5240           if (do_place_partition) {
5241             // add flag to update only master for spread
5242             __kmp_partition_places(team, 1);
5243           }
5244         }
5245         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5246                        "proc_bind = %d, partition = [%d,%d]\n",
5247                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5248                        team->t.t_last_place));
5249       } else {
5250         if (do_place_partition) {
5251           KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5252           __kmp_partition_places(team);
5253         }
5254       }
5255 #else
5256       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5257 #endif /* KMP_AFFINITY_SUPPORTED */
5258     } else if (team->t.t_nproc > new_nproc) {
5259       KA_TRACE(20,
5260                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5261                 new_nproc));
5262
5263       team->t.t_size_changed = 1;
5264       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5265         // Barrier size already reduced earlier in this function
5266         // Activate team threads via th_used_in_team
5267         __kmp_add_threads_to_team(team, new_nproc);
5268       }
5269 #if KMP_NESTED_HOT_TEAMS
5270       if (__kmp_hot_teams_mode == 0) {
5271         // AC: saved number of threads should correspond to team's value in this
5272         // mode, can be bigger in mode 1, when hot team has threads in reserve
5273         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5274         hot_teams[level].hot_team_nth = new_nproc;
5275 #endif // KMP_NESTED_HOT_TEAMS
5276         /* release the extra threads we don't need any more */
5277         for (f = new_nproc; f < team->t.t_nproc; f++) {
5278           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5279           if (__kmp_tasking_mode != tskm_immediate_exec) {
5280             // When decreasing team size, threads no longer in the team should
5281             // unref task team.
5282             team->t.t_threads[f]->th.th_task_team = NULL;
5283           }
5284           __kmp_free_thread(team->t.t_threads[f]);
5285           team->t.t_threads[f] = NULL;
5286         }
5287 #if KMP_NESTED_HOT_TEAMS
5288       } // (__kmp_hot_teams_mode == 0)
5289       else {
5290         // When keeping extra threads in team, switch threads to wait on own
5291         // b_go flag
5292         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5293           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5294           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5295           for (int b = 0; b < bs_last_barrier; ++b) {
5296             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5297               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5298             }
5299             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5300           }
5301         }
5302       }
5303 #endif // KMP_NESTED_HOT_TEAMS
5304       team->t.t_nproc = new_nproc;
5305       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5306       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5307       __kmp_reinitialize_team(team, new_icvs,
5308                               root->r.r_uber_thread->th.th_ident);
5309
5310       // Update remaining threads
5311       for (f = 0; f < new_nproc; ++f) {
5312         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5313       }
5314
5315       // restore the current task state of the primary thread: should be the
5316       // implicit task
5317       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5318                     team->t.t_threads[0], team));
5319
5320       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5321
5322 #ifdef KMP_DEBUG
5323       for (f = 0; f < team->t.t_nproc; f++) {
5324         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5325                          team->t.t_threads[f]->th.th_team_nproc ==
5326                              team->t.t_nproc);
5327       }
5328 #endif
5329
5330       if (do_place_partition) {
5331         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5332 #if KMP_AFFINITY_SUPPORTED
5333         __kmp_partition_places(team);
5334 #endif
5335       }
5336     } else { // team->t.t_nproc < new_nproc
5337
5338       KA_TRACE(20,
5339                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5340                 new_nproc));
5341       int old_nproc = team->t.t_nproc; // save old value and use to update only
5342       team->t.t_size_changed = 1;
5343
5344 #if KMP_NESTED_HOT_TEAMS
5345       int avail_threads = hot_teams[level].hot_team_nth;
5346       if (new_nproc < avail_threads)
5347         avail_threads = new_nproc;
5348       kmp_info_t **other_threads = team->t.t_threads;
5349       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5350         // Adjust barrier data of reserved threads (if any) of the team
5351         // Other data will be set in __kmp_initialize_info() below.
5352         int b;
5353         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5354         for (b = 0; b < bs_last_barrier; ++b) {
5355           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5356           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5357 #if USE_DEBUGGER
5358           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5359 #endif
5360         }
5361       }
5362       if (hot_teams[level].hot_team_nth >= new_nproc) {
5363         // we have all needed threads in reserve, no need to allocate any
5364         // this only possible in mode 1, cannot have reserved threads in mode 0
5365         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5366         team->t.t_nproc = new_nproc; // just get reserved threads involved
5367       } else {
5368         // We may have some threads in reserve, but not enough;
5369         // get reserved threads involved if any.
5370         team->t.t_nproc = hot_teams[level].hot_team_nth;
5371         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5372 #endif // KMP_NESTED_HOT_TEAMS
5373         if (team->t.t_max_nproc < new_nproc) {
5374           /* reallocate larger arrays */
5375           __kmp_reallocate_team_arrays(team, new_nproc);
5376           __kmp_reinitialize_team(team, new_icvs, NULL);
5377         }
5378
5379 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5380         /* Temporarily set full mask for primary thread before creation of
5381            workers. The reason is that workers inherit the affinity from the
5382            primary thread, so if a lot of workers are created on the single
5383            core quickly, they don't get a chance to set their own affinity for
5384            a long time. */
5385         kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5386 #endif
5387
5388         /* allocate new threads for the hot team */
5389         for (f = team->t.t_nproc; f < new_nproc; f++) {
5390           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5391           KMP_DEBUG_ASSERT(new_worker);
5392           team->t.t_threads[f] = new_worker;
5393
5394           KA_TRACE(20,
5395                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5396                     "join=%llu, plain=%llu\n",
5397                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5398                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5399                     team->t.t_bar[bs_plain_barrier].b_arrived));
5400
5401           { // Initialize barrier data for new threads.
5402             int b;
5403             kmp_balign_t *balign = new_worker->th.th_bar;
5404             for (b = 0; b < bs_last_barrier; ++b) {
5405               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5406               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5407                                KMP_BARRIER_PARENT_FLAG);
5408 #if USE_DEBUGGER
5409               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5410 #endif
5411             }
5412           }
5413         }
5414
5415 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5416         /* Restore initial primary thread's affinity mask */
5417         new_temp_affinity.restore();
5418 #endif
5419 #if KMP_NESTED_HOT_TEAMS
5420       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5421 #endif // KMP_NESTED_HOT_TEAMS
5422       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5423         // Barrier size already increased earlier in this function
5424         // Activate team threads via th_used_in_team
5425         __kmp_add_threads_to_team(team, new_nproc);
5426       }
5427       /* make sure everyone is syncronized */
5428       // new threads below
5429       __kmp_initialize_team(team, new_nproc, new_icvs,
5430                             root->r.r_uber_thread->th.th_ident);
5431
5432       /* reinitialize the threads */
5433       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5434       for (f = 0; f < team->t.t_nproc; ++f)
5435         __kmp_initialize_info(team->t.t_threads[f], team, f,
5436                               __kmp_gtid_from_tid(f, team));
5437
5438       // set th_task_state for new threads in hot team with older thread's state
5439       kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5440       for (f = old_nproc; f < team->t.t_nproc; ++f)
5441         team->t.t_threads[f]->th.th_task_state = old_state;
5442
5443 #ifdef KMP_DEBUG
5444       for (f = 0; f < team->t.t_nproc; ++f) {
5445         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5446                          team->t.t_threads[f]->th.th_team_nproc ==
5447                              team->t.t_nproc);
5448       }
5449 #endif
5450
5451       if (do_place_partition) {
5452         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5453 #if KMP_AFFINITY_SUPPORTED
5454         __kmp_partition_places(team);
5455 #endif
5456       }
5457     } // Check changes in number of threads
5458
5459     kmp_info_t *master = team->t.t_threads[0];
5460     if (master->th.th_teams_microtask) {
5461       for (f = 1; f < new_nproc; ++f) {
5462         // propagate teams construct specific info to workers
5463         kmp_info_t *thr = team->t.t_threads[f];
5464         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5465         thr->th.th_teams_level = master->th.th_teams_level;
5466         thr->th.th_teams_size = master->th.th_teams_size;
5467       }
5468     }
5469 #if KMP_NESTED_HOT_TEAMS
5470     if (level) {
5471       // Sync barrier state for nested hot teams, not needed for outermost hot
5472       // team.
5473       for (f = 1; f < new_nproc; ++f) {
5474         kmp_info_t *thr = team->t.t_threads[f];
5475         int b;
5476         kmp_balign_t *balign = thr->th.th_bar;
5477         for (b = 0; b < bs_last_barrier; ++b) {
5478           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5479           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5480 #if USE_DEBUGGER
5481           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5482 #endif
5483         }
5484       }
5485     }
5486 #endif // KMP_NESTED_HOT_TEAMS
5487
5488     /* reallocate space for arguments if necessary */
5489     __kmp_alloc_argv_entries(argc, team, TRUE);
5490     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5491     // The hot team re-uses the previous task team,
5492     // if untouched during the previous release->gather phase.
5493
5494     KF_TRACE(10, (" hot_team = %p\n", team));
5495
5496 #if KMP_DEBUG
5497     if (__kmp_tasking_mode != tskm_immediate_exec) {
5498       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5499                     "task_team[1] = %p after reinit\n",
5500                     team->t.t_task_team[0], team->t.t_task_team[1]));
5501     }
5502 #endif
5503
5504 #if OMPT_SUPPORT
5505     __ompt_team_assign_id(team, ompt_parallel_data);
5506 #endif
5507
5508     KMP_MB();
5509
5510     return team;
5511   }
5512
5513   /* next, let's try to take one from the team pool */
5514   KMP_MB();
5515   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5516     /* TODO: consider resizing undersized teams instead of reaping them, now
5517        that we have a resizing mechanism */
5518     if (team->t.t_max_nproc >= max_nproc) {
5519       /* take this team from the team pool */
5520       __kmp_team_pool = team->t.t_next_pool;
5521
5522       if (max_nproc > 1 &&
5523           __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5524         if (!team->t.b) { // Allocate barrier structure
5525           team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5526         }
5527       }
5528
5529       /* setup the team for fresh use */
5530       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5531
5532       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5533                     "task_team[1] %p to NULL\n",
5534                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5535       team->t.t_task_team[0] = NULL;
5536       team->t.t_task_team[1] = NULL;
5537
5538       /* reallocate space for arguments if necessary */
5539       __kmp_alloc_argv_entries(argc, team, TRUE);
5540       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5541
5542       KA_TRACE(
5543           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5544                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5545       { // Initialize barrier data.
5546         int b;
5547         for (b = 0; b < bs_last_barrier; ++b) {
5548           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5549 #if USE_DEBUGGER
5550           team->t.t_bar[b].b_master_arrived = 0;
5551           team->t.t_bar[b].b_team_arrived = 0;
5552 #endif
5553         }
5554       }
5555
5556       team->t.t_proc_bind = new_proc_bind;
5557
5558       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5559                     team->t.t_id));
5560
5561 #if OMPT_SUPPORT
5562       __ompt_team_assign_id(team, ompt_parallel_data);
5563 #endif
5564
5565       KMP_MB();
5566
5567       return team;
5568     }
5569
5570     /* reap team if it is too small, then loop back and check the next one */
5571     // not sure if this is wise, but, will be redone during the hot-teams
5572     // rewrite.
5573     /* TODO: Use technique to find the right size hot-team, don't reap them */
5574     team = __kmp_reap_team(team);
5575     __kmp_team_pool = team;
5576   }
5577
5578   /* nothing available in the pool, no matter, make a new team! */
5579   KMP_MB();
5580   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5581
5582   /* and set it up */
5583   team->t.t_max_nproc = max_nproc;
5584   if (max_nproc > 1 &&
5585       __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5586     // Allocate barrier structure
5587     team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5588   }
5589
5590   /* NOTE well, for some reason allocating one big buffer and dividing it up
5591      seems to really hurt performance a lot on the P4, so, let's not use this */
5592   __kmp_allocate_team_arrays(team, max_nproc);
5593
5594   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5595   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5596
5597   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5598                 "%p to NULL\n",
5599                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5600   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5601   // memory, no need to duplicate
5602   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5603   // memory, no need to duplicate
5604
5605   if (__kmp_storage_map) {
5606     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5607   }
5608
5609   /* allocate space for arguments */
5610   __kmp_alloc_argv_entries(argc, team, FALSE);
5611   team->t.t_argc = argc;
5612
5613   KA_TRACE(20,
5614            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5615             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5616   { // Initialize barrier data.
5617     int b;
5618     for (b = 0; b < bs_last_barrier; ++b) {
5619       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5620 #if USE_DEBUGGER
5621       team->t.t_bar[b].b_master_arrived = 0;
5622       team->t.t_bar[b].b_team_arrived = 0;
5623 #endif
5624     }
5625   }
5626
5627   team->t.t_proc_bind = new_proc_bind;
5628
5629 #if OMPT_SUPPORT
5630   __ompt_team_assign_id(team, ompt_parallel_data);
5631   team->t.ompt_serialized_team_info = NULL;
5632 #endif
5633
5634   KMP_MB();
5635
5636   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5637                 team->t.t_id));
5638
5639   return team;
5640 }
5641
5642 /* TODO implement hot-teams at all levels */
5643 /* TODO implement lazy thread release on demand (disband request) */
5644
5645 /* free the team.  return it to the team pool.  release all the threads
5646  * associated with it */
5647 void __kmp_free_team(kmp_root_t *root,
5648                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5649   int f;
5650   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5651                 team->t.t_id));
5652
5653   /* verify state */
5654   KMP_DEBUG_ASSERT(root);
5655   KMP_DEBUG_ASSERT(team);
5656   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5657   KMP_DEBUG_ASSERT(team->t.t_threads);
5658
5659   int use_hot_team = team == root->r.r_hot_team;
5660 #if KMP_NESTED_HOT_TEAMS
5661   int level;
5662   if (master) {
5663     level = team->t.t_active_level - 1;
5664     if (master->th.th_teams_microtask) { // in teams construct?
5665       if (master->th.th_teams_size.nteams > 1) {
5666         ++level; // level was not increased in teams construct for
5667         // team_of_masters
5668       }
5669       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5670           master->th.th_teams_level == team->t.t_level) {
5671         ++level; // level was not increased in teams construct for
5672         // team_of_workers before the parallel
5673       } // team->t.t_level will be increased inside parallel
5674     }
5675 #if KMP_DEBUG
5676     kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5677 #endif
5678     if (level < __kmp_hot_teams_max_level) {
5679       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5680       use_hot_team = 1;
5681     }
5682   }
5683 #endif // KMP_NESTED_HOT_TEAMS
5684
5685   /* team is done working */
5686   TCW_SYNC_PTR(team->t.t_pkfn,
5687                NULL); // Important for Debugging Support Library.
5688 #if KMP_OS_WINDOWS
5689   team->t.t_copyin_counter = 0; // init counter for possible reuse
5690 #endif
5691   // Do not reset pointer to parent team to NULL for hot teams.
5692
5693   /* if we are non-hot team, release our threads */
5694   if (!use_hot_team) {
5695     if (__kmp_tasking_mode != tskm_immediate_exec) {
5696       // Wait for threads to reach reapable state
5697       for (f = 1; f < team->t.t_nproc; ++f) {
5698         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5699         kmp_info_t *th = team->t.t_threads[f];
5700         volatile kmp_uint32 *state = &th->th.th_reap_state;
5701         while (*state != KMP_SAFE_TO_REAP) {
5702 #if KMP_OS_WINDOWS
5703           // On Windows a thread can be killed at any time, check this
5704           DWORD ecode;
5705           if (!__kmp_is_thread_alive(th, &ecode)) {
5706             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5707             break;
5708           }
5709 #endif
5710           // first check if thread is sleeping
5711           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5712           if (fl.is_sleeping())
5713             fl.resume(__kmp_gtid_from_thread(th));
5714           KMP_CPU_PAUSE();
5715         }
5716       }
5717
5718       // Delete task teams
5719       int tt_idx;
5720       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5721         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5722         if (task_team != NULL) {
5723           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5724             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5725             team->t.t_threads[f]->th.th_task_team = NULL;
5726           }
5727           KA_TRACE(
5728               20,
5729               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5730                __kmp_get_gtid(), task_team, team->t.t_id));
5731 #if KMP_NESTED_HOT_TEAMS
5732           __kmp_free_task_team(master, task_team);
5733 #endif
5734           team->t.t_task_team[tt_idx] = NULL;
5735         }
5736       }
5737     }
5738
5739     // Reset pointer to parent team only for non-hot teams.
5740     team->t.t_parent = NULL;
5741     team->t.t_level = 0;
5742     team->t.t_active_level = 0;
5743
5744     /* free the worker threads */
5745     for (f = 1; f < team->t.t_nproc; ++f) {
5746       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5747       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5748         KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5749                                     1, 2);
5750       }
5751       __kmp_free_thread(team->t.t_threads[f]);
5752     }
5753
5754     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5755       if (team->t.b) {
5756         // wake up thread at old location
5757         team->t.b->go_release();
5758         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5759           for (f = 1; f < team->t.t_nproc; ++f) {
5760             if (team->t.b->sleep[f].sleep) {
5761               __kmp_atomic_resume_64(
5762                   team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5763                   (kmp_atomic_flag_64<> *)NULL);
5764             }
5765           }
5766         }
5767         // Wait for threads to be removed from team
5768         for (int f = 1; f < team->t.t_nproc; ++f) {
5769           while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5770             KMP_CPU_PAUSE();
5771         }
5772       }
5773     }
5774
5775     for (f = 1; f < team->t.t_nproc; ++f) {
5776       team->t.t_threads[f] = NULL;
5777     }
5778
5779     if (team->t.t_max_nproc > 1 &&
5780         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5781       distributedBarrier::deallocate(team->t.b);
5782       team->t.b = NULL;
5783     }
5784     /* put the team back in the team pool */
5785     /* TODO limit size of team pool, call reap_team if pool too large */
5786     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5787     __kmp_team_pool = (volatile kmp_team_t *)team;
5788   } else { // Check if team was created for primary threads in teams construct
5789     // See if first worker is a CG root
5790     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5791                      team->t.t_threads[1]->th.th_cg_roots);
5792     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5793       // Clean up the CG root nodes on workers so that this team can be re-used
5794       for (f = 1; f < team->t.t_nproc; ++f) {
5795         kmp_info_t *thr = team->t.t_threads[f];
5796         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5797                          thr->th.th_cg_roots->cg_root == thr);
5798         // Pop current CG root off list
5799         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5800         thr->th.th_cg_roots = tmp->up;
5801         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5802                        " up to node %p. cg_nthreads was %d\n",
5803                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5804         int i = tmp->cg_nthreads--;
5805         if (i == 1) {
5806           __kmp_free(tmp); // free CG if we are the last thread in it
5807         }
5808         // Restore current task's thread_limit from CG root
5809         if (thr->th.th_cg_roots)
5810           thr->th.th_current_task->td_icvs.thread_limit =
5811               thr->th.th_cg_roots->cg_thread_limit;
5812       }
5813     }
5814   }
5815
5816   KMP_MB();
5817 }
5818
5819 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5820 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5821   kmp_team_t *next_pool = team->t.t_next_pool;
5822
5823   KMP_DEBUG_ASSERT(team);
5824   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5825   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5826   KMP_DEBUG_ASSERT(team->t.t_threads);
5827   KMP_DEBUG_ASSERT(team->t.t_argv);
5828
5829   /* TODO clean the threads that are a part of this? */
5830
5831   /* free stuff */
5832   __kmp_free_team_arrays(team);
5833   if (team->t.t_argv != &team->t.t_inline_argv[0])
5834     __kmp_free((void *)team->t.t_argv);
5835   __kmp_free(team);
5836
5837   KMP_MB();
5838   return next_pool;
5839 }
5840
5841 // Free the thread.  Don't reap it, just place it on the pool of available
5842 // threads.
5843 //
5844 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5845 // binding for the affinity mechanism to be useful.
5846 //
5847 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5848 // However, we want to avoid a potential performance problem by always
5849 // scanning through the list to find the correct point at which to insert
5850 // the thread (potential N**2 behavior).  To do this we keep track of the
5851 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5852 // With single-level parallelism, threads will always be added to the tail
5853 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5854 // parallelism, all bets are off and we may need to scan through the entire
5855 // free list.
5856 //
5857 // This change also has a potentially large performance benefit, for some
5858 // applications.  Previously, as threads were freed from the hot team, they
5859 // would be placed back on the free list in inverse order.  If the hot team
5860 // grew back to it's original size, then the freed thread would be placed
5861 // back on the hot team in reverse order.  This could cause bad cache
5862 // locality problems on programs where the size of the hot team regularly
5863 // grew and shrunk.
5864 //
5865 // Now, for single-level parallelism, the OMP tid is always == gtid.
5866 void __kmp_free_thread(kmp_info_t *this_th) {
5867   int gtid;
5868   kmp_info_t **scan;
5869
5870   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5871                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5872
5873   KMP_DEBUG_ASSERT(this_th);
5874
5875   // When moving thread to pool, switch thread to wait on own b_go flag, and
5876   // uninitialized (NULL team).
5877   int b;
5878   kmp_balign_t *balign = this_th->th.th_bar;
5879   for (b = 0; b < bs_last_barrier; ++b) {
5880     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5881       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5882     balign[b].bb.team = NULL;
5883     balign[b].bb.leaf_kids = 0;
5884   }
5885   this_th->th.th_task_state = 0;
5886   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5887
5888   /* put thread back on the free pool */
5889   TCW_PTR(this_th->th.th_team, NULL);
5890   TCW_PTR(this_th->th.th_root, NULL);
5891   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5892
5893   while (this_th->th.th_cg_roots) {
5894     this_th->th.th_cg_roots->cg_nthreads--;
5895     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5896                    " %p of thread  %p to %d\n",
5897                    this_th, this_th->th.th_cg_roots,
5898                    this_th->th.th_cg_roots->cg_root,
5899                    this_th->th.th_cg_roots->cg_nthreads));
5900     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5901     if (tmp->cg_root == this_th) { // Thread is a cg_root
5902       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5903       KA_TRACE(
5904           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5905       this_th->th.th_cg_roots = tmp->up;
5906       __kmp_free(tmp);
5907     } else { // Worker thread
5908       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5909         __kmp_free(tmp);
5910       }
5911       this_th->th.th_cg_roots = NULL;
5912       break;
5913     }
5914   }
5915
5916   /* If the implicit task assigned to this thread can be used by other threads
5917    * -> multiple threads can share the data and try to free the task at
5918    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5919    * with higher probability when hot team is disabled but can occurs even when
5920    * the hot team is enabled */
5921   __kmp_free_implicit_task(this_th);
5922   this_th->th.th_current_task = NULL;
5923
5924   // If the __kmp_thread_pool_insert_pt is already past the new insert
5925   // point, then we need to re-scan the entire list.
5926   gtid = this_th->th.th_info.ds.ds_gtid;
5927   if (__kmp_thread_pool_insert_pt != NULL) {
5928     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5929     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5930       __kmp_thread_pool_insert_pt = NULL;
5931     }
5932   }
5933
5934   // Scan down the list to find the place to insert the thread.
5935   // scan is the address of a link in the list, possibly the address of
5936   // __kmp_thread_pool itself.
5937   //
5938   // In the absence of nested parallelism, the for loop will have 0 iterations.
5939   if (__kmp_thread_pool_insert_pt != NULL) {
5940     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5941   } else {
5942     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5943   }
5944   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5945        scan = &((*scan)->th.th_next_pool))
5946     ;
5947
5948   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5949   // to its address.
5950   TCW_PTR(this_th->th.th_next_pool, *scan);
5951   __kmp_thread_pool_insert_pt = *scan = this_th;
5952   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5953                    (this_th->th.th_info.ds.ds_gtid <
5954                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5955   TCW_4(this_th->th.th_in_pool, TRUE);
5956   __kmp_suspend_initialize_thread(this_th);
5957   __kmp_lock_suspend_mx(this_th);
5958   if (this_th->th.th_active == TRUE) {
5959     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5960     this_th->th.th_active_in_pool = TRUE;
5961   }
5962 #if KMP_DEBUG
5963   else {
5964     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5965   }
5966 #endif
5967   __kmp_unlock_suspend_mx(this_th);
5968
5969   TCW_4(__kmp_nth, __kmp_nth - 1);
5970
5971 #ifdef KMP_ADJUST_BLOCKTIME
5972   /* Adjust blocktime back to user setting or default if necessary */
5973   /* Middle initialization might never have occurred                */
5974   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5975     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5976     if (__kmp_nth <= __kmp_avail_proc) {
5977       __kmp_zero_bt = FALSE;
5978     }
5979   }
5980 #endif /* KMP_ADJUST_BLOCKTIME */
5981
5982   KMP_MB();
5983 }
5984
5985 /* ------------------------------------------------------------------------ */
5986
5987 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5988 #if OMP_PROFILING_SUPPORT
5989   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5990   // TODO: add a configuration option for time granularity
5991   if (ProfileTraceFile)
5992     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5993 #endif
5994
5995   int gtid = this_thr->th.th_info.ds.ds_gtid;
5996   /*    void                 *stack_data;*/
5997   kmp_team_t **volatile pteam;
5998
5999   KMP_MB();
6000   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6001
6002   if (__kmp_env_consistency_check) {
6003     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6004   }
6005
6006 #if OMPD_SUPPORT
6007   if (ompd_state & OMPD_ENABLE_BP)
6008     ompd_bp_thread_begin();
6009 #endif
6010
6011 #if OMPT_SUPPORT
6012   ompt_data_t *thread_data = nullptr;
6013   if (ompt_enabled.enabled) {
6014     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6015     *thread_data = ompt_data_none;
6016
6017     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6018     this_thr->th.ompt_thread_info.wait_id = 0;
6019     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6020     this_thr->th.ompt_thread_info.parallel_flags = 0;
6021     if (ompt_enabled.ompt_callback_thread_begin) {
6022       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6023           ompt_thread_worker, thread_data);
6024     }
6025     this_thr->th.ompt_thread_info.state = ompt_state_idle;
6026   }
6027 #endif
6028
6029   /* This is the place where threads wait for work */
6030   while (!TCR_4(__kmp_global.g.g_done)) {
6031     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6032     KMP_MB();
6033
6034     /* wait for work to do */
6035     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6036
6037     /* No tid yet since not part of a team */
6038     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6039
6040 #if OMPT_SUPPORT
6041     if (ompt_enabled.enabled) {
6042       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6043     }
6044 #endif
6045
6046     pteam = &this_thr->th.th_team;
6047
6048     /* have we been allocated? */
6049     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6050       /* we were just woken up, so run our new task */
6051       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6052         int rc;
6053         KA_TRACE(20,
6054                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6055                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6056                   (*pteam)->t.t_pkfn));
6057
6058         updateHWFPControl(*pteam);
6059
6060 #if OMPT_SUPPORT
6061         if (ompt_enabled.enabled) {
6062           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6063         }
6064 #endif
6065
6066         rc = (*pteam)->t.t_invoke(gtid);
6067         KMP_ASSERT(rc);
6068
6069         KMP_MB();
6070         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6071                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6072                       (*pteam)->t.t_pkfn));
6073       }
6074 #if OMPT_SUPPORT
6075       if (ompt_enabled.enabled) {
6076         /* no frame set while outside task */
6077         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6078
6079         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6080       }
6081 #endif
6082       /* join barrier after parallel region */
6083       __kmp_join_barrier(gtid);
6084     }
6085   }
6086
6087 #if OMPD_SUPPORT
6088   if (ompd_state & OMPD_ENABLE_BP)
6089     ompd_bp_thread_end();
6090 #endif
6091
6092 #if OMPT_SUPPORT
6093   if (ompt_enabled.ompt_callback_thread_end) {
6094     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6095   }
6096 #endif
6097
6098   this_thr->th.th_task_team = NULL;
6099   /* run the destructors for the threadprivate data for this thread */
6100   __kmp_common_destroy_gtid(gtid);
6101
6102   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6103   KMP_MB();
6104
6105 #if OMP_PROFILING_SUPPORT
6106   llvm::timeTraceProfilerFinishThread();
6107 #endif
6108   return this_thr;
6109 }
6110
6111 /* ------------------------------------------------------------------------ */
6112
6113 void __kmp_internal_end_dest(void *specific_gtid) {
6114   // Make sure no significant bits are lost
6115   int gtid;
6116   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6117
6118   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6119   /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
6120    * this is because 0 is reserved for the nothing-stored case */
6121
6122   __kmp_internal_end_thread(gtid);
6123 }
6124
6125 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6126
6127 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6128   __kmp_internal_end_atexit();
6129 }
6130
6131 #endif
6132
6133 /* [Windows] josh: when the atexit handler is called, there may still be more
6134    than one thread alive */
6135 void __kmp_internal_end_atexit(void) {
6136   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6137   /* [Windows]
6138      josh: ideally, we want to completely shutdown the library in this atexit
6139      handler, but stat code that depends on thread specific data for gtid fails
6140      because that data becomes unavailable at some point during the shutdown, so
6141      we call __kmp_internal_end_thread instead. We should eventually remove the
6142      dependency on __kmp_get_specific_gtid in the stat code and use
6143      __kmp_internal_end_library to cleanly shutdown the library.
6144
6145      // TODO: Can some of this comment about GVS be removed?
6146      I suspect that the offending stat code is executed when the calling thread
6147      tries to clean up a dead root thread's data structures, resulting in GVS
6148      code trying to close the GVS structures for that thread, but since the stat
6149      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6150      the calling thread is cleaning up itself instead of another thread, it get
6151      confused. This happens because allowing a thread to unregister and cleanup
6152      another thread is a recent modification for addressing an issue.
6153      Based on the current design (20050722), a thread may end up
6154      trying to unregister another thread only if thread death does not trigger
6155      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
6156      thread specific data destructor function to detect thread death. For
6157      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6158      is nothing.  Thus, the workaround is applicable only for Windows static
6159      stat library. */
6160   __kmp_internal_end_library(-1);
6161 #if KMP_OS_WINDOWS
6162   __kmp_close_console();
6163 #endif
6164 }
6165
6166 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6167   // It is assumed __kmp_forkjoin_lock is acquired.
6168
6169   int gtid;
6170
6171   KMP_DEBUG_ASSERT(thread != NULL);
6172
6173   gtid = thread->th.th_info.ds.ds_gtid;
6174
6175   if (!is_root) {
6176     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6177       /* Assume the threads are at the fork barrier here */
6178       KA_TRACE(
6179           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6180                gtid));
6181       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6182         while (
6183             !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6184           KMP_CPU_PAUSE();
6185         __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6186       } else {
6187         /* Need release fence here to prevent seg faults for tree forkjoin
6188            barrier (GEH) */
6189         kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6190                            thread);
6191         __kmp_release_64(&flag);
6192       }
6193     }
6194
6195     // Terminate OS thread.
6196     __kmp_reap_worker(thread);
6197
6198     // The thread was killed asynchronously.  If it was actively
6199     // spinning in the thread pool, decrement the global count.
6200     //
6201     // There is a small timing hole here - if the worker thread was just waking
6202     // up after sleeping in the pool, had reset it's th_active_in_pool flag but
6203     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6204     // the global counter might not get updated.
6205     //
6206     // Currently, this can only happen as the library is unloaded,
6207     // so there are no harmful side effects.
6208     if (thread->th.th_active_in_pool) {
6209       thread->th.th_active_in_pool = FALSE;
6210       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6211       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6212     }
6213   }
6214
6215   __kmp_free_implicit_task(thread);
6216
6217 // Free the fast memory for tasking
6218 #if USE_FAST_MEMORY
6219   __kmp_free_fast_memory(thread);
6220 #endif /* USE_FAST_MEMORY */
6221
6222   __kmp_suspend_uninitialize_thread(thread);
6223
6224   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6225   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6226
6227   --__kmp_all_nth;
6228   // __kmp_nth was decremented when thread is added to the pool.
6229
6230 #ifdef KMP_ADJUST_BLOCKTIME
6231   /* Adjust blocktime back to user setting or default if necessary */
6232   /* Middle initialization might never have occurred                */
6233   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6234     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6235     if (__kmp_nth <= __kmp_avail_proc) {
6236       __kmp_zero_bt = FALSE;
6237     }
6238   }
6239 #endif /* KMP_ADJUST_BLOCKTIME */
6240
6241   /* free the memory being used */
6242   if (__kmp_env_consistency_check) {
6243     if (thread->th.th_cons) {
6244       __kmp_free_cons_stack(thread->th.th_cons);
6245       thread->th.th_cons = NULL;
6246     }
6247   }
6248
6249   if (thread->th.th_pri_common != NULL) {
6250     __kmp_free(thread->th.th_pri_common);
6251     thread->th.th_pri_common = NULL;
6252   }
6253
6254   if (thread->th.th_task_state_memo_stack != NULL) {
6255     __kmp_free(thread->th.th_task_state_memo_stack);
6256     thread->th.th_task_state_memo_stack = NULL;
6257   }
6258
6259 #if KMP_USE_BGET
6260   if (thread->th.th_local.bget_data != NULL) {
6261     __kmp_finalize_bget(thread);
6262   }
6263 #endif
6264
6265 #if KMP_AFFINITY_SUPPORTED
6266   if (thread->th.th_affin_mask != NULL) {
6267     KMP_CPU_FREE(thread->th.th_affin_mask);
6268     thread->th.th_affin_mask = NULL;
6269   }
6270 #endif /* KMP_AFFINITY_SUPPORTED */
6271
6272 #if KMP_USE_HIER_SCHED
6273   if (thread->th.th_hier_bar_data != NULL) {
6274     __kmp_free(thread->th.th_hier_bar_data);
6275     thread->th.th_hier_bar_data = NULL;
6276   }
6277 #endif
6278
6279   __kmp_reap_team(thread->th.th_serial_team);
6280   thread->th.th_serial_team = NULL;
6281   __kmp_free(thread);
6282
6283   KMP_MB();
6284
6285 } // __kmp_reap_thread
6286
6287 static void __kmp_itthash_clean(kmp_info_t *th) {
6288 #if USE_ITT_NOTIFY
6289   if (__kmp_itt_region_domains.count > 0) {
6290     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6291       kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6292       while (bucket) {
6293         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6294         __kmp_thread_free(th, bucket);
6295         bucket = next;
6296       }
6297     }
6298   }
6299   if (__kmp_itt_barrier_domains.count > 0) {
6300     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6301       kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6302       while (bucket) {
6303         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6304         __kmp_thread_free(th, bucket);
6305         bucket = next;
6306       }
6307     }
6308   }
6309 #endif
6310 }
6311
6312 static void __kmp_internal_end(void) {
6313   int i;
6314
6315   /* First, unregister the library */
6316   __kmp_unregister_library();
6317
6318 #if KMP_OS_WINDOWS
6319   /* In Win static library, we can't tell when a root actually dies, so we
6320      reclaim the data structures for any root threads that have died but not
6321      unregistered themselves, in order to shut down cleanly.
6322      In Win dynamic library we also can't tell when a thread dies.  */
6323   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6324 // dead roots
6325 #endif
6326
6327   for (i = 0; i < __kmp_threads_capacity; i++)
6328     if (__kmp_root[i])
6329       if (__kmp_root[i]->r.r_active)
6330         break;
6331   KMP_MB(); /* Flush all pending memory write invalidates.  */
6332   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6333
6334   if (i < __kmp_threads_capacity) {
6335 #if KMP_USE_MONITOR
6336     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6337     KMP_MB(); /* Flush all pending memory write invalidates.  */
6338
6339     // Need to check that monitor was initialized before reaping it. If we are
6340     // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6341     // __kmp_monitor will appear to contain valid data, but it is only valid in
6342     // the parent process, not the child.
6343     // New behavior (201008): instead of keying off of the flag
6344     // __kmp_init_parallel, the monitor thread creation is keyed off
6345     // of the new flag __kmp_init_monitor.
6346     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6347     if (TCR_4(__kmp_init_monitor)) {
6348       __kmp_reap_monitor(&__kmp_monitor);
6349       TCW_4(__kmp_init_monitor, 0);
6350     }
6351     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6352     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6353 #endif // KMP_USE_MONITOR
6354   } else {
6355 /* TODO move this to cleanup code */
6356 #ifdef KMP_DEBUG
6357     /* make sure that everything has properly ended */
6358     for (i = 0; i < __kmp_threads_capacity; i++) {
6359       if (__kmp_root[i]) {
6360         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6361         //                    there can be uber threads alive here
6362         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6363       }
6364     }
6365 #endif
6366
6367     KMP_MB();
6368
6369     // Reap the worker threads.
6370     // This is valid for now, but be careful if threads are reaped sooner.
6371     while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6372       // Get the next thread from the pool.
6373       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6374       __kmp_thread_pool = thread->th.th_next_pool;
6375       // Reap it.
6376       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6377       thread->th.th_next_pool = NULL;
6378       thread->th.th_in_pool = FALSE;
6379       __kmp_reap_thread(thread, 0);
6380     }
6381     __kmp_thread_pool_insert_pt = NULL;
6382
6383     // Reap teams.
6384     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6385       // Get the next team from the pool.
6386       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6387       __kmp_team_pool = team->t.t_next_pool;
6388       // Reap it.
6389       team->t.t_next_pool = NULL;
6390       __kmp_reap_team(team);
6391     }
6392
6393     __kmp_reap_task_teams();
6394
6395 #if KMP_OS_UNIX
6396     // Threads that are not reaped should not access any resources since they
6397     // are going to be deallocated soon, so the shutdown sequence should wait
6398     // until all threads either exit the final spin-waiting loop or begin
6399     // sleeping after the given blocktime.
6400     for (i = 0; i < __kmp_threads_capacity; i++) {
6401       kmp_info_t *thr = __kmp_threads[i];
6402       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6403         KMP_CPU_PAUSE();
6404     }
6405 #endif
6406
6407     for (i = 0; i < __kmp_threads_capacity; ++i) {
6408       // TBD: Add some checking...
6409       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6410     }
6411
6412     /* Make sure all threadprivate destructors get run by joining with all
6413        worker threads before resetting this flag */
6414     TCW_SYNC_4(__kmp_init_common, FALSE);
6415
6416     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6417     KMP_MB();
6418
6419 #if KMP_USE_MONITOR
6420     // See note above: One of the possible fixes for CQ138434 / CQ140126
6421     //
6422     // FIXME: push both code fragments down and CSE them?
6423     // push them into __kmp_cleanup() ?
6424     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6425     if (TCR_4(__kmp_init_monitor)) {
6426       __kmp_reap_monitor(&__kmp_monitor);
6427       TCW_4(__kmp_init_monitor, 0);
6428     }
6429     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6430     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6431 #endif
6432   } /* else !__kmp_global.t_active */
6433   TCW_4(__kmp_init_gtid, FALSE);
6434   KMP_MB(); /* Flush all pending memory write invalidates.  */
6435
6436   __kmp_cleanup();
6437 #if OMPT_SUPPORT
6438   ompt_fini();
6439 #endif
6440 }
6441
6442 void __kmp_internal_end_library(int gtid_req) {
6443   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6444   /* this shouldn't be a race condition because __kmp_internal_end() is the
6445      only place to clear __kmp_serial_init */
6446   /* we'll check this later too, after we get the lock */
6447   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6448   // redundant, because the next check will work in any case.
6449   if (__kmp_global.g.g_abort) {
6450     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6451     /* TODO abort? */
6452     return;
6453   }
6454   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6455     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6456     return;
6457   }
6458
6459   // If hidden helper team has been initialized, we need to deinit it
6460   if (TCR_4(__kmp_init_hidden_helper) &&
6461       !TCR_4(__kmp_hidden_helper_team_done)) {
6462     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6463     // First release the main thread to let it continue its work
6464     __kmp_hidden_helper_main_thread_release();
6465     // Wait until the hidden helper team has been destroyed
6466     __kmp_hidden_helper_threads_deinitz_wait();
6467   }
6468
6469   KMP_MB(); /* Flush all pending memory write invalidates.  */
6470   /* find out who we are and what we should do */
6471   {
6472     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6473     KA_TRACE(
6474         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6475     if (gtid == KMP_GTID_SHUTDOWN) {
6476       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6477                     "already shutdown\n"));
6478       return;
6479     } else if (gtid == KMP_GTID_MONITOR) {
6480       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6481                     "registered, or system shutdown\n"));
6482       return;
6483     } else if (gtid == KMP_GTID_DNE) {
6484       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6485                     "shutdown\n"));
6486       /* we don't know who we are, but we may still shutdown the library */
6487     } else if (KMP_UBER_GTID(gtid)) {
6488       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6489       if (__kmp_root[gtid]->r.r_active) {
6490         __kmp_global.g.g_abort = -1;
6491         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6492         __kmp_unregister_library();
6493         KA_TRACE(10,
6494                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6495                   gtid));
6496         return;
6497       } else {
6498         __kmp_itthash_clean(__kmp_threads[gtid]);
6499         KA_TRACE(
6500             10,
6501             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6502         __kmp_unregister_root_current_thread(gtid);
6503       }
6504     } else {
6505 /* worker threads may call this function through the atexit handler, if they
6506  * call exit() */
6507 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6508    TODO: do a thorough shutdown instead */
6509 #ifdef DUMP_DEBUG_ON_EXIT
6510       if (__kmp_debug_buf)
6511         __kmp_dump_debug_buffer();
6512 #endif
6513       // added unregister library call here when we switch to shm linux
6514       // if we don't, it will leave lots of files in /dev/shm
6515       // cleanup shared memory file before exiting.
6516       __kmp_unregister_library();
6517       return;
6518     }
6519   }
6520   /* synchronize the termination process */
6521   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6522
6523   /* have we already finished */
6524   if (__kmp_global.g.g_abort) {
6525     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6526     /* TODO abort? */
6527     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6528     return;
6529   }
6530   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6531     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6532     return;
6533   }
6534
6535   /* We need this lock to enforce mutex between this reading of
6536      __kmp_threads_capacity and the writing by __kmp_register_root.
6537      Alternatively, we can use a counter of roots that is atomically updated by
6538      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6539      __kmp_internal_end_*.  */
6540   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6541
6542   /* now we can safely conduct the actual termination */
6543   __kmp_internal_end();
6544
6545   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6546   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6547
6548   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6549
6550 #ifdef DUMP_DEBUG_ON_EXIT
6551   if (__kmp_debug_buf)
6552     __kmp_dump_debug_buffer();
6553 #endif
6554
6555 #if KMP_OS_WINDOWS
6556   __kmp_close_console();
6557 #endif
6558
6559   __kmp_fini_allocator();
6560
6561 } // __kmp_internal_end_library
6562
6563 void __kmp_internal_end_thread(int gtid_req) {
6564   int i;
6565
6566   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6567   /* this shouldn't be a race condition because __kmp_internal_end() is the
6568    * only place to clear __kmp_serial_init */
6569   /* we'll check this later too, after we get the lock */
6570   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6571   // redundant, because the next check will work in any case.
6572   if (__kmp_global.g.g_abort) {
6573     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6574     /* TODO abort? */
6575     return;
6576   }
6577   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6578     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6579     return;
6580   }
6581
6582   // If hidden helper team has been initialized, we need to deinit it
6583   if (TCR_4(__kmp_init_hidden_helper) &&
6584       !TCR_4(__kmp_hidden_helper_team_done)) {
6585     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6586     // First release the main thread to let it continue its work
6587     __kmp_hidden_helper_main_thread_release();
6588     // Wait until the hidden helper team has been destroyed
6589     __kmp_hidden_helper_threads_deinitz_wait();
6590   }
6591
6592   KMP_MB(); /* Flush all pending memory write invalidates.  */
6593
6594   /* find out who we are and what we should do */
6595   {
6596     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6597     KA_TRACE(10,
6598              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6599     if (gtid == KMP_GTID_SHUTDOWN) {
6600       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6601                     "already shutdown\n"));
6602       return;
6603     } else if (gtid == KMP_GTID_MONITOR) {
6604       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6605                     "registered, or system shutdown\n"));
6606       return;
6607     } else if (gtid == KMP_GTID_DNE) {
6608       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6609                     "shutdown\n"));
6610       return;
6611       /* we don't know who we are */
6612     } else if (KMP_UBER_GTID(gtid)) {
6613       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6614       if (__kmp_root[gtid]->r.r_active) {
6615         __kmp_global.g.g_abort = -1;
6616         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6617         KA_TRACE(10,
6618                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6619                   gtid));
6620         return;
6621       } else {
6622         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6623                       gtid));
6624         __kmp_unregister_root_current_thread(gtid);
6625       }
6626     } else {
6627       /* just a worker thread, let's leave */
6628       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6629
6630       if (gtid >= 0) {
6631         __kmp_threads[gtid]->th.th_task_team = NULL;
6632       }
6633
6634       KA_TRACE(10,
6635                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6636                 gtid));
6637       return;
6638     }
6639   }
6640 #if KMP_DYNAMIC_LIB
6641   if (__kmp_pause_status != kmp_hard_paused)
6642   // AC: lets not shutdown the dynamic library at the exit of uber thread,
6643   // because we will better shutdown later in the library destructor.
6644   {
6645     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6646     return;
6647   }
6648 #endif
6649   /* synchronize the termination process */
6650   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6651
6652   /* have we already finished */
6653   if (__kmp_global.g.g_abort) {
6654     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6655     /* TODO abort? */
6656     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6657     return;
6658   }
6659   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6660     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6661     return;
6662   }
6663
6664   /* We need this lock to enforce mutex between this reading of
6665      __kmp_threads_capacity and the writing by __kmp_register_root.
6666      Alternatively, we can use a counter of roots that is atomically updated by
6667      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6668      __kmp_internal_end_*.  */
6669
6670   /* should we finish the run-time?  are all siblings done? */
6671   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6672
6673   for (i = 0; i < __kmp_threads_capacity; ++i) {
6674     if (KMP_UBER_GTID(i)) {
6675       KA_TRACE(
6676           10,
6677           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6678       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6679       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6680       return;
6681     }
6682   }
6683
6684   /* now we can safely conduct the actual termination */
6685
6686   __kmp_internal_end();
6687
6688   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6689   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6690
6691   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6692
6693 #ifdef DUMP_DEBUG_ON_EXIT
6694   if (__kmp_debug_buf)
6695     __kmp_dump_debug_buffer();
6696 #endif
6697 } // __kmp_internal_end_thread
6698
6699 // -----------------------------------------------------------------------------
6700 // Library registration stuff.
6701
6702 static long __kmp_registration_flag = 0;
6703 // Random value used to indicate library initialization.
6704 static char *__kmp_registration_str = NULL;
6705 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6706
6707 static inline char *__kmp_reg_status_name() {
6708 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6709    each thread. If registration and unregistration go in different threads
6710    (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6711    env var can not be found, because the name will contain different pid. */
6712 // macOS* complains about name being too long with additional getuid()
6713 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6714   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6715                           (int)getuid());
6716 #else
6717   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6718 #endif
6719 } // __kmp_reg_status_get
6720
6721 #if defined(KMP_USE_SHM)
6722 bool __kmp_shm_available = false;
6723 bool __kmp_tmp_available = false;
6724 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6725 char *temp_reg_status_file_name = nullptr;
6726 #endif
6727
6728 void __kmp_register_library_startup(void) {
6729
6730   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6731   int done = 0;
6732   union {
6733     double dtime;
6734     long ltime;
6735   } time;
6736 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6737   __kmp_initialize_system_tick();
6738 #endif
6739   __kmp_read_system_time(&time.dtime);
6740   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6741   __kmp_registration_str =
6742       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6743                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6744
6745   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6746                 __kmp_registration_str));
6747
6748   while (!done) {
6749
6750     char *value = NULL; // Actual value of the environment variable.
6751
6752 #if defined(KMP_USE_SHM)
6753     char *shm_name = nullptr;
6754     char *data1 = nullptr;
6755     __kmp_shm_available = __kmp_detect_shm();
6756     if (__kmp_shm_available) {
6757       int fd1 = -1;
6758       shm_name = __kmp_str_format("/%s", name);
6759       int shm_preexist = 0;
6760       fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6761       if ((fd1 == -1) && (errno == EEXIST)) {
6762         // file didn't open because it already exists.
6763         // try opening existing file
6764         fd1 = shm_open(shm_name, O_RDWR, 0666);
6765         if (fd1 == -1) { // file didn't open
6766           KMP_WARNING(FunctionError, "Can't open SHM");
6767           __kmp_shm_available = false;
6768         } else { // able to open existing file
6769           shm_preexist = 1;
6770         }
6771       }
6772       if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6773         if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6774           KMP_WARNING(FunctionError, "Can't set size of SHM");
6775           __kmp_shm_available = false;
6776         }
6777       }
6778       if (__kmp_shm_available) { // SHM exists, now map it
6779         data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6780                              fd1, 0);
6781         if (data1 == MAP_FAILED) { // failed to map shared memory
6782           KMP_WARNING(FunctionError, "Can't map SHM");
6783           __kmp_shm_available = false;
6784         }
6785       }
6786       if (__kmp_shm_available) { // SHM mapped
6787         if (shm_preexist == 0) { // set data to SHM, set value
6788           KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6789         }
6790         // Read value from either what we just wrote or existing file.
6791         value = __kmp_str_format("%s", data1); // read value from SHM
6792         munmap(data1, SHM_SIZE);
6793       }
6794       if (fd1 != -1)
6795         close(fd1);
6796     }
6797     if (!__kmp_shm_available)
6798       __kmp_tmp_available = __kmp_detect_tmp();
6799     if (!__kmp_shm_available && __kmp_tmp_available) {
6800       // SHM failed to work due to an error other than that the file already
6801       // exists. Try to create a temp file under /tmp.
6802       // If /tmp isn't accessible, fall back to using environment variable.
6803       // TODO: /tmp might not always be the temporary directory. For now we will
6804       // not consider TMPDIR.
6805       int fd1 = -1;
6806       temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6807       int tmp_preexist = 0;
6808       fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6809       if ((fd1 == -1) && (errno == EEXIST)) {
6810         // file didn't open because it already exists.
6811         // try opening existing file
6812         fd1 = open(temp_reg_status_file_name, O_RDWR, 0666);
6813         if (fd1 == -1) { // file didn't open if (fd1 == -1) {
6814           KMP_WARNING(FunctionError, "Can't open TEMP");
6815           __kmp_tmp_available = false;
6816         } else {
6817           tmp_preexist = 1;
6818         }
6819       }
6820       if (__kmp_tmp_available && tmp_preexist == 0) {
6821         // we created /tmp file now set size
6822         if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6823           KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6824           __kmp_tmp_available = false;
6825         }
6826       }
6827       if (__kmp_tmp_available) {
6828         data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6829                              fd1, 0);
6830         if (data1 == MAP_FAILED) { // failed to map /tmp
6831           KMP_WARNING(FunctionError, "Can't map /tmp");
6832           __kmp_tmp_available = false;
6833         }
6834       }
6835       if (__kmp_tmp_available) {
6836         if (tmp_preexist == 0) { // set data to TMP, set value
6837           KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6838         }
6839         // Read value from either what we just wrote or existing file.
6840         value = __kmp_str_format("%s", data1); // read value from SHM
6841         munmap(data1, SHM_SIZE);
6842       }
6843       if (fd1 != -1)
6844         close(fd1);
6845     }
6846     if (!__kmp_shm_available && !__kmp_tmp_available) {
6847       // no /dev/shm and no /tmp -- fall back to environment variable
6848       // Set environment variable, but do not overwrite if it exists.
6849       __kmp_env_set(name, __kmp_registration_str, 0);
6850       // read value to see if it got set
6851       value = __kmp_env_get(name);
6852     }
6853 #else // Windows and unix with static library
6854     // Set environment variable, but do not overwrite if it exists.
6855     __kmp_env_set(name, __kmp_registration_str, 0);
6856     // read value to see if it got set
6857     value = __kmp_env_get(name);
6858 #endif
6859
6860     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6861       done = 1; // Ok, environment variable set successfully, exit the loop.
6862     } else {
6863       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6864       // Check whether it alive or dead.
6865       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6866       char *tail = value;
6867       char *flag_addr_str = NULL;
6868       char *flag_val_str = NULL;
6869       char const *file_name = NULL;
6870       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6871       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6872       file_name = tail;
6873       if (tail != NULL) {
6874         unsigned long *flag_addr = 0;
6875         unsigned long flag_val = 0;
6876         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6877         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6878         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6879           // First, check whether environment-encoded address is mapped into
6880           // addr space.
6881           // If so, dereference it to see if it still has the right value.
6882           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6883             neighbor = 1;
6884           } else {
6885             // If not, then we know the other copy of the library is no longer
6886             // running.
6887             neighbor = 2;
6888           }
6889         }
6890       }
6891       switch (neighbor) {
6892       case 0: // Cannot parse environment variable -- neighbor status unknown.
6893         // Assume it is the incompatible format of future version of the
6894         // library. Assume the other library is alive.
6895         // WARN( ... ); // TODO: Issue a warning.
6896         file_name = "unknown library";
6897         KMP_FALLTHROUGH();
6898       // Attention! Falling to the next case. That's intentional.
6899       case 1: { // Neighbor is alive.
6900         // Check it is allowed.
6901         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6902         if (!__kmp_str_match_true(duplicate_ok)) {
6903           // That's not allowed. Issue fatal error.
6904           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6905                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6906         }
6907         KMP_INTERNAL_FREE(duplicate_ok);
6908         __kmp_duplicate_library_ok = 1;
6909         done = 1; // Exit the loop.
6910       } break;
6911       case 2: { // Neighbor is dead.
6912
6913 #if defined(KMP_USE_SHM)
6914         if (__kmp_shm_available) { // close shared memory.
6915           shm_unlink(shm_name); // this removes file in /dev/shm
6916         } else if (__kmp_tmp_available) {
6917           unlink(temp_reg_status_file_name); // this removes the temp file
6918         } else {
6919           // Clear the variable and try to register library again.
6920           __kmp_env_unset(name);
6921         }
6922 #else
6923         // Clear the variable and try to register library again.
6924         __kmp_env_unset(name);
6925 #endif
6926       } break;
6927       default: {
6928         KMP_DEBUG_ASSERT(0);
6929       } break;
6930       }
6931     }
6932     KMP_INTERNAL_FREE((void *)value);
6933 #if defined(KMP_USE_SHM)
6934     if (shm_name)
6935       KMP_INTERNAL_FREE((void *)shm_name);
6936 #endif
6937   } // while
6938   KMP_INTERNAL_FREE((void *)name);
6939
6940 } // func __kmp_register_library_startup
6941
6942 void __kmp_unregister_library(void) {
6943
6944   char *name = __kmp_reg_status_name();
6945   char *value = NULL;
6946
6947 #if defined(KMP_USE_SHM)
6948   char *shm_name = nullptr;
6949   int fd1;
6950   if (__kmp_shm_available) {
6951     shm_name = __kmp_str_format("/%s", name);
6952     fd1 = shm_open(shm_name, O_RDONLY, 0666);
6953     if (fd1 != -1) { // File opened successfully
6954       char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6955       if (data1 != MAP_FAILED) {
6956         value = __kmp_str_format("%s", data1); // read value from SHM
6957         munmap(data1, SHM_SIZE);
6958       }
6959       close(fd1);
6960     }
6961   } else if (__kmp_tmp_available) { // try /tmp
6962     fd1 = open(temp_reg_status_file_name, O_RDONLY);
6963     if (fd1 != -1) { // File opened successfully
6964       char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6965       if (data1 != MAP_FAILED) {
6966         value = __kmp_str_format("%s", data1); // read value from /tmp
6967         munmap(data1, SHM_SIZE);
6968       }
6969       close(fd1);
6970     }
6971   } else { // fall back to envirable
6972     value = __kmp_env_get(name);
6973   }
6974 #else
6975   value = __kmp_env_get(name);
6976 #endif
6977
6978   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6979   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6980   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6981 //  Ok, this is our variable. Delete it.
6982 #if defined(KMP_USE_SHM)
6983     if (__kmp_shm_available) {
6984       shm_unlink(shm_name); // this removes file in /dev/shm
6985     } else if (__kmp_tmp_available) {
6986       unlink(temp_reg_status_file_name); // this removes the temp file
6987     } else {
6988       __kmp_env_unset(name);
6989     }
6990 #else
6991     __kmp_env_unset(name);
6992 #endif
6993   }
6994
6995 #if defined(KMP_USE_SHM)
6996   if (shm_name)
6997     KMP_INTERNAL_FREE(shm_name);
6998   if (temp_reg_status_file_name)
6999     KMP_INTERNAL_FREE(temp_reg_status_file_name);
7000 #endif
7001
7002   KMP_INTERNAL_FREE(__kmp_registration_str);
7003   KMP_INTERNAL_FREE(value);
7004   KMP_INTERNAL_FREE(name);
7005
7006   __kmp_registration_flag = 0;
7007   __kmp_registration_str = NULL;
7008
7009 } // __kmp_unregister_library
7010
7011 // End of Library registration stuff.
7012 // -----------------------------------------------------------------------------
7013
7014 #if KMP_MIC_SUPPORTED
7015
7016 static void __kmp_check_mic_type() {
7017   kmp_cpuid_t cpuid_state = {0};
7018   kmp_cpuid_t *cs_p = &cpuid_state;
7019   __kmp_x86_cpuid(1, 0, cs_p);
7020   // We don't support mic1 at the moment
7021   if ((cs_p->eax & 0xff0) == 0xB10) {
7022     __kmp_mic_type = mic2;
7023   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7024     __kmp_mic_type = mic3;
7025   } else {
7026     __kmp_mic_type = non_mic;
7027   }
7028 }
7029
7030 #endif /* KMP_MIC_SUPPORTED */
7031
7032 #if KMP_HAVE_UMWAIT
7033 static void __kmp_user_level_mwait_init() {
7034   struct kmp_cpuid buf;
7035   __kmp_x86_cpuid(7, 0, &buf);
7036   __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7037   __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7038   __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7039   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7040                 __kmp_umwait_enabled));
7041 }
7042 #elif KMP_HAVE_MWAIT
7043 #ifndef AT_INTELPHIUSERMWAIT
7044 // Spurious, non-existent value that should always fail to return anything.
7045 // Will be replaced with the correct value when we know that.
7046 #define AT_INTELPHIUSERMWAIT 10000
7047 #endif
7048 // getauxval() function is available in RHEL7 and SLES12. If a system with an
7049 // earlier OS is used to build the RTL, we'll use the following internal
7050 // function when the entry is not found.
7051 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7052 unsigned long getauxval(unsigned long) { return 0; }
7053
7054 static void __kmp_user_level_mwait_init() {
7055   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7056   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7057   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7058   // KMP_USER_LEVEL_MWAIT was set to TRUE.
7059   if (__kmp_mic_type == mic3) {
7060     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7061     if ((res & 0x1) || __kmp_user_level_mwait) {
7062       __kmp_mwait_enabled = TRUE;
7063       if (__kmp_user_level_mwait) {
7064         KMP_INFORM(EnvMwaitWarn);
7065       }
7066     } else {
7067       __kmp_mwait_enabled = FALSE;
7068     }
7069   }
7070   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7071                 "__kmp_mwait_enabled = %d\n",
7072                 __kmp_mic_type, __kmp_mwait_enabled));
7073 }
7074 #endif /* KMP_HAVE_UMWAIT */
7075
7076 static void __kmp_do_serial_initialize(void) {
7077   int i, gtid;
7078   size_t size;
7079
7080   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7081
7082   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7083   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7084   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7085   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7086   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7087
7088 #if OMPT_SUPPORT
7089   ompt_pre_init();
7090 #endif
7091 #if OMPD_SUPPORT
7092   __kmp_env_dump();
7093   ompd_init();
7094 #endif
7095
7096   __kmp_validate_locks();
7097
7098 #if ENABLE_LIBOMPTARGET
7099   /* Initialize functions from libomptarget */
7100   __kmp_init_omptarget();
7101 #endif
7102
7103   /* Initialize internal memory allocator */
7104   __kmp_init_allocator();
7105
7106   /* Register the library startup via an environment variable or via mapped
7107      shared memory file and check to see whether another copy of the library is
7108      already registered. Since forked child process is often terminated, we
7109      postpone the registration till middle initialization in the child */
7110   if (__kmp_need_register_serial)
7111     __kmp_register_library_startup();
7112
7113   /* TODO reinitialization of library */
7114   if (TCR_4(__kmp_global.g.g_done)) {
7115     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7116   }
7117
7118   __kmp_global.g.g_abort = 0;
7119   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7120
7121 /* initialize the locks */
7122 #if KMP_USE_ADAPTIVE_LOCKS
7123 #if KMP_DEBUG_ADAPTIVE_LOCKS
7124   __kmp_init_speculative_stats();
7125 #endif
7126 #endif
7127 #if KMP_STATS_ENABLED
7128   __kmp_stats_init();
7129 #endif
7130   __kmp_init_lock(&__kmp_global_lock);
7131   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7132   __kmp_init_lock(&__kmp_debug_lock);
7133   __kmp_init_atomic_lock(&__kmp_atomic_lock);
7134   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7135   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7136   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7137   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7138   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7139   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7140   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7141   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7142   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7143   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7144   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7145   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7146   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7147   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7148 #if KMP_USE_MONITOR
7149   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7150 #endif
7151   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7152
7153   /* conduct initialization and initial setup of configuration */
7154
7155   __kmp_runtime_initialize();
7156
7157 #if KMP_MIC_SUPPORTED
7158   __kmp_check_mic_type();
7159 #endif
7160
7161 // Some global variable initialization moved here from kmp_env_initialize()
7162 #ifdef KMP_DEBUG
7163   kmp_diag = 0;
7164 #endif
7165   __kmp_abort_delay = 0;
7166
7167   // From __kmp_init_dflt_team_nth()
7168   /* assume the entire machine will be used */
7169   __kmp_dflt_team_nth_ub = __kmp_xproc;
7170   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7171     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7172   }
7173   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7174     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7175   }
7176   __kmp_max_nth = __kmp_sys_max_nth;
7177   __kmp_cg_max_nth = __kmp_sys_max_nth;
7178   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7179   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7180     __kmp_teams_max_nth = __kmp_sys_max_nth;
7181   }
7182
7183   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7184   // part
7185   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7186 #if KMP_USE_MONITOR
7187   __kmp_monitor_wakeups =
7188       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7189   __kmp_bt_intervals =
7190       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7191 #endif
7192   // From "KMP_LIBRARY" part of __kmp_env_initialize()
7193   __kmp_library = library_throughput;
7194   // From KMP_SCHEDULE initialization
7195   __kmp_static = kmp_sch_static_balanced;
7196 // AC: do not use analytical here, because it is non-monotonous
7197 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7198 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7199 // need to repeat assignment
7200 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7201 // bit control and barrier method control parts
7202 #if KMP_FAST_REDUCTION_BARRIER
7203 #define kmp_reduction_barrier_gather_bb ((int)1)
7204 #define kmp_reduction_barrier_release_bb ((int)1)
7205 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7206 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7207 #endif // KMP_FAST_REDUCTION_BARRIER
7208   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7209     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7210     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7211     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7212     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7213 #if KMP_FAST_REDUCTION_BARRIER
7214     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7215       // lin_64 ): hyper,1
7216       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7217       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7218       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7219       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7220     }
7221 #endif // KMP_FAST_REDUCTION_BARRIER
7222   }
7223 #if KMP_FAST_REDUCTION_BARRIER
7224 #undef kmp_reduction_barrier_release_pat
7225 #undef kmp_reduction_barrier_gather_pat
7226 #undef kmp_reduction_barrier_release_bb
7227 #undef kmp_reduction_barrier_gather_bb
7228 #endif // KMP_FAST_REDUCTION_BARRIER
7229 #if KMP_MIC_SUPPORTED
7230   if (__kmp_mic_type == mic2) { // KNC
7231     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7232     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7233     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7234         1; // forkjoin release
7235     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7236     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7237   }
7238 #if KMP_FAST_REDUCTION_BARRIER
7239   if (__kmp_mic_type == mic2) { // KNC
7240     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7241     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7242   }
7243 #endif // KMP_FAST_REDUCTION_BARRIER
7244 #endif // KMP_MIC_SUPPORTED
7245
7246 // From KMP_CHECKS initialization
7247 #ifdef KMP_DEBUG
7248   __kmp_env_checks = TRUE; /* development versions have the extra checks */
7249 #else
7250   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7251 #endif
7252
7253   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7254   __kmp_foreign_tp = TRUE;
7255
7256   __kmp_global.g.g_dynamic = FALSE;
7257   __kmp_global.g.g_dynamic_mode = dynamic_default;
7258
7259   __kmp_init_nesting_mode();
7260
7261   __kmp_env_initialize(NULL);
7262
7263 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7264   __kmp_user_level_mwait_init();
7265 #endif
7266 // Print all messages in message catalog for testing purposes.
7267 #ifdef KMP_DEBUG
7268   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7269   if (__kmp_str_match_true(val)) {
7270     kmp_str_buf_t buffer;
7271     __kmp_str_buf_init(&buffer);
7272     __kmp_i18n_dump_catalog(&buffer);
7273     __kmp_printf("%s", buffer.str);
7274     __kmp_str_buf_free(&buffer);
7275   }
7276   __kmp_env_free(&val);
7277 #endif
7278
7279   __kmp_threads_capacity =
7280       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7281   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7282   __kmp_tp_capacity = __kmp_default_tp_capacity(
7283       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7284
7285   // If the library is shut down properly, both pools must be NULL. Just in
7286   // case, set them to NULL -- some memory may leak, but subsequent code will
7287   // work even if pools are not freed.
7288   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7289   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7290   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7291   __kmp_thread_pool = NULL;
7292   __kmp_thread_pool_insert_pt = NULL;
7293   __kmp_team_pool = NULL;
7294
7295   /* Allocate all of the variable sized records */
7296   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7297    * expandable */
7298   /* Since allocation is cache-aligned, just add extra padding at the end */
7299   size =
7300       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7301       CACHE_LINE;
7302   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7303   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7304                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
7305
7306   /* init thread counts */
7307   KMP_DEBUG_ASSERT(__kmp_all_nth ==
7308                    0); // Asserts fail if the library is reinitializing and
7309   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7310   __kmp_all_nth = 0;
7311   __kmp_nth = 0;
7312
7313   /* setup the uber master thread and hierarchy */
7314   gtid = __kmp_register_root(TRUE);
7315   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
7316   KMP_ASSERT(KMP_UBER_GTID(gtid));
7317   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7318
7319   KMP_MB(); /* Flush all pending memory write invalidates.  */
7320
7321   __kmp_common_initialize();
7322
7323 #if KMP_OS_UNIX
7324   /* invoke the child fork handler */
7325   __kmp_register_atfork();
7326 #endif
7327
7328 #if !KMP_DYNAMIC_LIB ||                                                        \
7329     ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7330   {
7331     /* Invoke the exit handler when the program finishes, only for static
7332        library and macOS* dynamic. For other dynamic libraries, we already
7333        have _fini and DllMain. */
7334     int rc = atexit(__kmp_internal_end_atexit);
7335     if (rc != 0) {
7336       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7337                   __kmp_msg_null);
7338     }
7339   }
7340 #endif
7341
7342 #if KMP_HANDLE_SIGNALS
7343 #if KMP_OS_UNIX
7344   /* NOTE: make sure that this is called before the user installs their own
7345      signal handlers so that the user handlers are called first. this way they
7346      can return false, not call our handler, avoid terminating the library, and
7347      continue execution where they left off. */
7348   __kmp_install_signals(FALSE);
7349 #endif /* KMP_OS_UNIX */
7350 #if KMP_OS_WINDOWS
7351   __kmp_install_signals(TRUE);
7352 #endif /* KMP_OS_WINDOWS */
7353 #endif
7354
7355   /* we have finished the serial initialization */
7356   __kmp_init_counter++;
7357
7358   __kmp_init_serial = TRUE;
7359
7360   if (__kmp_version) {
7361     __kmp_print_version_1();
7362   }
7363
7364   if (__kmp_settings) {
7365     __kmp_env_print();
7366   }
7367
7368   if (__kmp_display_env || __kmp_display_env_verbose) {
7369     __kmp_env_print_2();
7370   }
7371
7372 #if OMPT_SUPPORT
7373   ompt_post_init();
7374 #endif
7375
7376   KMP_MB();
7377
7378   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7379 }
7380
7381 void __kmp_serial_initialize(void) {
7382   if (__kmp_init_serial) {
7383     return;
7384   }
7385   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7386   if (__kmp_init_serial) {
7387     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7388     return;
7389   }
7390   __kmp_do_serial_initialize();
7391   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7392 }
7393
7394 static void __kmp_do_middle_initialize(void) {
7395   int i, j;
7396   int prev_dflt_team_nth;
7397
7398   if (!__kmp_init_serial) {
7399     __kmp_do_serial_initialize();
7400   }
7401
7402   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7403
7404   if (UNLIKELY(!__kmp_need_register_serial)) {
7405     // We are in a forked child process. The registration was skipped during
7406     // serial initialization in __kmp_atfork_child handler. Do it here.
7407     __kmp_register_library_startup();
7408   }
7409
7410   // Save the previous value for the __kmp_dflt_team_nth so that
7411   // we can avoid some reinitialization if it hasn't changed.
7412   prev_dflt_team_nth = __kmp_dflt_team_nth;
7413
7414 #if KMP_AFFINITY_SUPPORTED
7415   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7416   // number of cores on the machine.
7417   __kmp_affinity_initialize(__kmp_affinity);
7418
7419 #endif /* KMP_AFFINITY_SUPPORTED */
7420
7421   KMP_ASSERT(__kmp_xproc > 0);
7422   if (__kmp_avail_proc == 0) {
7423     __kmp_avail_proc = __kmp_xproc;
7424   }
7425
7426   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7427   // correct them now
7428   j = 0;
7429   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7430     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7431         __kmp_avail_proc;
7432     j++;
7433   }
7434
7435   if (__kmp_dflt_team_nth == 0) {
7436 #ifdef KMP_DFLT_NTH_CORES
7437     // Default #threads = #cores
7438     __kmp_dflt_team_nth = __kmp_ncores;
7439     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7440                   "__kmp_ncores (%d)\n",
7441                   __kmp_dflt_team_nth));
7442 #else
7443     // Default #threads = #available OS procs
7444     __kmp_dflt_team_nth = __kmp_avail_proc;
7445     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7446                   "__kmp_avail_proc(%d)\n",
7447                   __kmp_dflt_team_nth));
7448 #endif /* KMP_DFLT_NTH_CORES */
7449   }
7450
7451   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7452     __kmp_dflt_team_nth = KMP_MIN_NTH;
7453   }
7454   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7455     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7456   }
7457
7458   if (__kmp_nesting_mode > 0)
7459     __kmp_set_nesting_mode_threads();
7460
7461   // There's no harm in continuing if the following check fails,
7462   // but it indicates an error in the previous logic.
7463   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7464
7465   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7466     // Run through the __kmp_threads array and set the num threads icv for each
7467     // root thread that is currently registered with the RTL (which has not
7468     // already explicitly set its nthreads-var with a call to
7469     // omp_set_num_threads()).
7470     for (i = 0; i < __kmp_threads_capacity; i++) {
7471       kmp_info_t *thread = __kmp_threads[i];
7472       if (thread == NULL)
7473         continue;
7474       if (thread->th.th_current_task->td_icvs.nproc != 0)
7475         continue;
7476
7477       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7478     }
7479   }
7480   KA_TRACE(
7481       20,
7482       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7483        __kmp_dflt_team_nth));
7484
7485 #ifdef KMP_ADJUST_BLOCKTIME
7486   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7487   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7488     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7489     if (__kmp_nth > __kmp_avail_proc) {
7490       __kmp_zero_bt = TRUE;
7491     }
7492   }
7493 #endif /* KMP_ADJUST_BLOCKTIME */
7494
7495   /* we have finished middle initialization */
7496   TCW_SYNC_4(__kmp_init_middle, TRUE);
7497
7498   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7499 }
7500
7501 void __kmp_middle_initialize(void) {
7502   if (__kmp_init_middle) {
7503     return;
7504   }
7505   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7506   if (__kmp_init_middle) {
7507     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7508     return;
7509   }
7510   __kmp_do_middle_initialize();
7511   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7512 }
7513
7514 void __kmp_parallel_initialize(void) {
7515   int gtid = __kmp_entry_gtid(); // this might be a new root
7516
7517   /* synchronize parallel initialization (for sibling) */
7518   if (TCR_4(__kmp_init_parallel))
7519     return;
7520   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7521   if (TCR_4(__kmp_init_parallel)) {
7522     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7523     return;
7524   }
7525
7526   /* TODO reinitialization after we have already shut down */
7527   if (TCR_4(__kmp_global.g.g_done)) {
7528     KA_TRACE(
7529         10,
7530         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7531     __kmp_infinite_loop();
7532   }
7533
7534   /* jc: The lock __kmp_initz_lock is already held, so calling
7535      __kmp_serial_initialize would cause a deadlock.  So we call
7536      __kmp_do_serial_initialize directly. */
7537   if (!__kmp_init_middle) {
7538     __kmp_do_middle_initialize();
7539   }
7540   __kmp_assign_root_init_mask();
7541   __kmp_resume_if_hard_paused();
7542
7543   /* begin initialization */
7544   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7545   KMP_ASSERT(KMP_UBER_GTID(gtid));
7546
7547 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7548   // Save the FP control regs.
7549   // Worker threads will set theirs to these values at thread startup.
7550   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7551   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7552   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7553 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7554
7555 #if KMP_OS_UNIX
7556 #if KMP_HANDLE_SIGNALS
7557   /*  must be after __kmp_serial_initialize  */
7558   __kmp_install_signals(TRUE);
7559 #endif
7560 #endif
7561
7562   __kmp_suspend_initialize();
7563
7564 #if defined(USE_LOAD_BALANCE)
7565   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7566     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7567   }
7568 #else
7569   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7570     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7571   }
7572 #endif
7573
7574   if (__kmp_version) {
7575     __kmp_print_version_2();
7576   }
7577
7578   /* we have finished parallel initialization */
7579   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7580
7581   KMP_MB();
7582   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7583
7584   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7585 }
7586
7587 void __kmp_hidden_helper_initialize() {
7588   if (TCR_4(__kmp_init_hidden_helper))
7589     return;
7590
7591   // __kmp_parallel_initialize is required before we initialize hidden helper
7592   if (!TCR_4(__kmp_init_parallel))
7593     __kmp_parallel_initialize();
7594
7595   // Double check. Note that this double check should not be placed before
7596   // __kmp_parallel_initialize as it will cause dead lock.
7597   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7598   if (TCR_4(__kmp_init_hidden_helper)) {
7599     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7600     return;
7601   }
7602
7603 #if KMP_AFFINITY_SUPPORTED
7604   // Initialize hidden helper affinity settings.
7605   // The above __kmp_parallel_initialize() will initialize
7606   // regular affinity (and topology) if not already done.
7607   if (!__kmp_hh_affinity.flags.initialized)
7608     __kmp_affinity_initialize(__kmp_hh_affinity);
7609 #endif
7610
7611   // Set the count of hidden helper tasks to be executed to zero
7612   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7613
7614   // Set the global variable indicating that we're initializing hidden helper
7615   // team/threads
7616   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7617
7618   // Platform independent initialization
7619   __kmp_do_initialize_hidden_helper_threads();
7620
7621   // Wait here for the finish of initialization of hidden helper teams
7622   __kmp_hidden_helper_threads_initz_wait();
7623
7624   // We have finished hidden helper initialization
7625   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7626
7627   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7628 }
7629
7630 /* ------------------------------------------------------------------------ */
7631
7632 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7633                                    kmp_team_t *team) {
7634   kmp_disp_t *dispatch;
7635
7636   KMP_MB();
7637
7638   /* none of the threads have encountered any constructs, yet. */
7639   this_thr->th.th_local.this_construct = 0;
7640 #if KMP_CACHE_MANAGE
7641   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7642 #endif /* KMP_CACHE_MANAGE */
7643   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7644   KMP_DEBUG_ASSERT(dispatch);
7645   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7646   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7647   // this_thr->th.th_info.ds.ds_tid ] );
7648
7649   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7650   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7651   if (__kmp_env_consistency_check)
7652     __kmp_push_parallel(gtid, team->t.t_ident);
7653
7654   KMP_MB(); /* Flush all pending memory write invalidates.  */
7655 }
7656
7657 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7658                                   kmp_team_t *team) {
7659   if (__kmp_env_consistency_check)
7660     __kmp_pop_parallel(gtid, team->t.t_ident);
7661
7662   __kmp_finish_implicit_task(this_thr);
7663 }
7664
7665 int __kmp_invoke_task_func(int gtid) {
7666   int rc;
7667   int tid = __kmp_tid_from_gtid(gtid);
7668   kmp_info_t *this_thr = __kmp_threads[gtid];
7669   kmp_team_t *team = this_thr->th.th_team;
7670
7671   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7672 #if USE_ITT_BUILD
7673   if (__itt_stack_caller_create_ptr) {
7674     // inform ittnotify about entering user's code
7675     if (team->t.t_stack_id != NULL) {
7676       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7677     } else {
7678       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7679       __kmp_itt_stack_callee_enter(
7680           (__itt_caller)team->t.t_parent->t.t_stack_id);
7681     }
7682   }
7683 #endif /* USE_ITT_BUILD */
7684 #if INCLUDE_SSC_MARKS
7685   SSC_MARK_INVOKING();
7686 #endif
7687
7688 #if OMPT_SUPPORT
7689   void *dummy;
7690   void **exit_frame_p;
7691   ompt_data_t *my_task_data;
7692   ompt_data_t *my_parallel_data;
7693   int ompt_team_size;
7694
7695   if (ompt_enabled.enabled) {
7696     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7697                          .ompt_task_info.frame.exit_frame.ptr);
7698   } else {
7699     exit_frame_p = &dummy;
7700   }
7701
7702   my_task_data =
7703       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7704   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7705   if (ompt_enabled.ompt_callback_implicit_task) {
7706     ompt_team_size = team->t.t_nproc;
7707     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7708         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7709         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7710     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7711   }
7712 #endif
7713
7714 #if KMP_STATS_ENABLED
7715   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7716   if (previous_state == stats_state_e::TEAMS_REGION) {
7717     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7718   } else {
7719     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7720   }
7721   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7722 #endif
7723
7724   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7725                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7726 #if OMPT_SUPPORT
7727                               ,
7728                               exit_frame_p
7729 #endif
7730   );
7731 #if OMPT_SUPPORT
7732   *exit_frame_p = NULL;
7733   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7734 #endif
7735
7736 #if KMP_STATS_ENABLED
7737   if (previous_state == stats_state_e::TEAMS_REGION) {
7738     KMP_SET_THREAD_STATE(previous_state);
7739   }
7740   KMP_POP_PARTITIONED_TIMER();
7741 #endif
7742
7743 #if USE_ITT_BUILD
7744   if (__itt_stack_caller_create_ptr) {
7745     // inform ittnotify about leaving user's code
7746     if (team->t.t_stack_id != NULL) {
7747       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7748     } else {
7749       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7750       __kmp_itt_stack_callee_leave(
7751           (__itt_caller)team->t.t_parent->t.t_stack_id);
7752     }
7753   }
7754 #endif /* USE_ITT_BUILD */
7755   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7756
7757   return rc;
7758 }
7759
7760 void __kmp_teams_master(int gtid) {
7761   // This routine is called by all primary threads in teams construct
7762   kmp_info_t *thr = __kmp_threads[gtid];
7763   kmp_team_t *team = thr->th.th_team;
7764   ident_t *loc = team->t.t_ident;
7765   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7766   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7767   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7768   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7769                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7770
7771   // This thread is a new CG root.  Set up the proper variables.
7772   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7773   tmp->cg_root = thr; // Make thr the CG root
7774   // Init to thread limit stored when league primary threads were forked
7775   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7776   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7777   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7778                  " cg_nthreads to 1\n",
7779                  thr, tmp));
7780   tmp->up = thr->th.th_cg_roots;
7781   thr->th.th_cg_roots = tmp;
7782
7783 // Launch league of teams now, but not let workers execute
7784 // (they hang on fork barrier until next parallel)
7785 #if INCLUDE_SSC_MARKS
7786   SSC_MARK_FORKING();
7787 #endif
7788   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7789                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7790                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7791 #if INCLUDE_SSC_MARKS
7792   SSC_MARK_JOINING();
7793 #endif
7794   // If the team size was reduced from the limit, set it to the new size
7795   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7796     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7797   // AC: last parameter "1" eliminates join barrier which won't work because
7798   // worker threads are in a fork barrier waiting for more parallel regions
7799   __kmp_join_call(loc, gtid
7800 #if OMPT_SUPPORT
7801                   ,
7802                   fork_context_intel
7803 #endif
7804                   ,
7805                   1);
7806 }
7807
7808 int __kmp_invoke_teams_master(int gtid) {
7809   kmp_info_t *this_thr = __kmp_threads[gtid];
7810   kmp_team_t *team = this_thr->th.th_team;
7811 #if KMP_DEBUG
7812   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7813     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7814                      (void *)__kmp_teams_master);
7815 #endif
7816   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7817 #if OMPT_SUPPORT
7818   int tid = __kmp_tid_from_gtid(gtid);
7819   ompt_data_t *task_data =
7820       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7821   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7822   if (ompt_enabled.ompt_callback_implicit_task) {
7823     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7824         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7825         ompt_task_initial);
7826     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7827   }
7828 #endif
7829   __kmp_teams_master(gtid);
7830 #if OMPT_SUPPORT
7831   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7832 #endif
7833   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7834   return 1;
7835 }
7836
7837 /* this sets the requested number of threads for the next parallel region
7838    encountered by this team. since this should be enclosed in the forkjoin
7839    critical section it should avoid race conditions with asymmetrical nested
7840    parallelism */
7841
7842 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7843   kmp_info_t *thr = __kmp_threads[gtid];
7844
7845   if (num_threads > 0)
7846     thr->th.th_set_nproc = num_threads;
7847 }
7848
7849 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7850                                     int num_threads) {
7851   KMP_DEBUG_ASSERT(thr);
7852   // Remember the number of threads for inner parallel regions
7853   if (!TCR_4(__kmp_init_middle))
7854     __kmp_middle_initialize(); // get internal globals calculated
7855   __kmp_assign_root_init_mask();
7856   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7857   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7858
7859   if (num_threads == 0) {
7860     if (__kmp_teams_thread_limit > 0) {
7861       num_threads = __kmp_teams_thread_limit;
7862     } else {
7863       num_threads = __kmp_avail_proc / num_teams;
7864     }
7865     // adjust num_threads w/o warning as it is not user setting
7866     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7867     // no thread_limit clause specified -  do not change thread-limit-var ICV
7868     if (num_threads > __kmp_dflt_team_nth) {
7869       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7870     }
7871     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7872       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7873     } // prevent team size to exceed thread-limit-var
7874     if (num_teams * num_threads > __kmp_teams_max_nth) {
7875       num_threads = __kmp_teams_max_nth / num_teams;
7876     }
7877     if (num_threads == 0) {
7878       num_threads = 1;
7879     }
7880   } else {
7881     if (num_threads < 0) {
7882       __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7883                 __kmp_msg_null);
7884       num_threads = 1;
7885     }
7886     // This thread will be the primary thread of the league primary threads
7887     // Store new thread limit; old limit is saved in th_cg_roots list
7888     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7889     // num_threads = min(num_threads, nthreads-var)
7890     if (num_threads > __kmp_dflt_team_nth) {
7891       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7892     }
7893     if (num_teams * num_threads > __kmp_teams_max_nth) {
7894       int new_threads = __kmp_teams_max_nth / num_teams;
7895       if (new_threads == 0) {
7896         new_threads = 1;
7897       }
7898       if (new_threads != num_threads) {
7899         if (!__kmp_reserve_warn) { // user asked for too many threads
7900           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7901           __kmp_msg(kmp_ms_warning,
7902                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7903                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7904         }
7905       }
7906       num_threads = new_threads;
7907     }
7908   }
7909   thr->th.th_teams_size.nth = num_threads;
7910 }
7911
7912 /* this sets the requested number of teams for the teams region and/or
7913    the number of threads for the next parallel region encountered  */
7914 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7915                           int num_threads) {
7916   kmp_info_t *thr = __kmp_threads[gtid];
7917   if (num_teams < 0) {
7918     // OpenMP specification requires requested values to be positive,
7919     // but people can send us any value, so we'd better check
7920     __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7921               __kmp_msg_null);
7922     num_teams = 1;
7923   }
7924   if (num_teams == 0) {
7925     if (__kmp_nteams > 0) {
7926       num_teams = __kmp_nteams;
7927     } else {
7928       num_teams = 1; // default number of teams is 1.
7929     }
7930   }
7931   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7932     if (!__kmp_reserve_warn) {
7933       __kmp_reserve_warn = 1;
7934       __kmp_msg(kmp_ms_warning,
7935                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7936                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7937     }
7938     num_teams = __kmp_teams_max_nth;
7939   }
7940   // Set number of teams (number of threads in the outer "parallel" of the
7941   // teams)
7942   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7943
7944   __kmp_push_thread_limit(thr, num_teams, num_threads);
7945 }
7946
7947 /* This sets the requested number of teams for the teams region and/or
7948    the number of threads for the next parallel region encountered  */
7949 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7950                              int num_teams_ub, int num_threads) {
7951   kmp_info_t *thr = __kmp_threads[gtid];
7952   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7953   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7954   KMP_DEBUG_ASSERT(num_threads >= 0);
7955
7956   if (num_teams_lb > num_teams_ub) {
7957     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7958                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7959   }
7960
7961   int num_teams = 1; // defalt number of teams is 1.
7962
7963   if (num_teams_lb == 0 && num_teams_ub > 0)
7964     num_teams_lb = num_teams_ub;
7965
7966   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7967     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7968     if (num_teams > __kmp_teams_max_nth) {
7969       if (!__kmp_reserve_warn) {
7970         __kmp_reserve_warn = 1;
7971         __kmp_msg(kmp_ms_warning,
7972                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7973                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7974       }
7975       num_teams = __kmp_teams_max_nth;
7976     }
7977   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7978     num_teams = num_teams_ub;
7979   } else { // num_teams_lb <= num_teams <= num_teams_ub
7980     if (num_threads <= 0) {
7981       if (num_teams_ub > __kmp_teams_max_nth) {
7982         num_teams = num_teams_lb;
7983       } else {
7984         num_teams = num_teams_ub;
7985       }
7986     } else {
7987       num_teams = (num_threads > __kmp_teams_max_nth)
7988                       ? num_teams
7989                       : __kmp_teams_max_nth / num_threads;
7990       if (num_teams < num_teams_lb) {
7991         num_teams = num_teams_lb;
7992       } else if (num_teams > num_teams_ub) {
7993         num_teams = num_teams_ub;
7994       }
7995     }
7996   }
7997   // Set number of teams (number of threads in the outer "parallel" of the
7998   // teams)
7999   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8000
8001   __kmp_push_thread_limit(thr, num_teams, num_threads);
8002 }
8003
8004 // Set the proc_bind var to use in the following parallel region.
8005 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8006   kmp_info_t *thr = __kmp_threads[gtid];
8007   thr->th.th_set_proc_bind = proc_bind;
8008 }
8009
8010 /* Launch the worker threads into the microtask. */
8011
8012 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8013   kmp_info_t *this_thr = __kmp_threads[gtid];
8014
8015 #ifdef KMP_DEBUG
8016   int f;
8017 #endif /* KMP_DEBUG */
8018
8019   KMP_DEBUG_ASSERT(team);
8020   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8021   KMP_ASSERT(KMP_MASTER_GTID(gtid));
8022   KMP_MB(); /* Flush all pending memory write invalidates.  */
8023
8024   team->t.t_construct = 0; /* no single directives seen yet */
8025   team->t.t_ordered.dt.t_value =
8026       0; /* thread 0 enters the ordered section first */
8027
8028   /* Reset the identifiers on the dispatch buffer */
8029   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8030   if (team->t.t_max_nproc > 1) {
8031     int i;
8032     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8033       team->t.t_disp_buffer[i].buffer_index = i;
8034       team->t.t_disp_buffer[i].doacross_buf_idx = i;
8035     }
8036   } else {
8037     team->t.t_disp_buffer[0].buffer_index = 0;
8038     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8039   }
8040
8041   KMP_MB(); /* Flush all pending memory write invalidates.  */
8042   KMP_ASSERT(this_thr->th.th_team == team);
8043
8044 #ifdef KMP_DEBUG
8045   for (f = 0; f < team->t.t_nproc; f++) {
8046     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8047                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8048   }
8049 #endif /* KMP_DEBUG */
8050
8051   /* release the worker threads so they may begin working */
8052   __kmp_fork_barrier(gtid, 0);
8053 }
8054
8055 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8056   kmp_info_t *this_thr = __kmp_threads[gtid];
8057
8058   KMP_DEBUG_ASSERT(team);
8059   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8060   KMP_ASSERT(KMP_MASTER_GTID(gtid));
8061   KMP_MB(); /* Flush all pending memory write invalidates.  */
8062
8063   /* Join barrier after fork */
8064
8065 #ifdef KMP_DEBUG
8066   if (__kmp_threads[gtid] &&
8067       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8068     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8069                  __kmp_threads[gtid]);
8070     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8071                  "team->t.t_nproc=%d\n",
8072                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8073                  team->t.t_nproc);
8074     __kmp_print_structure();
8075   }
8076   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8077                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8078 #endif /* KMP_DEBUG */
8079
8080   __kmp_join_barrier(gtid); /* wait for everyone */
8081 #if OMPT_SUPPORT
8082   if (ompt_enabled.enabled &&
8083       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8084     int ds_tid = this_thr->th.th_info.ds.ds_tid;
8085     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8086     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8087 #if OMPT_OPTIONAL
8088     void *codeptr = NULL;
8089     if (KMP_MASTER_TID(ds_tid) &&
8090         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8091          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8092       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8093
8094     if (ompt_enabled.ompt_callback_sync_region_wait) {
8095       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8096           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8097           codeptr);
8098     }
8099     if (ompt_enabled.ompt_callback_sync_region) {
8100       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8101           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8102           codeptr);
8103     }
8104 #endif
8105     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8106       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8107           ompt_scope_end, NULL, task_data, 0, ds_tid,
8108           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8109     }
8110   }
8111 #endif
8112
8113   KMP_MB(); /* Flush all pending memory write invalidates.  */
8114   KMP_ASSERT(this_thr->th.th_team == team);
8115 }
8116
8117 /* ------------------------------------------------------------------------ */
8118
8119 #ifdef USE_LOAD_BALANCE
8120
8121 // Return the worker threads actively spinning in the hot team, if we
8122 // are at the outermost level of parallelism.  Otherwise, return 0.
8123 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8124   int i;
8125   int retval;
8126   kmp_team_t *hot_team;
8127
8128   if (root->r.r_active) {
8129     return 0;
8130   }
8131   hot_team = root->r.r_hot_team;
8132   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8133     return hot_team->t.t_nproc - 1; // Don't count primary thread
8134   }
8135
8136   // Skip the primary thread - it is accounted for elsewhere.
8137   retval = 0;
8138   for (i = 1; i < hot_team->t.t_nproc; i++) {
8139     if (hot_team->t.t_threads[i]->th.th_active) {
8140       retval++;
8141     }
8142   }
8143   return retval;
8144 }
8145
8146 // Perform an automatic adjustment to the number of
8147 // threads used by the next parallel region.
8148 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8149   int retval;
8150   int pool_active;
8151   int hot_team_active;
8152   int team_curr_active;
8153   int system_active;
8154
8155   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8156                 set_nproc));
8157   KMP_DEBUG_ASSERT(root);
8158   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8159                        ->th.th_current_task->td_icvs.dynamic == TRUE);
8160   KMP_DEBUG_ASSERT(set_nproc > 1);
8161
8162   if (set_nproc == 1) {
8163     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8164     return 1;
8165   }
8166
8167   // Threads that are active in the thread pool, active in the hot team for this
8168   // particular root (if we are at the outer par level), and the currently
8169   // executing thread (to become the primary thread) are available to add to the
8170   // new team, but are currently contributing to the system load, and must be
8171   // accounted for.
8172   pool_active = __kmp_thread_pool_active_nth;
8173   hot_team_active = __kmp_active_hot_team_nproc(root);
8174   team_curr_active = pool_active + hot_team_active + 1;
8175
8176   // Check the system load.
8177   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8178   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8179                 "hot team active = %d\n",
8180                 system_active, pool_active, hot_team_active));
8181
8182   if (system_active < 0) {
8183     // There was an error reading the necessary info from /proc, so use the
8184     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8185     // = dynamic_thread_limit, we shouldn't wind up getting back here.
8186     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8187     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8188
8189     // Make this call behave like the thread limit algorithm.
8190     retval = __kmp_avail_proc - __kmp_nth +
8191              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8192     if (retval > set_nproc) {
8193       retval = set_nproc;
8194     }
8195     if (retval < KMP_MIN_NTH) {
8196       retval = KMP_MIN_NTH;
8197     }
8198
8199     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8200                   retval));
8201     return retval;
8202   }
8203
8204   // There is a slight delay in the load balance algorithm in detecting new
8205   // running procs. The real system load at this instant should be at least as
8206   // large as the #active omp thread that are available to add to the team.
8207   if (system_active < team_curr_active) {
8208     system_active = team_curr_active;
8209   }
8210   retval = __kmp_avail_proc - system_active + team_curr_active;
8211   if (retval > set_nproc) {
8212     retval = set_nproc;
8213   }
8214   if (retval < KMP_MIN_NTH) {
8215     retval = KMP_MIN_NTH;
8216   }
8217
8218   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8219   return retval;
8220 } // __kmp_load_balance_nproc()
8221
8222 #endif /* USE_LOAD_BALANCE */
8223
8224 /* ------------------------------------------------------------------------ */
8225
8226 /* NOTE: this is called with the __kmp_init_lock held */
8227 void __kmp_cleanup(void) {
8228   int f;
8229
8230   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8231
8232   if (TCR_4(__kmp_init_parallel)) {
8233 #if KMP_HANDLE_SIGNALS
8234     __kmp_remove_signals();
8235 #endif
8236     TCW_4(__kmp_init_parallel, FALSE);
8237   }
8238
8239   if (TCR_4(__kmp_init_middle)) {
8240 #if KMP_AFFINITY_SUPPORTED
8241     __kmp_affinity_uninitialize();
8242 #endif /* KMP_AFFINITY_SUPPORTED */
8243     __kmp_cleanup_hierarchy();
8244     TCW_4(__kmp_init_middle, FALSE);
8245   }
8246
8247   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8248
8249   if (__kmp_init_serial) {
8250     __kmp_runtime_destroy();
8251     __kmp_init_serial = FALSE;
8252   }
8253
8254   __kmp_cleanup_threadprivate_caches();
8255
8256   for (f = 0; f < __kmp_threads_capacity; f++) {
8257     if (__kmp_root[f] != NULL) {
8258       __kmp_free(__kmp_root[f]);
8259       __kmp_root[f] = NULL;
8260     }
8261   }
8262   __kmp_free(__kmp_threads);
8263   // __kmp_threads and __kmp_root were allocated at once, as single block, so
8264   // there is no need in freeing __kmp_root.
8265   __kmp_threads = NULL;
8266   __kmp_root = NULL;
8267   __kmp_threads_capacity = 0;
8268
8269   // Free old __kmp_threads arrays if they exist.
8270   kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8271   while (ptr) {
8272     kmp_old_threads_list_t *next = ptr->next;
8273     __kmp_free(ptr->threads);
8274     __kmp_free(ptr);
8275     ptr = next;
8276   }
8277
8278 #if KMP_USE_DYNAMIC_LOCK
8279   __kmp_cleanup_indirect_user_locks();
8280 #else
8281   __kmp_cleanup_user_locks();
8282 #endif
8283 #if OMPD_SUPPORT
8284   if (ompd_state) {
8285     __kmp_free(ompd_env_block);
8286     ompd_env_block = NULL;
8287     ompd_env_block_size = 0;
8288   }
8289 #endif
8290
8291 #if KMP_AFFINITY_SUPPORTED
8292   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8293   __kmp_cpuinfo_file = NULL;
8294 #endif /* KMP_AFFINITY_SUPPORTED */
8295
8296 #if KMP_USE_ADAPTIVE_LOCKS
8297 #if KMP_DEBUG_ADAPTIVE_LOCKS
8298   __kmp_print_speculative_stats();
8299 #endif
8300 #endif
8301   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8302   __kmp_nested_nth.nth = NULL;
8303   __kmp_nested_nth.size = 0;
8304   __kmp_nested_nth.used = 0;
8305   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8306   __kmp_nested_proc_bind.bind_types = NULL;
8307   __kmp_nested_proc_bind.size = 0;
8308   __kmp_nested_proc_bind.used = 0;
8309   if (__kmp_affinity_format) {
8310     KMP_INTERNAL_FREE(__kmp_affinity_format);
8311     __kmp_affinity_format = NULL;
8312   }
8313
8314   __kmp_i18n_catclose();
8315
8316 #if KMP_USE_HIER_SCHED
8317   __kmp_hier_scheds.deallocate();
8318 #endif
8319
8320 #if KMP_STATS_ENABLED
8321   __kmp_stats_fini();
8322 #endif
8323
8324   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8325 }
8326
8327 /* ------------------------------------------------------------------------ */
8328
8329 int __kmp_ignore_mppbeg(void) {
8330   char *env;
8331
8332   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8333     if (__kmp_str_match_false(env))
8334       return FALSE;
8335   }
8336   // By default __kmpc_begin() is no-op.
8337   return TRUE;
8338 }
8339
8340 int __kmp_ignore_mppend(void) {
8341   char *env;
8342
8343   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8344     if (__kmp_str_match_false(env))
8345       return FALSE;
8346   }
8347   // By default __kmpc_end() is no-op.
8348   return TRUE;
8349 }
8350
8351 void __kmp_internal_begin(void) {
8352   int gtid;
8353   kmp_root_t *root;
8354
8355   /* this is a very important step as it will register new sibling threads
8356      and assign these new uber threads a new gtid */
8357   gtid = __kmp_entry_gtid();
8358   root = __kmp_threads[gtid]->th.th_root;
8359   KMP_ASSERT(KMP_UBER_GTID(gtid));
8360
8361   if (root->r.r_begin)
8362     return;
8363   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8364   if (root->r.r_begin) {
8365     __kmp_release_lock(&root->r.r_begin_lock, gtid);
8366     return;
8367   }
8368
8369   root->r.r_begin = TRUE;
8370
8371   __kmp_release_lock(&root->r.r_begin_lock, gtid);
8372 }
8373
8374 /* ------------------------------------------------------------------------ */
8375
8376 void __kmp_user_set_library(enum library_type arg) {
8377   int gtid;
8378   kmp_root_t *root;
8379   kmp_info_t *thread;
8380
8381   /* first, make sure we are initialized so we can get our gtid */
8382
8383   gtid = __kmp_entry_gtid();
8384   thread = __kmp_threads[gtid];
8385
8386   root = thread->th.th_root;
8387
8388   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8389                 library_serial));
8390   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8391                                   thread */
8392     KMP_WARNING(SetLibraryIncorrectCall);
8393     return;
8394   }
8395
8396   switch (arg) {
8397   case library_serial:
8398     thread->th.th_set_nproc = 0;
8399     set__nproc(thread, 1);
8400     break;
8401   case library_turnaround:
8402     thread->th.th_set_nproc = 0;
8403     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8404                                            : __kmp_dflt_team_nth_ub);
8405     break;
8406   case library_throughput:
8407     thread->th.th_set_nproc = 0;
8408     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8409                                            : __kmp_dflt_team_nth_ub);
8410     break;
8411   default:
8412     KMP_FATAL(UnknownLibraryType, arg);
8413   }
8414
8415   __kmp_aux_set_library(arg);
8416 }
8417
8418 void __kmp_aux_set_stacksize(size_t arg) {
8419   if (!__kmp_init_serial)
8420     __kmp_serial_initialize();
8421
8422 #if KMP_OS_DARWIN
8423   if (arg & (0x1000 - 1)) {
8424     arg &= ~(0x1000 - 1);
8425     if (arg + 0x1000) /* check for overflow if we round up */
8426       arg += 0x1000;
8427   }
8428 #endif
8429   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8430
8431   /* only change the default stacksize before the first parallel region */
8432   if (!TCR_4(__kmp_init_parallel)) {
8433     size_t value = arg; /* argument is in bytes */
8434
8435     if (value < __kmp_sys_min_stksize)
8436       value = __kmp_sys_min_stksize;
8437     else if (value > KMP_MAX_STKSIZE)
8438       value = KMP_MAX_STKSIZE;
8439
8440     __kmp_stksize = value;
8441
8442     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8443   }
8444
8445   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8446 }
8447
8448 /* set the behaviour of the runtime library */
8449 /* TODO this can cause some odd behaviour with sibling parallelism... */
8450 void __kmp_aux_set_library(enum library_type arg) {
8451   __kmp_library = arg;
8452
8453   switch (__kmp_library) {
8454   case library_serial: {
8455     KMP_INFORM(LibraryIsSerial);
8456   } break;
8457   case library_turnaround:
8458     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8459       __kmp_use_yield = 2; // only yield when oversubscribed
8460     break;
8461   case library_throughput:
8462     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8463       __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8464     break;
8465   default:
8466     KMP_FATAL(UnknownLibraryType, arg);
8467   }
8468 }
8469
8470 /* Getting team information common for all team API */
8471 // Returns NULL if not in teams construct
8472 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8473   kmp_info_t *thr = __kmp_entry_thread();
8474   teams_serialized = 0;
8475   if (thr->th.th_teams_microtask) {
8476     kmp_team_t *team = thr->th.th_team;
8477     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8478     int ii = team->t.t_level;
8479     teams_serialized = team->t.t_serialized;
8480     int level = tlevel + 1;
8481     KMP_DEBUG_ASSERT(ii >= tlevel);
8482     while (ii > level) {
8483       for (teams_serialized = team->t.t_serialized;
8484            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8485       }
8486       if (team->t.t_serialized && (!teams_serialized)) {
8487         team = team->t.t_parent;
8488         continue;
8489       }
8490       if (ii > level) {
8491         team = team->t.t_parent;
8492         ii--;
8493       }
8494     }
8495     return team;
8496   }
8497   return NULL;
8498 }
8499
8500 int __kmp_aux_get_team_num() {
8501   int serialized;
8502   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8503   if (team) {
8504     if (serialized > 1) {
8505       return 0; // teams region is serialized ( 1 team of 1 thread ).
8506     } else {
8507       return team->t.t_master_tid;
8508     }
8509   }
8510   return 0;
8511 }
8512
8513 int __kmp_aux_get_num_teams() {
8514   int serialized;
8515   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8516   if (team) {
8517     if (serialized > 1) {
8518       return 1;
8519     } else {
8520       return team->t.t_parent->t.t_nproc;
8521     }
8522   }
8523   return 1;
8524 }
8525
8526 /* ------------------------------------------------------------------------ */
8527
8528 /*
8529  * Affinity Format Parser
8530  *
8531  * Field is in form of: %[[[0].]size]type
8532  * % and type are required (%% means print a literal '%')
8533  * type is either single char or long name surrounded by {},
8534  * e.g., N or {num_threads}
8535  * 0 => leading zeros
8536  * . => right justified when size is specified
8537  * by default output is left justified
8538  * size is the *minimum* field length
8539  * All other characters are printed as is
8540  *
8541  * Available field types:
8542  * L {thread_level}      - omp_get_level()
8543  * n {thread_num}        - omp_get_thread_num()
8544  * h {host}              - name of host machine
8545  * P {process_id}        - process id (integer)
8546  * T {thread_identifier} - native thread identifier (integer)
8547  * N {num_threads}       - omp_get_num_threads()
8548  * A {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
8549  * a {thread_affinity}   - comma separated list of integers or integer ranges
8550  *                         (values of affinity mask)
8551  *
8552  * Implementation-specific field types can be added
8553  * If a type is unknown, print "undefined"
8554  */
8555
8556 // Structure holding the short name, long name, and corresponding data type
8557 // for snprintf.  A table of these will represent the entire valid keyword
8558 // field types.
8559 typedef struct kmp_affinity_format_field_t {
8560   char short_name; // from spec e.g., L -> thread level
8561   const char *long_name; // from spec thread_level -> thread level
8562   char field_format; // data type for snprintf (typically 'd' or 's'
8563   // for integer or string)
8564 } kmp_affinity_format_field_t;
8565
8566 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8567 #if KMP_AFFINITY_SUPPORTED
8568     {'A', "thread_affinity", 's'},
8569 #endif
8570     {'t', "team_num", 'd'},
8571     {'T', "num_teams", 'd'},
8572     {'L', "nesting_level", 'd'},
8573     {'n', "thread_num", 'd'},
8574     {'N', "num_threads", 'd'},
8575     {'a', "ancestor_tnum", 'd'},
8576     {'H', "host", 's'},
8577     {'P', "process_id", 'd'},
8578     {'i', "native_thread_id", 'd'}};
8579
8580 // Return the number of characters it takes to hold field
8581 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8582                                             const char **ptr,
8583                                             kmp_str_buf_t *field_buffer) {
8584   int rc, format_index, field_value;
8585   const char *width_left, *width_right;
8586   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8587   static const int FORMAT_SIZE = 20;
8588   char format[FORMAT_SIZE] = {0};
8589   char absolute_short_name = 0;
8590
8591   KMP_DEBUG_ASSERT(gtid >= 0);
8592   KMP_DEBUG_ASSERT(th);
8593   KMP_DEBUG_ASSERT(**ptr == '%');
8594   KMP_DEBUG_ASSERT(field_buffer);
8595
8596   __kmp_str_buf_clear(field_buffer);
8597
8598   // Skip the initial %
8599   (*ptr)++;
8600
8601   // Check for %% first
8602   if (**ptr == '%') {
8603     __kmp_str_buf_cat(field_buffer, "%", 1);
8604     (*ptr)++; // skip over the second %
8605     return 1;
8606   }
8607
8608   // Parse field modifiers if they are present
8609   pad_zeros = false;
8610   if (**ptr == '0') {
8611     pad_zeros = true;
8612     (*ptr)++; // skip over 0
8613   }
8614   right_justify = false;
8615   if (**ptr == '.') {
8616     right_justify = true;
8617     (*ptr)++; // skip over .
8618   }
8619   // Parse width of field: [width_left, width_right)
8620   width_left = width_right = NULL;
8621   if (**ptr >= '0' && **ptr <= '9') {
8622     width_left = *ptr;
8623     SKIP_DIGITS(*ptr);
8624     width_right = *ptr;
8625   }
8626
8627   // Create the format for KMP_SNPRINTF based on flags parsed above
8628   format_index = 0;
8629   format[format_index++] = '%';
8630   if (!right_justify)
8631     format[format_index++] = '-';
8632   if (pad_zeros)
8633     format[format_index++] = '0';
8634   if (width_left && width_right) {
8635     int i = 0;
8636     // Only allow 8 digit number widths.
8637     // This also prevents overflowing format variable
8638     while (i < 8 && width_left < width_right) {
8639       format[format_index++] = *width_left;
8640       width_left++;
8641       i++;
8642     }
8643   }
8644
8645   // Parse a name (long or short)
8646   // Canonicalize the name into absolute_short_name
8647   found_valid_name = false;
8648   parse_long_name = (**ptr == '{');
8649   if (parse_long_name)
8650     (*ptr)++; // skip initial left brace
8651   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8652                              sizeof(__kmp_affinity_format_table[0]);
8653        ++i) {
8654     char short_name = __kmp_affinity_format_table[i].short_name;
8655     const char *long_name = __kmp_affinity_format_table[i].long_name;
8656     char field_format = __kmp_affinity_format_table[i].field_format;
8657     if (parse_long_name) {
8658       size_t length = KMP_STRLEN(long_name);
8659       if (strncmp(*ptr, long_name, length) == 0) {
8660         found_valid_name = true;
8661         (*ptr) += length; // skip the long name
8662       }
8663     } else if (**ptr == short_name) {
8664       found_valid_name = true;
8665       (*ptr)++; // skip the short name
8666     }
8667     if (found_valid_name) {
8668       format[format_index++] = field_format;
8669       format[format_index++] = '\0';
8670       absolute_short_name = short_name;
8671       break;
8672     }
8673   }
8674   if (parse_long_name) {
8675     if (**ptr != '}') {
8676       absolute_short_name = 0;
8677     } else {
8678       (*ptr)++; // skip over the right brace
8679     }
8680   }
8681
8682   // Attempt to fill the buffer with the requested
8683   // value using snprintf within __kmp_str_buf_print()
8684   switch (absolute_short_name) {
8685   case 't':
8686     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8687     break;
8688   case 'T':
8689     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8690     break;
8691   case 'L':
8692     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8693     break;
8694   case 'n':
8695     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8696     break;
8697   case 'H': {
8698     static const int BUFFER_SIZE = 256;
8699     char buf[BUFFER_SIZE];
8700     __kmp_expand_host_name(buf, BUFFER_SIZE);
8701     rc = __kmp_str_buf_print(field_buffer, format, buf);
8702   } break;
8703   case 'P':
8704     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8705     break;
8706   case 'i':
8707     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8708     break;
8709   case 'N':
8710     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8711     break;
8712   case 'a':
8713     field_value =
8714         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8715     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8716     break;
8717 #if KMP_AFFINITY_SUPPORTED
8718   case 'A': {
8719     kmp_str_buf_t buf;
8720     __kmp_str_buf_init(&buf);
8721     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8722     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8723     __kmp_str_buf_free(&buf);
8724   } break;
8725 #endif
8726   default:
8727     // According to spec, If an implementation does not have info for field
8728     // type, then "undefined" is printed
8729     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8730     // Skip the field
8731     if (parse_long_name) {
8732       SKIP_TOKEN(*ptr);
8733       if (**ptr == '}')
8734         (*ptr)++;
8735     } else {
8736       (*ptr)++;
8737     }
8738   }
8739
8740   KMP_ASSERT(format_index <= FORMAT_SIZE);
8741   return rc;
8742 }
8743
8744 /*
8745  * Return number of characters needed to hold the affinity string
8746  * (not including null byte character)
8747  * The resultant string is printed to buffer, which the caller can then
8748  * handle afterwards
8749  */
8750 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8751                                   kmp_str_buf_t *buffer) {
8752   const char *parse_ptr;
8753   size_t retval;
8754   const kmp_info_t *th;
8755   kmp_str_buf_t field;
8756
8757   KMP_DEBUG_ASSERT(buffer);
8758   KMP_DEBUG_ASSERT(gtid >= 0);
8759
8760   __kmp_str_buf_init(&field);
8761   __kmp_str_buf_clear(buffer);
8762
8763   th = __kmp_threads[gtid];
8764   retval = 0;
8765
8766   // If format is NULL or zero-length string, then we use
8767   // affinity-format-var ICV
8768   parse_ptr = format;
8769   if (parse_ptr == NULL || *parse_ptr == '\0') {
8770     parse_ptr = __kmp_affinity_format;
8771   }
8772   KMP_DEBUG_ASSERT(parse_ptr);
8773
8774   while (*parse_ptr != '\0') {
8775     // Parse a field
8776     if (*parse_ptr == '%') {
8777       // Put field in the buffer
8778       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8779       __kmp_str_buf_catbuf(buffer, &field);
8780       retval += rc;
8781     } else {
8782       // Put literal character in buffer
8783       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8784       retval++;
8785       parse_ptr++;
8786     }
8787   }
8788   __kmp_str_buf_free(&field);
8789   return retval;
8790 }
8791
8792 // Displays the affinity string to stdout
8793 void __kmp_aux_display_affinity(int gtid, const char *format) {
8794   kmp_str_buf_t buf;
8795   __kmp_str_buf_init(&buf);
8796   __kmp_aux_capture_affinity(gtid, format, &buf);
8797   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8798   __kmp_str_buf_free(&buf);
8799 }
8800
8801 /* ------------------------------------------------------------------------ */
8802 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8803   int blocktime = arg; /* argument is in microseconds */
8804 #if KMP_USE_MONITOR
8805   int bt_intervals;
8806 #endif
8807   kmp_int8 bt_set;
8808
8809   __kmp_save_internal_controls(thread);
8810
8811   /* Normalize and set blocktime for the teams */
8812   if (blocktime < KMP_MIN_BLOCKTIME)
8813     blocktime = KMP_MIN_BLOCKTIME;
8814   else if (blocktime > KMP_MAX_BLOCKTIME)
8815     blocktime = KMP_MAX_BLOCKTIME;
8816
8817   set__blocktime_team(thread->th.th_team, tid, blocktime);
8818   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8819
8820 #if KMP_USE_MONITOR
8821   /* Calculate and set blocktime intervals for the teams */
8822   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8823
8824   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8825   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8826 #endif
8827
8828   /* Set whether blocktime has been set to "TRUE" */
8829   bt_set = TRUE;
8830
8831   set__bt_set_team(thread->th.th_team, tid, bt_set);
8832   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8833 #if KMP_USE_MONITOR
8834   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8835                 "bt_intervals=%d, monitor_updates=%d\n",
8836                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8837                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8838                 __kmp_monitor_wakeups));
8839 #else
8840   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8841                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8842                 thread->th.th_team->t.t_id, tid, blocktime));
8843 #endif
8844 }
8845
8846 void __kmp_aux_set_defaults(char const *str, size_t len) {
8847   if (!__kmp_init_serial) {
8848     __kmp_serial_initialize();
8849   }
8850   __kmp_env_initialize(str);
8851
8852   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8853     __kmp_env_print();
8854   }
8855 } // __kmp_aux_set_defaults
8856
8857 /* ------------------------------------------------------------------------ */
8858 /* internal fast reduction routines */
8859
8860 PACKED_REDUCTION_METHOD_T
8861 __kmp_determine_reduction_method(
8862     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8863     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8864     kmp_critical_name *lck) {
8865
8866   // Default reduction method: critical construct ( lck != NULL, like in current
8867   // PAROPT )
8868   // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8869   // can be selected by RTL
8870   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8871   // can be selected by RTL
8872   // Finally, it's up to OpenMP RTL to make a decision on which method to select
8873   // among generated by PAROPT.
8874
8875   PACKED_REDUCTION_METHOD_T retval;
8876
8877   int team_size;
8878
8879   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8880
8881 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8882   (loc &&                                                                      \
8883    ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8884 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8885
8886   retval = critical_reduce_block;
8887
8888   // another choice of getting a team size (with 1 dynamic deference) is slower
8889   team_size = __kmp_get_team_num_threads(global_tid);
8890   if (team_size == 1) {
8891
8892     retval = empty_reduce_block;
8893
8894   } else {
8895
8896     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8897
8898 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8899     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 ||             \
8900     KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8901
8902 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8903     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||        \
8904     KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8905
8906     int teamsize_cutoff = 4;
8907
8908 #if KMP_MIC_SUPPORTED
8909     if (__kmp_mic_type != non_mic) {
8910       teamsize_cutoff = 8;
8911     }
8912 #endif
8913     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8914     if (tree_available) {
8915       if (team_size <= teamsize_cutoff) {
8916         if (atomic_available) {
8917           retval = atomic_reduce_block;
8918         }
8919       } else {
8920         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8921       }
8922     } else if (atomic_available) {
8923       retval = atomic_reduce_block;
8924     }
8925 #else
8926 #error "Unknown or unsupported OS"
8927 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8928        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||
8929        // KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8930
8931 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS ||       \
8932     KMP_ARCH_WASM || KMP_ARCH_PPC
8933
8934 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8935     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS ||       \
8936     KMP_OS_WASI || KMP_OS_AIX
8937
8938     // basic tuning
8939
8940     if (atomic_available) {
8941       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8942         retval = atomic_reduce_block;
8943       }
8944     } // otherwise: use critical section
8945
8946 #elif KMP_OS_DARWIN
8947
8948     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8949     if (atomic_available && (num_vars <= 3)) {
8950       retval = atomic_reduce_block;
8951     } else if (tree_available) {
8952       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8953           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8954         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8955       }
8956     } // otherwise: use critical section
8957
8958 #else
8959 #error "Unknown or unsupported OS"
8960 #endif
8961
8962 #else
8963 #error "Unknown or unsupported architecture"
8964 #endif
8965   }
8966
8967   // KMP_FORCE_REDUCTION
8968
8969   // If the team is serialized (team_size == 1), ignore the forced reduction
8970   // method and stay with the unsynchronized method (empty_reduce_block)
8971   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8972       team_size != 1) {
8973
8974     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8975
8976     int atomic_available, tree_available;
8977
8978     switch ((forced_retval = __kmp_force_reduction_method)) {
8979     case critical_reduce_block:
8980       KMP_ASSERT(lck); // lck should be != 0
8981       break;
8982
8983     case atomic_reduce_block:
8984       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8985       if (!atomic_available) {
8986         KMP_WARNING(RedMethodNotSupported, "atomic");
8987         forced_retval = critical_reduce_block;
8988       }
8989       break;
8990
8991     case tree_reduce_block:
8992       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8993       if (!tree_available) {
8994         KMP_WARNING(RedMethodNotSupported, "tree");
8995         forced_retval = critical_reduce_block;
8996       } else {
8997 #if KMP_FAST_REDUCTION_BARRIER
8998         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8999 #endif
9000       }
9001       break;
9002
9003     default:
9004       KMP_ASSERT(0); // "unsupported method specified"
9005     }
9006
9007     retval = forced_retval;
9008   }
9009
9010   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9011
9012 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
9013 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9014
9015   return (retval);
9016 }
9017 // this function is for testing set/get/determine reduce method
9018 kmp_int32 __kmp_get_reduce_method(void) {
9019   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9020 }
9021
9022 // Soft pause sets up threads to ignore blocktime and just go to sleep.
9023 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
9024 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9025
9026 // Hard pause shuts down the runtime completely.  Resume happens naturally when
9027 // OpenMP is used subsequently.
9028 void __kmp_hard_pause() {
9029   __kmp_pause_status = kmp_hard_paused;
9030   __kmp_internal_end_thread(-1);
9031 }
9032
9033 // Soft resume sets __kmp_pause_status, and wakes up all threads.
9034 void __kmp_resume_if_soft_paused() {
9035   if (__kmp_pause_status == kmp_soft_paused) {
9036     __kmp_pause_status = kmp_not_paused;
9037
9038     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9039       kmp_info_t *thread = __kmp_threads[gtid];
9040       if (thread) { // Wake it if sleeping
9041         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9042                          thread);
9043         if (fl.is_sleeping())
9044           fl.resume(gtid);
9045         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9046           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9047         } else { // thread holds the lock and may sleep soon
9048           do { // until either the thread sleeps, or we can get the lock
9049             if (fl.is_sleeping()) {
9050               fl.resume(gtid);
9051               break;
9052             } else if (__kmp_try_suspend_mx(thread)) {
9053               __kmp_unlock_suspend_mx(thread);
9054               break;
9055             }
9056           } while (1);
9057         }
9058       }
9059     }
9060   }
9061 }
9062
9063 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9064 // TODO: add warning messages
9065 int __kmp_pause_resource(kmp_pause_status_t level) {
9066   if (level == kmp_not_paused) { // requesting resume
9067     if (__kmp_pause_status == kmp_not_paused) {
9068       // error message about runtime not being paused, so can't resume
9069       return 1;
9070     } else {
9071       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9072                        __kmp_pause_status == kmp_hard_paused);
9073       __kmp_pause_status = kmp_not_paused;
9074       return 0;
9075     }
9076   } else if (level == kmp_soft_paused) { // requesting soft pause
9077     if (__kmp_pause_status != kmp_not_paused) {
9078       // error message about already being paused
9079       return 1;
9080     } else {
9081       __kmp_soft_pause();
9082       return 0;
9083     }
9084   } else if (level == kmp_hard_paused) { // requesting hard pause
9085     if (__kmp_pause_status != kmp_not_paused) {
9086       // error message about already being paused
9087       return 1;
9088     } else {
9089       __kmp_hard_pause();
9090       return 0;
9091     }
9092   } else {
9093     // error message about invalid level
9094     return 1;
9095   }
9096 }
9097
9098 void __kmp_omp_display_env(int verbose) {
9099   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9100   if (__kmp_init_serial == 0)
9101     __kmp_do_serial_initialize();
9102   __kmp_display_env_impl(!verbose, verbose);
9103   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9104 }
9105
9106 // The team size is changing, so distributed barrier must be modified
9107 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9108                                int new_nthreads) {
9109   KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9110                    bp_dist_bar);
9111   kmp_info_t **other_threads = team->t.t_threads;
9112
9113   // We want all the workers to stop waiting on the barrier while we adjust the
9114   // size of the team.
9115   for (int f = 1; f < old_nthreads; ++f) {
9116     KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9117     // Ignore threads that are already inactive or not present in the team
9118     if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9119       // teams construct causes thread_limit to get passed in, and some of
9120       // those could be inactive; just ignore them
9121       continue;
9122     }
9123     // If thread is transitioning still to in_use state, wait for it
9124     if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9125       while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9126         KMP_CPU_PAUSE();
9127     }
9128     // The thread should be in_use now
9129     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9130     // Transition to unused state
9131     team->t.t_threads[f]->th.th_used_in_team.store(2);
9132     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9133   }
9134   // Release all the workers
9135   team->t.b->go_release();
9136
9137   KMP_MFENCE();
9138
9139   // Workers should see transition status 2 and move to 0; but may need to be
9140   // woken up first
9141   int count = old_nthreads - 1;
9142   while (count > 0) {
9143     count = old_nthreads - 1;
9144     for (int f = 1; f < old_nthreads; ++f) {
9145       if (other_threads[f]->th.th_used_in_team.load() != 0) {
9146         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9147           kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9148               void *, other_threads[f]->th.th_sleep_loc);
9149           __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9150         }
9151       } else {
9152         KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9153         count--;
9154       }
9155     }
9156   }
9157   // Now update the barrier size
9158   team->t.b->update_num_threads(new_nthreads);
9159   team->t.b->go_reset();
9160 }
9161
9162 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9163   // Add the threads back to the team
9164   KMP_DEBUG_ASSERT(team);
9165   // Threads were paused and pointed at th_used_in_team temporarily during a
9166   // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9167   // the thread that it should transition itself back into the team. Then, if
9168   // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9169   // to wake it up.
9170   for (int f = 1; f < new_nthreads; ++f) {
9171     KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9172     KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9173                                 3);
9174     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9175       __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9176                       (kmp_flag_32<false, false> *)NULL);
9177     }
9178   }
9179   // The threads should be transitioning to the team; when they are done, they
9180   // should have set th_used_in_team to 1. This loop forces master to wait until
9181   // all threads have moved into the team and are waiting in the barrier.
9182   int count = new_nthreads - 1;
9183   while (count > 0) {
9184     count = new_nthreads - 1;
9185     for (int f = 1; f < new_nthreads; ++f) {
9186       if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9187         count--;
9188       }
9189     }
9190   }
9191 }
9192
9193 // Globals and functions for hidden helper task
9194 kmp_info_t **__kmp_hidden_helper_threads;
9195 kmp_info_t *__kmp_hidden_helper_main_thread;
9196 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9197 #if KMP_OS_LINUX
9198 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9199 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9200 #else
9201 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9202 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9203 #endif
9204
9205 namespace {
9206 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9207
9208 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9209   // This is an explicit synchronization on all hidden helper threads in case
9210   // that when a regular thread pushes a hidden helper task to one hidden
9211   // helper thread, the thread has not been awaken once since they're released
9212   // by the main thread after creating the team.
9213   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9214   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9215          __kmp_hidden_helper_threads_num)
9216     ;
9217
9218   // If main thread, then wait for signal
9219   if (__kmpc_master(nullptr, *gtid)) {
9220     // First, unset the initial state and release the initial thread
9221     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9222     __kmp_hidden_helper_initz_release();
9223     __kmp_hidden_helper_main_thread_wait();
9224     // Now wake up all worker threads
9225     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9226       __kmp_hidden_helper_worker_thread_signal();
9227     }
9228   }
9229 }
9230 } // namespace
9231
9232 void __kmp_hidden_helper_threads_initz_routine() {
9233   // Create a new root for hidden helper team/threads
9234   const int gtid = __kmp_register_root(TRUE);
9235   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9236   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9237   __kmp_hidden_helper_main_thread->th.th_set_nproc =
9238       __kmp_hidden_helper_threads_num;
9239
9240   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9241
9242   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9243
9244   // Set the initialization flag to FALSE
9245   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9246
9247   __kmp_hidden_helper_threads_deinitz_release();
9248 }
9249
9250 /* Nesting Mode:
9251    Set via KMP_NESTING_MODE, which takes an integer.
9252    Note: we skip duplicate topology levels, and skip levels with only
9253       one entity.
9254    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9255    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9256       in the topology, and initializes the number of threads at each of those
9257       levels to the number of entities at each level, respectively, below the
9258       entity at the parent level.
9259    KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9260       but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9261       the user to turn nesting on explicitly. This is an even more experimental
9262       option to this experimental feature, and may change or go away in the
9263       future.
9264 */
9265
9266 // Allocate space to store nesting levels
9267 void __kmp_init_nesting_mode() {
9268   int levels = KMP_HW_LAST;
9269   __kmp_nesting_mode_nlevels = levels;
9270   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9271   for (int i = 0; i < levels; ++i)
9272     __kmp_nesting_nth_level[i] = 0;
9273   if (__kmp_nested_nth.size < levels) {
9274     __kmp_nested_nth.nth =
9275         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9276     __kmp_nested_nth.size = levels;
9277   }
9278 }
9279
9280 // Set # threads for top levels of nesting; must be called after topology set
9281 void __kmp_set_nesting_mode_threads() {
9282   kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9283
9284   if (__kmp_nesting_mode == 1)
9285     __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9286   else if (__kmp_nesting_mode > 1)
9287     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9288
9289   if (__kmp_topology) { // use topology info
9290     int loc, hw_level;
9291     for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9292                                 loc < __kmp_nesting_mode_nlevels;
9293          loc++, hw_level++) {
9294       __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9295       if (__kmp_nesting_nth_level[loc] == 1)
9296         loc--;
9297     }
9298     // Make sure all cores are used
9299     if (__kmp_nesting_mode > 1 && loc > 1) {
9300       int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9301       int num_cores = __kmp_topology->get_count(core_level);
9302       int upper_levels = 1;
9303       for (int level = 0; level < loc - 1; ++level)
9304         upper_levels *= __kmp_nesting_nth_level[level];
9305       if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9306         __kmp_nesting_nth_level[loc - 1] =
9307             num_cores / __kmp_nesting_nth_level[loc - 2];
9308     }
9309     __kmp_nesting_mode_nlevels = loc;
9310     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9311   } else { // no topology info available; provide a reasonable guesstimation
9312     if (__kmp_avail_proc >= 4) {
9313       __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9314       __kmp_nesting_nth_level[1] = 2;
9315       __kmp_nesting_mode_nlevels = 2;
9316     } else {
9317       __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9318       __kmp_nesting_mode_nlevels = 1;
9319     }
9320     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9321   }
9322   for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9323     __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9324   }
9325   set__nproc(thread, __kmp_nesting_nth_level[0]);
9326   if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9327     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9328   if (get__max_active_levels(thread) > 1) {
9329     // if max levels was set, set nesting mode levels to same
9330     __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9331   }
9332   if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9333     set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9334 }
9335
9336 // Empty symbols to export (see exports_so.txt) when feature is disabled
9337 extern "C" {
9338 #if !KMP_STATS_ENABLED
9339 void __kmp_reset_stats() {}
9340 #endif
9341 #if !USE_DEBUGGER
9342 int __kmp_omp_debug_struct_info = FALSE;
9343 int __kmp_debugging = FALSE;
9344 #endif
9345 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9346 void __kmp_itt_fini_ittlib() {}
9347 void __kmp_itt_init_ittlib() {}
9348 #endif
9349 }
9350
9351 // end of file