openmp/runtime/src/kmp_dispatch.cpp

   1 /*
   2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
   3  */
   4
   5 //===----------------------------------------------------------------------===//
   6 //
   7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   8 // See https://llvm.org/LICENSE.txt for license information.
   9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 /* Dynamic scheduling initialization and dispatch.
  14  *
  15  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
  16  *       it may change values between parallel regions.  __kmp_max_nth
  17  *       is the largest value __kmp_nth may take, 1 is the smallest.
  18  */
  19
  20 #include "kmp.h"
  21 #include "kmp_error.h"
  22 #include "kmp_i18n.h"
  23 #include "kmp_itt.h"
  24 #include "kmp_stats.h"
  25 #include "kmp_str.h"
  26 #if KMP_USE_X87CONTROL
  27 #include <float.h>
  28 #endif
  29 #include "kmp_lock.h"
  30 #include "kmp_dispatch.h"
  31 #if KMP_USE_HIER_SCHED
  32 #include "kmp_dispatch_hier.h"
  33 #endif
  34
  35 #if OMPT_SUPPORT
  36 #include "ompt-specific.h"
  37 #endif
  38
  39 /* ------------------------------------------------------------------------ */
  40 /* ------------------------------------------------------------------------ */
  41
  42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  43   kmp_info_t *th;
  44
  45   KMP_DEBUG_ASSERT(gtid_ref);
  46
  47   if (__kmp_env_consistency_check) {
  48     th = __kmp_threads[*gtid_ref];
  49     if (th->th.th_root->r.r_active &&
  50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
  51 #if KMP_USE_DYNAMIC_LOCK
  52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
  53 #else
  54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
  55 #endif
  56     }
  57   }
  58 }
  59
  60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  61   kmp_info_t *th;
  62
  63   if (__kmp_env_consistency_check) {
  64     th = __kmp_threads[*gtid_ref];
  65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
  66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
  67     }
  68   }
  69 }
  70
  71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
  72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
  73                                          bool use_hier = false) {
  74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
  75   // Nonmonotonic as default for dynamic schedule when no modifier is specified
  76   int monotonicity = SCHEDULE_NONMONOTONIC;
  77
  78   // Let default be monotonic for executables
  79   // compiled with OpenMP* 4.5 or less compilers
  80   if (loc != NULL && loc->get_openmp_version() < 50)
  81     monotonicity = SCHEDULE_MONOTONIC;
  82
  83   if (use_hier || __kmp_force_monotonic)
  84     monotonicity = SCHEDULE_MONOTONIC;
  85   else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
  86     monotonicity = SCHEDULE_NONMONOTONIC;
  87   else if (SCHEDULE_HAS_MONOTONIC(schedule))
  88     monotonicity = SCHEDULE_MONOTONIC;
  89
  90   return monotonicity;
  91 }
  92
  93 #if KMP_STATIC_STEAL_ENABLED
  94 enum { // values for steal_flag (possible states of private per-loop buffer)
  95   UNUSED = 0,
  96   CLAIMED = 1, // owner thread started initialization
  97   READY = 2, // available for stealing
  98   THIEF = 3 // finished by owner, or claimed by thief
  99   // possible state changes:
 100   // 0 -> 1 owner only, sync
 101   // 0 -> 3 thief only, sync
 102   // 1 -> 2 owner only, async
 103   // 2 -> 3 owner only, async
 104   // 3 -> 2 owner only, async
 105   // 3 -> 0 last thread finishing the loop, async
 106 };
 107 #endif
 108
 109 // Initialize a dispatch_private_info_template<T> buffer for a particular
 110 // type of schedule,chunk.  The loop description is found in lb (lower bound),
 111 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
 112 // to the scheduling (often the number of threads in a team, but not always if
 113 // hierarchical scheduling is used).  tid is the id of the thread calling
 114 // the function within the group of nproc threads.  It will have a value
 115 // between 0 and nproc - 1.  This is often just the thread id within a team, but
 116 // is not necessarily the case when using hierarchical scheduling.
 117 // loc is the source file location of the corresponding loop
 118 // gtid is the global thread id
 119 template <typename T>
 120 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
 121                                    dispatch_private_info_template<T> *pr,
 122                                    enum sched_type schedule, T lb, T ub,
 123                                    typename traits_t<T>::signed_t st,
 124 #if USE_ITT_BUILD
 125                                    kmp_uint64 *cur_chunk,
 126 #endif
 127                                    typename traits_t<T>::signed_t chunk,
 128                                    T nproc, T tid) {
 129   typedef typename traits_t<T>::unsigned_t UT;
 130   typedef typename traits_t<T>::floating_t DBL;
 131
 132   int active;
 133   T tc;
 134   kmp_info_t *th;
 135   kmp_team_t *team;
 136   int monotonicity;
 137   bool use_hier;
 138
 139 #ifdef KMP_DEBUG
 140   typedef typename traits_t<T>::signed_t ST;
 141   {
 142     char *buff;
 143     // create format specifiers before the debug output
 144     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
 145                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
 146                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
 147                             traits_t<T>::spec, traits_t<T>::spec,
 148                             traits_t<ST>::spec, traits_t<ST>::spec,
 149                             traits_t<T>::spec, traits_t<T>::spec);
 150     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
 151     __kmp_str_free(&buff);
 152   }
 153 #endif
 154   /* setup data */
 155   th = __kmp_threads[gtid];
 156   team = th->th.th_team;
 157   active = !team->t.t_serialized;
 158
 159 #if USE_ITT_BUILD
 160   int itt_need_metadata_reporting =
 161       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
 162       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
 163       team->t.t_active_level == 1;
 164 #endif
 165
 166 #if KMP_USE_HIER_SCHED
 167   use_hier = pr->flags.use_hier;
 168 #else
 169   use_hier = false;
 170 #endif
 171
 172   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
 173   monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
 174   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
 175
 176   /* Pick up the nomerge/ordered bits from the scheduling type */
 177   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
 178     pr->flags.nomerge = TRUE;
 179     schedule =
 180         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
 181   } else {
 182     pr->flags.nomerge = FALSE;
 183   }
 184   pr->type_size = traits_t<T>::type_size; // remember the size of variables
 185   if (kmp_ord_lower & schedule) {
 186     pr->flags.ordered = TRUE;
 187     schedule =
 188         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
 189   } else {
 190     pr->flags.ordered = FALSE;
 191   }
 192   // Ordered overrides nonmonotonic
 193   if (pr->flags.ordered) {
 194     monotonicity = SCHEDULE_MONOTONIC;
 195   }
 196
 197   if (schedule == kmp_sch_static) {
 198     schedule = __kmp_static;
 199   } else {
 200     if (schedule == kmp_sch_runtime) {
 201       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
 202       // not specified)
 203       schedule = team->t.t_sched.r_sched_type;
 204       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
 205       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
 206       if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
 207         monotonicity = SCHEDULE_MONOTONIC;
 208       // Detail the schedule if needed (global controls are differentiated
 209       // appropriately)
 210       if (schedule == kmp_sch_guided_chunked) {
 211         schedule = __kmp_guided;
 212       } else if (schedule == kmp_sch_static) {
 213         schedule = __kmp_static;
 214       }
 215       // Use the chunk size specified by OMP_SCHEDULE (or default if not
 216       // specified)
 217       chunk = team->t.t_sched.chunk;
 218 #if USE_ITT_BUILD
 219       if (cur_chunk)
 220         *cur_chunk = chunk;
 221 #endif
 222 #ifdef KMP_DEBUG
 223       {
 224         char *buff;
 225         // create format specifiers before the debug output
 226         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
 227                                 "schedule:%%d chunk:%%%s\n",
 228                                 traits_t<ST>::spec);
 229         KD_TRACE(10, (buff, gtid, schedule, chunk));
 230         __kmp_str_free(&buff);
 231       }
 232 #endif
 233     } else {
 234       if (schedule == kmp_sch_guided_chunked) {
 235         schedule = __kmp_guided;
 236       }
 237       if (chunk <= 0) {
 238         chunk = KMP_DEFAULT_CHUNK;
 239       }
 240     }
 241
 242     if (schedule == kmp_sch_auto) {
 243       // mapping and differentiation: in the __kmp_do_serial_initialize()
 244       schedule = __kmp_auto;
 245 #ifdef KMP_DEBUG
 246       {
 247         char *buff;
 248         // create format specifiers before the debug output
 249         buff = __kmp_str_format(
 250             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
 251             "schedule:%%d chunk:%%%s\n",
 252             traits_t<ST>::spec);
 253         KD_TRACE(10, (buff, gtid, schedule, chunk));
 254         __kmp_str_free(&buff);
 255       }
 256 #endif
 257     }
 258 #if KMP_STATIC_STEAL_ENABLED
 259     // map nonmonotonic:dynamic to static steal
 260     if (schedule == kmp_sch_dynamic_chunked) {
 261       if (monotonicity == SCHEDULE_NONMONOTONIC)
 262         schedule = kmp_sch_static_steal;
 263     }
 264 #endif
 265     /* guided analytical not safe for too many threads */
 266     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
 267       schedule = kmp_sch_guided_iterative_chunked;
 268       KMP_WARNING(DispatchManyThreads);
 269     }
 270     if (schedule == kmp_sch_runtime_simd) {
 271       // compiler provides simd_width in the chunk parameter
 272       schedule = team->t.t_sched.r_sched_type;
 273       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
 274       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
 275       // Detail the schedule if needed (global controls are differentiated
 276       // appropriately)
 277       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
 278           schedule == __kmp_static) {
 279         schedule = kmp_sch_static_balanced_chunked;
 280       } else {
 281         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
 282           schedule = kmp_sch_guided_simd;
 283         }
 284         chunk = team->t.t_sched.chunk * chunk;
 285       }
 286 #if USE_ITT_BUILD
 287       if (cur_chunk)
 288         *cur_chunk = chunk;
 289 #endif
 290 #ifdef KMP_DEBUG
 291       {
 292         char *buff;
 293         // create format specifiers before the debug output
 294         buff = __kmp_str_format(
 295             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
 296             " chunk:%%%s\n",
 297             traits_t<ST>::spec);
 298         KD_TRACE(10, (buff, gtid, schedule, chunk));
 299         __kmp_str_free(&buff);
 300       }
 301 #endif
 302     }
 303     pr->u.p.parm1 = chunk;
 304   }
 305   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
 306               "unknown scheduling type");
 307
 308   pr->u.p.count = 0;
 309
 310   if (__kmp_env_consistency_check) {
 311     if (st == 0) {
 312       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
 313                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
 314     }
 315   }
 316   // compute trip count
 317   if (st == 1) { // most common case
 318     if (ub >= lb) {
 319       tc = ub - lb + 1;
 320     } else { // ub < lb
 321       tc = 0; // zero-trip
 322     }
 323   } else if (st < 0) {
 324     if (lb >= ub) {
 325       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
 326       // where the division needs to be unsigned regardless of the result type
 327       tc = (UT)(lb - ub) / (-st) + 1;
 328     } else { // lb < ub
 329       tc = 0; // zero-trip
 330     }
 331   } else { // st > 0
 332     if (ub >= lb) {
 333       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
 334       // where the division needs to be unsigned regardless of the result type
 335       tc = (UT)(ub - lb) / st + 1;
 336     } else { // ub < lb
 337       tc = 0; // zero-trip
 338     }
 339   }
 340
 341 #if KMP_STATS_ENABLED
 342   if (KMP_MASTER_GTID(gtid)) {
 343     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
 344   }
 345 #endif
 346
 347   pr->u.p.lb = lb;
 348   pr->u.p.ub = ub;
 349   pr->u.p.st = st;
 350   pr->u.p.tc = tc;
 351
 352 #if KMP_OS_WINDOWS
 353   pr->u.p.last_upper = ub + st;
 354 #endif /* KMP_OS_WINDOWS */
 355
 356   /* NOTE: only the active parallel region(s) has active ordered sections */
 357
 358   if (active) {
 359     if (pr->flags.ordered) {
 360       pr->ordered_bumped = 0;
 361       pr->u.p.ordered_lower = 1;
 362       pr->u.p.ordered_upper = 0;
 363     }
 364   }
 365
 366   switch (schedule) {
 367 #if KMP_STATIC_STEAL_ENABLED
 368   case kmp_sch_static_steal: {
 369     T ntc, init;
 370
 371     KD_TRACE(100,
 372              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
 373               gtid));
 374
 375     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
 376     if (nproc > 1 && ntc >= nproc) {
 377       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
 378       T id = tid;
 379       T small_chunk, extras;
 380       kmp_uint32 old = UNUSED;
 381       int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
 382       if (traits_t<T>::type_size > 4) {
 383         // AC: TODO: check if 16-byte CAS available and use it to
 384         // improve performance (probably wait for explicit request
 385         // before spending time on this).
 386         // For now use dynamically allocated per-private-buffer lock,
 387         // free memory in __kmp_dispatch_next when status==0.
 388         pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
 389         __kmp_init_lock(pr->u.p.steal_lock);
 390       }
 391       small_chunk = ntc / nproc;
 392       extras = ntc % nproc;
 393
 394       init = id * small_chunk + (id < extras ? id : extras);
 395       pr->u.p.count = init;
 396       if (claimed) { // are we succeeded in claiming own buffer?
 397         pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
 398         // Other threads will inspect steal_flag when searching for a victim.
 399         // READY means other threads may steal from this thread from now on.
 400         KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
 401       } else {
 402         // other thread has stolen whole our range
 403         KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
 404         pr->u.p.ub = init; // mark there is no iterations to work on
 405       }
 406       pr->u.p.parm2 = ntc; // save number of chunks
 407       // parm3 is the number of times to attempt stealing which is
 408       // nproc (just a heuristics, could be optimized later on).
 409       pr->u.p.parm3 = nproc;
 410       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
 411       break;
 412     } else {
 413       /* too few chunks: switching to kmp_sch_dynamic_chunked */
 414       schedule = kmp_sch_dynamic_chunked;
 415       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
 416                      "kmp_sch_dynamic_chunked\n",
 417                      gtid));
 418       goto dynamic_init;
 419       break;
 420     } // if
 421   } // case
 422 #endif
 423   case kmp_sch_static_balanced: {
 424     T init, limit;
 425
 426     KD_TRACE(
 427         100,
 428         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
 429          gtid));
 430
 431     if (nproc > 1) {
 432       T id = tid;
 433
 434       if (tc < nproc) {
 435         if (id < tc) {
 436           init = id;
 437           limit = id;
 438           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
 439         } else {
 440           pr->u.p.count = 1; /* means no more chunks to execute */
 441           pr->u.p.parm1 = FALSE;
 442           break;
 443         }
 444       } else {
 445         T small_chunk = tc / nproc;
 446         T extras = tc % nproc;
 447         init = id * small_chunk + (id < extras ? id : extras);
 448         limit = init + small_chunk - (id < extras ? 0 : 1);
 449         pr->u.p.parm1 = (id == nproc - 1);
 450       }
 451     } else {
 452       if (tc > 0) {
 453         init = 0;
 454         limit = tc - 1;
 455         pr->u.p.parm1 = TRUE;
 456       } else {
 457         // zero trip count
 458         pr->u.p.count = 1; /* means no more chunks to execute */
 459         pr->u.p.parm1 = FALSE;
 460         break;
 461       }
 462     }
 463 #if USE_ITT_BUILD
 464     // Calculate chunk for metadata report
 465     if (itt_need_metadata_reporting)
 466       if (cur_chunk)
 467         *cur_chunk = limit - init + 1;
 468 #endif
 469     if (st == 1) {
 470       pr->u.p.lb = lb + init;
 471       pr->u.p.ub = lb + limit;
 472     } else {
 473       // calculated upper bound, "ub" is user-defined upper bound
 474       T ub_tmp = lb + limit * st;
 475       pr->u.p.lb = lb + init * st;
 476       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
 477       // it exactly
 478       if (st > 0) {
 479         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
 480       } else {
 481         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
 482       }
 483     }
 484     if (pr->flags.ordered) {
 485       pr->u.p.ordered_lower = init;
 486       pr->u.p.ordered_upper = limit;
 487     }
 488     break;
 489   } // case
 490   case kmp_sch_static_balanced_chunked: {
 491     // similar to balanced, but chunk adjusted to multiple of simd width
 492     T nth = nproc;
 493     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
 494                    " -> falling-through to static_greedy\n",
 495                    gtid));
 496     schedule = kmp_sch_static_greedy;
 497     if (nth > 1)
 498       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
 499     else
 500       pr->u.p.parm1 = tc;
 501     break;
 502   } // case
 503   case kmp_sch_guided_simd:
 504   case kmp_sch_guided_iterative_chunked: {
 505     KD_TRACE(
 506         100,
 507         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
 508          " case\n",
 509          gtid));
 510
 511     if (nproc > 1) {
 512       if ((2L * chunk + 1) * nproc >= tc) {
 513         /* chunk size too large, switch to dynamic */
 514         schedule = kmp_sch_dynamic_chunked;
 515         goto dynamic_init;
 516       } else {
 517         // when remaining iters become less than parm2 - switch to dynamic
 518         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
 519         *(double *)&pr->u.p.parm3 =
 520             guided_flt_param / (double)nproc; // may occupy parm3 and parm4
 521       }
 522     } else {
 523       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
 524                      "kmp_sch_static_greedy\n",
 525                      gtid));
 526       schedule = kmp_sch_static_greedy;
 527       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
 528       KD_TRACE(
 529           100,
 530           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
 531            gtid));
 532       pr->u.p.parm1 = tc;
 533     } // if
 534   } // case
 535   break;
 536   case kmp_sch_guided_analytical_chunked: {
 537     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
 538                    "kmp_sch_guided_analytical_chunked case\n",
 539                    gtid));
 540
 541     if (nproc > 1) {
 542       if ((2L * chunk + 1) * nproc >= tc) {
 543         /* chunk size too large, switch to dynamic */
 544         schedule = kmp_sch_dynamic_chunked;
 545         goto dynamic_init;
 546       } else {
 547         /* commonly used term: (2 nproc - 1)/(2 nproc) */
 548         DBL x;
 549
 550 #if KMP_USE_X87CONTROL
 551         /* Linux* OS already has 64-bit computation by default for long double,
 552            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
 553            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
 554            instead of the default 53-bit. Even though long double doesn't work
 555            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
 556            expected to impact the correctness of the algorithm, but this has not
 557            been mathematically proven. */
 558         // save original FPCW and set precision to 64-bit, as
 559         // Windows* OS on IA-32 architecture defaults to 53-bit
 560         unsigned int oldFpcw = _control87(0, 0);
 561         _control87(_PC_64, _MCW_PC); // 0,0x30000
 562 #endif
 563         /* value used for comparison in solver for cross-over point */
 564         KMP_ASSERT(tc > 0);
 565         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
 566
 567         /* crossover point--chunk indexes equal to or greater than
 568            this point switch to dynamic-style scheduling */
 569         UT cross;
 570
 571         /* commonly used term: (2 nproc - 1)/(2 nproc) */
 572         x = 1.0 - 0.5 / (double)nproc;
 573
 574 #ifdef KMP_DEBUG
 575         { // test natural alignment
 576           struct _test_a {
 577             char a;
 578             union {
 579               char b;
 580               DBL d;
 581             };
 582           } t;
 583           ptrdiff_t natural_alignment =
 584               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
 585           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
 586           // long)natural_alignment );
 587           KMP_DEBUG_ASSERT(
 588               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
 589         }
 590 #endif // KMP_DEBUG
 591
 592         /* save the term in thread private dispatch structure */
 593         *(DBL *)&pr->u.p.parm3 = x;
 594
 595         /* solve for the crossover point to the nearest integer i for which C_i
 596            <= chunk */
 597         {
 598           UT left, right, mid;
 599           long double p;
 600
 601           /* estimate initial upper and lower bound */
 602
 603           /* doesn't matter what value right is as long as it is positive, but
 604              it affects performance of the solver */
 605           right = 229;
 606           p = __kmp_pow<UT>(x, right);
 607           if (p > target) {
 608             do {
 609               p *= p;
 610               right <<= 1;
 611             } while (p > target && right < (1 << 27));
 612             /* lower bound is previous (failed) estimate of upper bound */
 613             left = right >> 1;
 614           } else {
 615             left = 0;
 616           }
 617
 618           /* bisection root-finding method */
 619           while (left + 1 < right) {
 620             mid = (left + right) / 2;
 621             if (__kmp_pow<UT>(x, mid) > target) {
 622               left = mid;
 623             } else {
 624               right = mid;
 625             }
 626           } // while
 627           cross = right;
 628         }
 629         /* assert sanity of computed crossover point */
 630         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
 631                    __kmp_pow<UT>(x, cross) <= target);
 632
 633         /* save the crossover point in thread private dispatch structure */
 634         pr->u.p.parm2 = cross;
 635
 636 // C75803
 637 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
 638 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
 639 #else
 640 #define GUIDED_ANALYTICAL_WORKAROUND (x)
 641 #endif
 642         /* dynamic-style scheduling offset */
 643         pr->u.p.count = tc -
 644                         __kmp_dispatch_guided_remaining(
 645                             tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
 646                         cross * chunk;
 647 #if KMP_USE_X87CONTROL
 648         // restore FPCW
 649         _control87(oldFpcw, _MCW_PC);
 650 #endif
 651       } // if
 652     } else {
 653       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
 654                      "kmp_sch_static_greedy\n",
 655                      gtid));
 656       schedule = kmp_sch_static_greedy;
 657       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
 658       pr->u.p.parm1 = tc;
 659     } // if
 660   } // case
 661   break;
 662   case kmp_sch_static_greedy:
 663     KD_TRACE(
 664         100,
 665         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
 666          gtid));
 667     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
 668     break;
 669   case kmp_sch_static_chunked:
 670   case kmp_sch_dynamic_chunked:
 671   dynamic_init:
 672     if (tc == 0)
 673       break;
 674     if (pr->u.p.parm1 <= 0)
 675       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
 676     else if (pr->u.p.parm1 > tc)
 677       pr->u.p.parm1 = tc;
 678     // Store the total number of chunks to prevent integer overflow during
 679     // bounds calculations in the get next chunk routine.
 680     pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
 681     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
 682                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
 683                    gtid));
 684     break;
 685   case kmp_sch_trapezoidal: {
 686     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
 687
 688     T parm1, parm2, parm3, parm4;
 689     KD_TRACE(100,
 690              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
 691               gtid));
 692
 693     parm1 = chunk;
 694
 695     /* F : size of the first cycle */
 696     parm2 = (tc / (2 * nproc));
 697
 698     if (parm2 < 1) {
 699       parm2 = 1;
 700     }
 701
 702     /* L : size of the last cycle.  Make sure the last cycle is not larger
 703        than the first cycle. */
 704     if (parm1 < 1) {
 705       parm1 = 1;
 706     } else if (parm1 > parm2) {
 707       parm1 = parm2;
 708     }
 709
 710     /* N : number of cycles */
 711     parm3 = (parm2 + parm1);
 712     parm3 = (2 * tc + parm3 - 1) / parm3;
 713
 714     if (parm3 < 2) {
 715       parm3 = 2;
 716     }
 717
 718     /* sigma : decreasing incr of the trapezoid */
 719     parm4 = (parm3 - 1);
 720     parm4 = (parm2 - parm1) / parm4;
 721
 722     // pointless check, because parm4 >= 0 always
 723     // if ( parm4 < 0 ) {
 724     //    parm4 = 0;
 725     //}
 726
 727     pr->u.p.parm1 = parm1;
 728     pr->u.p.parm2 = parm2;
 729     pr->u.p.parm3 = parm3;
 730     pr->u.p.parm4 = parm4;
 731   } // case
 732   break;
 733
 734   default: {
 735     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
 736                 KMP_HNT(GetNewerLibrary), // Hint
 737                 __kmp_msg_null // Variadic argument list terminator
 738     );
 739   } break;
 740   } // switch
 741   pr->schedule = schedule;
 742 }
 743
 744 #if KMP_USE_HIER_SCHED
 745 template <typename T>
 746 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
 747                                              typename traits_t<T>::signed_t st);
 748 template <>
 749 inline void
 750 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
 751                                             kmp_int32 ub, kmp_int32 st) {
 752   __kmp_dispatch_init_hierarchy<kmp_int32>(
 753       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
 754       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
 755 }
 756 template <>
 757 inline void
 758 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
 759                                              kmp_uint32 ub, kmp_int32 st) {
 760   __kmp_dispatch_init_hierarchy<kmp_uint32>(
 761       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
 762       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
 763 }
 764 template <>
 765 inline void
 766 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
 767                                             kmp_int64 ub, kmp_int64 st) {
 768   __kmp_dispatch_init_hierarchy<kmp_int64>(
 769       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
 770       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
 771 }
 772 template <>
 773 inline void
 774 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
 775                                              kmp_uint64 ub, kmp_int64 st) {
 776   __kmp_dispatch_init_hierarchy<kmp_uint64>(
 777       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
 778       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
 779 }
 780
 781 // free all the hierarchy scheduling memory associated with the team
 782 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
 783   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
 784   for (int i = 0; i < num_disp_buff; ++i) {
 785     // type does not matter here so use kmp_int32
 786     auto sh =
 787         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
 788             &team->t.t_disp_buffer[i]);
 789     if (sh->hier) {
 790       sh->hier->deallocate();
 791       __kmp_free(sh->hier);
 792     }
 793   }
 794 }
 795 #endif
 796
 797 // UT - unsigned flavor of T, ST - signed flavor of T,
 798 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
 799 template <typename T>
 800 static void
 801 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
 802                     T ub, typename traits_t<T>::signed_t st,
 803                     typename traits_t<T>::signed_t chunk, int push_ws) {
 804   typedef typename traits_t<T>::unsigned_t UT;
 805
 806   int active;
 807   kmp_info_t *th;
 808   kmp_team_t *team;
 809   kmp_uint32 my_buffer_index;
 810   dispatch_private_info_template<T> *pr;
 811   dispatch_shared_info_template<T> volatile *sh;
 812
 813   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
 814                    sizeof(dispatch_private_info));
 815   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
 816                    sizeof(dispatch_shared_info));
 817   __kmp_assert_valid_gtid(gtid);
 818
 819   if (!TCR_4(__kmp_init_parallel))
 820     __kmp_parallel_initialize();
 821
 822   __kmp_resume_if_soft_paused();
 823
 824 #if INCLUDE_SSC_MARKS
 825   SSC_MARK_DISPATCH_INIT();
 826 #endif
 827 #ifdef KMP_DEBUG
 828   typedef typename traits_t<T>::signed_t ST;
 829   {
 830     char *buff;
 831     // create format specifiers before the debug output
 832     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
 833                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
 834                             traits_t<ST>::spec, traits_t<T>::spec,
 835                             traits_t<T>::spec, traits_t<ST>::spec);
 836     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
 837     __kmp_str_free(&buff);
 838   }
 839 #endif
 840   /* setup data */
 841   th = __kmp_threads[gtid];
 842   team = th->th.th_team;
 843   active = !team->t.t_serialized;
 844   th->th.th_ident = loc;
 845
 846   // Any half-decent optimizer will remove this test when the blocks are empty
 847   // since the macros expand to nothing
 848   // when statistics are disabled.
 849   if (schedule == __kmp_static) {
 850     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
 851   } else {
 852     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
 853   }
 854
 855 #if KMP_USE_HIER_SCHED
 856   // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
 857   // Hierarchical scheduling does not work with ordered, so if ordered is
 858   // detected, then revert back to threaded scheduling.
 859   bool ordered;
 860   enum sched_type my_sched = schedule;
 861   my_buffer_index = th->th.th_dispatch->th_disp_index;
 862   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
 863       &th->th.th_dispatch
 864            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
 865   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
 866   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
 867     my_sched =
 868         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
 869   ordered = (kmp_ord_lower & my_sched);
 870   if (pr->flags.use_hier) {
 871     if (ordered) {
 872       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
 873                      "Disabling hierarchical scheduling.\n",
 874                      gtid));
 875       pr->flags.use_hier = FALSE;
 876     }
 877   }
 878   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
 879     // Don't use hierarchical for ordered parallel loops and don't
 880     // use the runtime hierarchy if one was specified in the program
 881     if (!ordered && !pr->flags.use_hier)
 882       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
 883   }
 884 #endif // KMP_USE_HIER_SCHED
 885
 886 #if USE_ITT_BUILD
 887   kmp_uint64 cur_chunk = chunk;
 888   int itt_need_metadata_reporting =
 889       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
 890       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
 891       team->t.t_active_level == 1;
 892 #endif
 893   if (!active) {
 894     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
 895         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
 896   } else {
 897     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
 898                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
 899
 900     my_buffer_index = th->th.th_dispatch->th_disp_index++;
 901
 902     /* What happens when number of threads changes, need to resize buffer? */
 903     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
 904         &th->th.th_dispatch
 905              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
 906     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
 907         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
 908     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
 909                   my_buffer_index));
 910     if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
 911       KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
 912                      " sh->buffer_index:%d\n",
 913                      gtid, my_buffer_index, sh->buffer_index));
 914       __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
 915                              __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
 916       // Note: KMP_WAIT() cannot be used there: buffer index and
 917       // my_buffer_index are *always* 32-bit integers.
 918       KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
 919                      "sh->buffer_index:%d\n",
 920                      gtid, my_buffer_index, sh->buffer_index));
 921     }
 922   }
 923
 924   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
 925 #if USE_ITT_BUILD
 926                                 &cur_chunk,
 927 #endif
 928                                 chunk, (T)th->th.th_team_nproc,
 929                                 (T)th->th.th_info.ds.ds_tid);
 930   if (active) {
 931     if (pr->flags.ordered == 0) {
 932       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
 933       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
 934     } else {
 935       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
 936       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
 937     }
 938     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
 939     th->th.th_dispatch->th_dispatch_sh_current =
 940         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
 941 #if USE_ITT_BUILD
 942     if (pr->flags.ordered) {
 943       __kmp_itt_ordered_init(gtid);
 944     }
 945     // Report loop metadata
 946     if (itt_need_metadata_reporting) {
 947       // Only report metadata by primary thread of active team at level 1
 948       kmp_uint64 schedtype = 0;
 949       switch (schedule) {
 950       case kmp_sch_static_chunked:
 951       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
 952         break;
 953       case kmp_sch_static_greedy:
 954         cur_chunk = pr->u.p.parm1;
 955         break;
 956       case kmp_sch_dynamic_chunked:
 957         schedtype = 1;
 958         break;
 959       case kmp_sch_guided_iterative_chunked:
 960       case kmp_sch_guided_analytical_chunked:
 961       case kmp_sch_guided_simd:
 962         schedtype = 2;
 963         break;
 964       default:
 965         // Should we put this case under "static"?
 966         // case kmp_sch_static_steal:
 967         schedtype = 3;
 968         break;
 969       }
 970       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
 971     }
 972 #if KMP_USE_HIER_SCHED
 973     if (pr->flags.use_hier) {
 974       pr->u.p.count = 0;
 975       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
 976     }
 977 #endif // KMP_USER_HIER_SCHED
 978 #endif /* USE_ITT_BUILD */
 979   }
 980
 981 #ifdef KMP_DEBUG
 982   {
 983     char *buff;
 984     // create format specifiers before the debug output
 985     buff = __kmp_str_format(
 986         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
 987         "lb:%%%s ub:%%%s"
 988         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
 989         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
 990         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
 991         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
 992         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
 993         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
 994     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
 995                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
 996                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
 997                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
 998     __kmp_str_free(&buff);
 999   }
1000 #endif
1001 #if OMPT_SUPPORT && OMPT_OPTIONAL
1002   if (ompt_enabled.ompt_callback_work) {
1003     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1004     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1005     ompt_callbacks.ompt_callback(ompt_callback_work)(
1006         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1007         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1008   }
1009 #endif
1010   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1011 }
1012
1013 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1014  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1015  * every chunk of iterations.  If the ordered section(s) were not executed
1016  * for this iteration (or every iteration in this chunk), we need to set the
1017  * ordered iteration counters so that the next thread can proceed. */
1018 template <typename UT>
1019 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1020   typedef typename traits_t<UT>::signed_t ST;
1021   __kmp_assert_valid_gtid(gtid);
1022   kmp_info_t *th = __kmp_threads[gtid];
1023
1024   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1025   if (!th->th.th_team->t.t_serialized) {
1026
1027     dispatch_private_info_template<UT> *pr =
1028         reinterpret_cast<dispatch_private_info_template<UT> *>(
1029             th->th.th_dispatch->th_dispatch_pr_current);
1030     dispatch_shared_info_template<UT> volatile *sh =
1031         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1032             th->th.th_dispatch->th_dispatch_sh_current);
1033     KMP_DEBUG_ASSERT(pr);
1034     KMP_DEBUG_ASSERT(sh);
1035     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1036                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1037
1038     if (pr->ordered_bumped) {
1039       KD_TRACE(
1040           1000,
1041           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1042            gtid));
1043       pr->ordered_bumped = 0;
1044     } else {
1045       UT lower = pr->u.p.ordered_lower;
1046
1047 #ifdef KMP_DEBUG
1048       {
1049         char *buff;
1050         // create format specifiers before the debug output
1051         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1052                                 "ordered_iteration:%%%s lower:%%%s\n",
1053                                 traits_t<UT>::spec, traits_t<UT>::spec);
1054         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1055         __kmp_str_free(&buff);
1056       }
1057 #endif
1058
1059       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1060                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1061       KMP_MB(); /* is this necessary? */
1062 #ifdef KMP_DEBUG
1063       {
1064         char *buff;
1065         // create format specifiers before the debug output
1066         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1067                                 "ordered_iteration:%%%s lower:%%%s\n",
1068                                 traits_t<UT>::spec, traits_t<UT>::spec);
1069         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1070         __kmp_str_free(&buff);
1071       }
1072 #endif
1073
1074       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1075     } // if
1076   } // if
1077   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1078 }
1079
1080 #ifdef KMP_GOMP_COMPAT
1081
1082 template <typename UT>
1083 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1084   typedef typename traits_t<UT>::signed_t ST;
1085   __kmp_assert_valid_gtid(gtid);
1086   kmp_info_t *th = __kmp_threads[gtid];
1087
1088   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1089   if (!th->th.th_team->t.t_serialized) {
1090     dispatch_private_info_template<UT> *pr =
1091         reinterpret_cast<dispatch_private_info_template<UT> *>(
1092             th->th.th_dispatch->th_dispatch_pr_current);
1093     dispatch_shared_info_template<UT> volatile *sh =
1094         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1095             th->th.th_dispatch->th_dispatch_sh_current);
1096     KMP_DEBUG_ASSERT(pr);
1097     KMP_DEBUG_ASSERT(sh);
1098     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1099                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1100
1101     UT lower = pr->u.p.ordered_lower;
1102     UT upper = pr->u.p.ordered_upper;
1103     UT inc = upper - lower + 1;
1104
1105     if (pr->ordered_bumped == inc) {
1106       KD_TRACE(
1107           1000,
1108           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1109            gtid));
1110       pr->ordered_bumped = 0;
1111     } else {
1112       inc -= pr->ordered_bumped;
1113
1114 #ifdef KMP_DEBUG
1115       {
1116         char *buff;
1117         // create format specifiers before the debug output
1118         buff = __kmp_str_format(
1119             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1120             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1121             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1122         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1123         __kmp_str_free(&buff);
1124       }
1125 #endif
1126
1127       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1128                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1129
1130       KMP_MB(); /* is this necessary? */
1131       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1132                       "ordered_bumped to zero\n",
1133                       gtid));
1134       pr->ordered_bumped = 0;
1135 //!!!!! TODO check if the inc should be unsigned, or signed???
1136 #ifdef KMP_DEBUG
1137       {
1138         char *buff;
1139         // create format specifiers before the debug output
1140         buff = __kmp_str_format(
1141             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1142             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1143             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1144             traits_t<UT>::spec);
1145         KD_TRACE(1000,
1146                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1147         __kmp_str_free(&buff);
1148       }
1149 #endif
1150
1151       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1152     }
1153     //        }
1154   }
1155   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1156 }
1157
1158 #endif /* KMP_GOMP_COMPAT */
1159
1160 template <typename T>
1161 int __kmp_dispatch_next_algorithm(int gtid,
1162                                   dispatch_private_info_template<T> *pr,
1163                                   dispatch_shared_info_template<T> volatile *sh,
1164                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1165                                   typename traits_t<T>::signed_t *p_st, T nproc,
1166                                   T tid) {
1167   typedef typename traits_t<T>::unsigned_t UT;
1168   typedef typename traits_t<T>::signed_t ST;
1169   typedef typename traits_t<T>::floating_t DBL;
1170   int status = 0;
1171   bool last = false;
1172   T start;
1173   ST incr;
1174   UT limit, trip, init;
1175   kmp_info_t *th = __kmp_threads[gtid];
1176   kmp_team_t *team = th->th.th_team;
1177
1178   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1179                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1180   KMP_DEBUG_ASSERT(pr);
1181   KMP_DEBUG_ASSERT(sh);
1182   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1183 #ifdef KMP_DEBUG
1184   {
1185     char *buff;
1186     // create format specifiers before the debug output
1187     buff =
1188         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1189                          "sh:%%p nproc:%%%s tid:%%%s\n",
1190                          traits_t<T>::spec, traits_t<T>::spec);
1191     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1192     __kmp_str_free(&buff);
1193   }
1194 #endif
1195
1196   // zero trip count
1197   if (pr->u.p.tc == 0) {
1198     KD_TRACE(10,
1199              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1200               "zero status:%d\n",
1201               gtid, status));
1202     return 0;
1203   }
1204
1205   switch (pr->schedule) {
1206 #if KMP_STATIC_STEAL_ENABLED
1207   case kmp_sch_static_steal: {
1208     T chunk = pr->u.p.parm1;
1209     UT nchunks = pr->u.p.parm2;
1210     KD_TRACE(100,
1211              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1212               gtid));
1213
1214     trip = pr->u.p.tc - 1;
1215
1216     if (traits_t<T>::type_size > 4) {
1217       // use lock for 8-byte induction variable.
1218       // TODO (optional): check presence and use 16-byte CAS
1219       kmp_lock_t *lck = pr->u.p.steal_lock;
1220       KMP_DEBUG_ASSERT(lck != NULL);
1221       if (pr->u.p.count < (UT)pr->u.p.ub) {
1222         KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1223         __kmp_acquire_lock(lck, gtid);
1224         // try to get own chunk of iterations
1225         init = (pr->u.p.count)++;
1226         status = (init < (UT)pr->u.p.ub);
1227         __kmp_release_lock(lck, gtid);
1228       } else {
1229         status = 0; // no own chunks
1230       }
1231       if (!status) { // try to steal
1232         kmp_lock_t *lckv; // victim buffer's lock
1233         T while_limit = pr->u.p.parm3;
1234         T while_index = 0;
1235         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1236                   __kmp_dispatch_num_buffers; // current loop index
1237         // note: victim thread can potentially execute another loop
1238         KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1239         while ((!status) && (while_limit != ++while_index)) {
1240           dispatch_private_info_template<T> *v;
1241           T remaining;
1242           T victimId = pr->u.p.parm4;
1243           T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1244           v = reinterpret_cast<dispatch_private_info_template<T> *>(
1245               &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1246           KMP_DEBUG_ASSERT(v);
1247           while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1248                  oldVictimId != victimId) {
1249             victimId = (victimId + 1) % nproc;
1250             v = reinterpret_cast<dispatch_private_info_template<T> *>(
1251                 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1252             KMP_DEBUG_ASSERT(v);
1253           }
1254           if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1255             continue; // try once more (nproc attempts in total)
1256           }
1257           if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1258             kmp_uint32 old = UNUSED;
1259             // try to steal whole range from inactive victim
1260             status = v->steal_flag.compare_exchange_strong(old, THIEF);
1261             if (status) {
1262               // initialize self buffer with victim's whole range of chunks
1263               T id = victimId;
1264               T small_chunk, extras;
1265               small_chunk = nchunks / nproc; // chunks per thread
1266               extras = nchunks % nproc;
1267               init = id * small_chunk + (id < extras ? id : extras);
1268               __kmp_acquire_lock(lck, gtid);
1269               pr->u.p.count = init + 1; // exclude one we execute immediately
1270               pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1271               __kmp_release_lock(lck, gtid);
1272               pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1273               // no need to reinitialize other thread invariants: lb, st, etc.
1274 #ifdef KMP_DEBUG
1275               {
1276                 char *buff;
1277                 // create format specifiers before the debug output
1278                 buff = __kmp_str_format(
1279                     "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1280                     "count:%%%s ub:%%%s\n",
1281                     traits_t<UT>::spec, traits_t<T>::spec);
1282                 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1283                 __kmp_str_free(&buff);
1284               }
1285 #endif
1286               // activate non-empty buffer and let others steal from us
1287               if (pr->u.p.count < (UT)pr->u.p.ub)
1288                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1289               break;
1290             }
1291           }
1292           if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1293               v->u.p.count >= (UT)v->u.p.ub) {
1294             pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1295             continue; // no chunks to steal, try next victim
1296           }
1297           lckv = v->u.p.steal_lock;
1298           KMP_ASSERT(lckv != NULL);
1299           __kmp_acquire_lock(lckv, gtid);
1300           limit = v->u.p.ub; // keep initial ub
1301           if (v->u.p.count >= limit) {
1302             __kmp_release_lock(lckv, gtid);
1303             pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1304             continue; // no chunks to steal, try next victim
1305           }
1306
1307           // stealing succeded, reduce victim's ub by 1/4 of undone chunks
1308           // TODO: is this heuristics good enough??
1309           remaining = limit - v->u.p.count;
1310           if (remaining > 7) {
1311             // steal 1/4 of remaining
1312             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1313             init = (v->u.p.ub -= (remaining >> 2));
1314           } else {
1315             // steal 1 chunk of 1..7 remaining
1316             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1317             init = (v->u.p.ub -= 1);
1318           }
1319           __kmp_release_lock(lckv, gtid);
1320 #ifdef KMP_DEBUG
1321           {
1322             char *buff;
1323             // create format specifiers before the debug output
1324             buff = __kmp_str_format(
1325                 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1326                 "count:%%%s ub:%%%s\n",
1327                 traits_t<UT>::spec, traits_t<UT>::spec);
1328             KD_TRACE(10, (buff, gtid, victimId, init, limit));
1329             __kmp_str_free(&buff);
1330           }
1331 #endif
1332           KMP_DEBUG_ASSERT(init + 1 <= limit);
1333           pr->u.p.parm4 = victimId; // remember victim to steal from
1334           status = 1;
1335           // now update own count and ub with stolen range excluding init chunk
1336           __kmp_acquire_lock(lck, gtid);
1337           pr->u.p.count = init + 1;
1338           pr->u.p.ub = limit;
1339           __kmp_release_lock(lck, gtid);
1340           // activate non-empty buffer and let others steal from us
1341           if (init + 1 < limit)
1342             KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1343         } // while (search for victim)
1344       } // if (try to find victim and steal)
1345     } else {
1346       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1347       // as all operations on pair (count, ub) must be done atomically
1348       typedef union {
1349         struct {
1350           UT count;
1351           T ub;
1352         } p;
1353         kmp_int64 b;
1354       } union_i4;
1355       union_i4 vold, vnew;
1356       if (pr->u.p.count < (UT)pr->u.p.ub) {
1357         KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1358         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1359         vnew.b = vold.b;
1360         vnew.p.count++; // get chunk from head of self range
1361         while (!KMP_COMPARE_AND_STORE_REL64(
1362             (volatile kmp_int64 *)&pr->u.p.count,
1363             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1364             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1365           KMP_CPU_PAUSE();
1366           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1367           vnew.b = vold.b;
1368           vnew.p.count++;
1369         }
1370         init = vold.p.count;
1371         status = (init < (UT)vold.p.ub);
1372       } else {
1373         status = 0; // no own chunks
1374       }
1375       if (!status) { // try to steal
1376         T while_limit = pr->u.p.parm3;
1377         T while_index = 0;
1378         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1379                   __kmp_dispatch_num_buffers; // current loop index
1380         // note: victim thread can potentially execute another loop
1381         KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1382         while ((!status) && (while_limit != ++while_index)) {
1383           dispatch_private_info_template<T> *v;
1384           T remaining;
1385           T victimId = pr->u.p.parm4;
1386           T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1387           v = reinterpret_cast<dispatch_private_info_template<T> *>(
1388               &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1389           KMP_DEBUG_ASSERT(v);
1390           while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1391                  oldVictimId != victimId) {
1392             victimId = (victimId + 1) % nproc;
1393             v = reinterpret_cast<dispatch_private_info_template<T> *>(
1394                 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1395             KMP_DEBUG_ASSERT(v);
1396           }
1397           if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1398             continue; // try once more (nproc attempts in total)
1399           }
1400           if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1401             kmp_uint32 old = UNUSED;
1402             // try to steal whole range from inactive victim
1403             status = v->steal_flag.compare_exchange_strong(old, THIEF);
1404             if (status) {
1405               // initialize self buffer with victim's whole range of chunks
1406               T id = victimId;
1407               T small_chunk, extras;
1408               small_chunk = nchunks / nproc; // chunks per thread
1409               extras = nchunks % nproc;
1410               init = id * small_chunk + (id < extras ? id : extras);
1411               vnew.p.count = init + 1;
1412               vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1413               // write pair (count, ub) at once atomically
1414 #if KMP_ARCH_X86
1415               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1416 #else
1417               *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1418 #endif
1419               pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1420               // no need to initialize other thread invariants: lb, st, etc.
1421 #ifdef KMP_DEBUG
1422               {
1423                 char *buff;
1424                 // create format specifiers before the debug output
1425                 buff = __kmp_str_format(
1426                     "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1427                     "count:%%%s ub:%%%s\n",
1428                     traits_t<UT>::spec, traits_t<T>::spec);
1429                 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1430                 __kmp_str_free(&buff);
1431               }
1432 #endif
1433               // activate non-empty buffer and let others steal from us
1434               if (pr->u.p.count < (UT)pr->u.p.ub)
1435                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1436               break;
1437             }
1438           }
1439           while (1) { // CAS loop with check if victim still has enough chunks
1440             // many threads may be stealing concurrently from same victim
1441             vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1442             if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1443                 vold.p.count >= (UT)vold.p.ub) {
1444               pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1445               break; // no chunks to steal, try next victim
1446             }
1447             vnew.b = vold.b;
1448             remaining = vold.p.ub - vold.p.count;
1449             // try to steal 1/4 of remaining
1450             // TODO: is this heuristics good enough??
1451             if (remaining > 7) {
1452               vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1453             } else {
1454               vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1455             }
1456             KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1457             if (KMP_COMPARE_AND_STORE_REL64(
1458                     (volatile kmp_int64 *)&v->u.p.count,
1459                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1460                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1461               // stealing succedded
1462 #ifdef KMP_DEBUG
1463               {
1464                 char *buff;
1465                 // create format specifiers before the debug output
1466                 buff = __kmp_str_format(
1467                     "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1468                     "count:%%%s ub:%%%s\n",
1469                     traits_t<T>::spec, traits_t<T>::spec);
1470                 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1471                 __kmp_str_free(&buff);
1472               }
1473 #endif
1474               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1475                                         vold.p.ub - vnew.p.ub);
1476               status = 1;
1477               pr->u.p.parm4 = victimId; // keep victim id
1478               // now update own count and ub
1479               init = vnew.p.ub;
1480               vold.p.count = init + 1;
1481 #if KMP_ARCH_X86
1482               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1483 #else
1484               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1485 #endif
1486               // activate non-empty buffer and let others steal from us
1487               if (vold.p.count < (UT)vold.p.ub)
1488                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1489               break;
1490             } // if (check CAS result)
1491             KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1492           } // while (try to steal from particular victim)
1493         } // while (search for victim)
1494       } // if (try to find victim and steal)
1495     } // if (4-byte induction variable)
1496     if (!status) {
1497       *p_lb = 0;
1498       *p_ub = 0;
1499       if (p_st != NULL)
1500         *p_st = 0;
1501     } else {
1502       start = pr->u.p.lb;
1503       init *= chunk;
1504       limit = chunk + init - 1;
1505       incr = pr->u.p.st;
1506       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1507
1508       KMP_DEBUG_ASSERT(init <= trip);
1509       // keep track of done chunks for possible early exit from stealing
1510       // TODO: count executed chunks locally with rare update of shared location
1511       // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1512       if ((last = (limit >= trip)) != 0)
1513         limit = trip;
1514       if (p_st != NULL)
1515         *p_st = incr;
1516
1517       if (incr == 1) {
1518         *p_lb = start + init;
1519         *p_ub = start + limit;
1520       } else {
1521         *p_lb = start + init * incr;
1522         *p_ub = start + limit * incr;
1523       }
1524     } // if
1525     break;
1526   } // case
1527 #endif // KMP_STATIC_STEAL_ENABLED
1528   case kmp_sch_static_balanced: {
1529     KD_TRACE(
1530         10,
1531         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1532          gtid));
1533     /* check if thread has any iteration to do */
1534     if ((status = !pr->u.p.count) != 0) {
1535       pr->u.p.count = 1;
1536       *p_lb = pr->u.p.lb;
1537       *p_ub = pr->u.p.ub;
1538       last = (pr->u.p.parm1 != 0);
1539       if (p_st != NULL)
1540         *p_st = pr->u.p.st;
1541     } else { /* no iterations to do */
1542       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1543     }
1544   } // case
1545   break;
1546   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1547                                  merged here */
1548   case kmp_sch_static_chunked: {
1549     T parm1;
1550
1551     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1552                    "kmp_sch_static_[affinity|chunked] case\n",
1553                    gtid));
1554     parm1 = pr->u.p.parm1;
1555
1556     trip = pr->u.p.tc - 1;
1557     init = parm1 * (pr->u.p.count + tid);
1558
1559     if ((status = (init <= trip)) != 0) {
1560       start = pr->u.p.lb;
1561       incr = pr->u.p.st;
1562       limit = parm1 + init - 1;
1563
1564       if ((last = (limit >= trip)) != 0)
1565         limit = trip;
1566
1567       if (p_st != NULL)
1568         *p_st = incr;
1569
1570       pr->u.p.count += nproc;
1571
1572       if (incr == 1) {
1573         *p_lb = start + init;
1574         *p_ub = start + limit;
1575       } else {
1576         *p_lb = start + init * incr;
1577         *p_ub = start + limit * incr;
1578       }
1579
1580       if (pr->flags.ordered) {
1581         pr->u.p.ordered_lower = init;
1582         pr->u.p.ordered_upper = limit;
1583       } // if
1584     } // if
1585   } // case
1586   break;
1587
1588   case kmp_sch_dynamic_chunked: {
1589     UT chunk_number;
1590     UT chunk_size = pr->u.p.parm1;
1591     UT nchunks = pr->u.p.parm2;
1592
1593     KD_TRACE(
1594         100,
1595         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1596          gtid));
1597
1598     chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1599     status = (chunk_number < nchunks);
1600     if (!status) {
1601       *p_lb = 0;
1602       *p_ub = 0;
1603       if (p_st != NULL)
1604         *p_st = 0;
1605     } else {
1606       init = chunk_size * chunk_number;
1607       trip = pr->u.p.tc - 1;
1608       start = pr->u.p.lb;
1609       incr = pr->u.p.st;
1610
1611       if ((last = (trip - init < (UT)chunk_size)))
1612         limit = trip;
1613       else
1614         limit = chunk_size + init - 1;
1615
1616       if (p_st != NULL)
1617         *p_st = incr;
1618
1619       if (incr == 1) {
1620         *p_lb = start + init;
1621         *p_ub = start + limit;
1622       } else {
1623         *p_lb = start + init * incr;
1624         *p_ub = start + limit * incr;
1625       }
1626
1627       if (pr->flags.ordered) {
1628         pr->u.p.ordered_lower = init;
1629         pr->u.p.ordered_upper = limit;
1630       } // if
1631     } // if
1632   } // case
1633   break;
1634
1635   case kmp_sch_guided_iterative_chunked: {
1636     T chunkspec = pr->u.p.parm1;
1637     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1638                    "iterative case\n",
1639                    gtid));
1640     trip = pr->u.p.tc;
1641     // Start atomic part of calculations
1642     while (1) {
1643       ST remaining; // signed, because can be < 0
1644       init = sh->u.s.iteration; // shared value
1645       remaining = trip - init;
1646       if (remaining <= 0) { // AC: need to compare with 0 first
1647         // nothing to do, don't try atomic op
1648         status = 0;
1649         break;
1650       }
1651       if ((T)remaining <
1652           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1653         // use dynamic-style schedule
1654         // atomically increment iterations, get old value
1655         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1656                                  (ST)chunkspec);
1657         remaining = trip - init;
1658         if (remaining <= 0) {
1659           status = 0; // all iterations got by other threads
1660         } else {
1661           // got some iterations to work on
1662           status = 1;
1663           if ((T)remaining > chunkspec) {
1664             limit = init + chunkspec - 1;
1665           } else {
1666             last = true; // the last chunk
1667             limit = init + remaining - 1;
1668           } // if
1669         } // if
1670         break;
1671       } // if
1672       limit = init + (UT)((double)remaining *
1673                           *(double *)&pr->u.p.parm3); // divide by K*nproc
1674       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1675                                (ST)init, (ST)limit)) {
1676         // CAS was successful, chunk obtained
1677         status = 1;
1678         --limit;
1679         break;
1680       } // if
1681     } // while
1682     if (status != 0) {
1683       start = pr->u.p.lb;
1684       incr = pr->u.p.st;
1685       if (p_st != NULL)
1686         *p_st = incr;
1687       *p_lb = start + init * incr;
1688       *p_ub = start + limit * incr;
1689       if (pr->flags.ordered) {
1690         pr->u.p.ordered_lower = init;
1691         pr->u.p.ordered_upper = limit;
1692       } // if
1693     } else {
1694       *p_lb = 0;
1695       *p_ub = 0;
1696       if (p_st != NULL)
1697         *p_st = 0;
1698     } // if
1699   } // case
1700   break;
1701
1702   case kmp_sch_guided_simd: {
1703     // same as iterative but curr-chunk adjusted to be multiple of given
1704     // chunk
1705     T chunk = pr->u.p.parm1;
1706     KD_TRACE(100,
1707              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1708               gtid));
1709     trip = pr->u.p.tc;
1710     // Start atomic part of calculations
1711     while (1) {
1712       ST remaining; // signed, because can be < 0
1713       init = sh->u.s.iteration; // shared value
1714       remaining = trip - init;
1715       if (remaining <= 0) { // AC: need to compare with 0 first
1716         status = 0; // nothing to do, don't try atomic op
1717         break;
1718       }
1719       KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
1720       // compare with K*nproc*(chunk+1), K=2 by default
1721       if ((T)remaining < pr->u.p.parm2) {
1722         // use dynamic-style schedule
1723         // atomically increment iterations, get old value
1724         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1725                                  (ST)chunk);
1726         remaining = trip - init;
1727         if (remaining <= 0) {
1728           status = 0; // all iterations got by other threads
1729         } else {
1730           // got some iterations to work on
1731           status = 1;
1732           if ((T)remaining > chunk) {
1733             limit = init + chunk - 1;
1734           } else {
1735             last = true; // the last chunk
1736             limit = init + remaining - 1;
1737           } // if
1738         } // if
1739         break;
1740       } // if
1741       // divide by K*nproc
1742       UT span;
1743       __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1744                          &span);
1745       UT rem = span % chunk;
1746       if (rem) // adjust so that span%chunk == 0
1747         span += chunk - rem;
1748       limit = init + span;
1749       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1750                                (ST)init, (ST)limit)) {
1751         // CAS was successful, chunk obtained
1752         status = 1;
1753         --limit;
1754         break;
1755       } // if
1756     } // while
1757     if (status != 0) {
1758       start = pr->u.p.lb;
1759       incr = pr->u.p.st;
1760       if (p_st != NULL)
1761         *p_st = incr;
1762       *p_lb = start + init * incr;
1763       *p_ub = start + limit * incr;
1764       if (pr->flags.ordered) {
1765         pr->u.p.ordered_lower = init;
1766         pr->u.p.ordered_upper = limit;
1767       } // if
1768     } else {
1769       *p_lb = 0;
1770       *p_ub = 0;
1771       if (p_st != NULL)
1772         *p_st = 0;
1773     } // if
1774   } // case
1775   break;
1776
1777   case kmp_sch_guided_analytical_chunked: {
1778     T chunkspec = pr->u.p.parm1;
1779     UT chunkIdx;
1780 #if KMP_USE_X87CONTROL
1781     /* for storing original FPCW value for Windows* OS on
1782        IA-32 architecture 8-byte version */
1783     unsigned int oldFpcw;
1784     unsigned int fpcwSet = 0;
1785 #endif
1786     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1787                    "kmp_sch_guided_analytical_chunked case\n",
1788                    gtid));
1789
1790     trip = pr->u.p.tc;
1791
1792     KMP_DEBUG_ASSERT(nproc > 1);
1793     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1794
1795     while (1) { /* this while loop is a safeguard against unexpected zero
1796                    chunk sizes */
1797       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1798       if (chunkIdx >= (UT)pr->u.p.parm2) {
1799         --trip;
1800         /* use dynamic-style scheduling */
1801         init = chunkIdx * chunkspec + pr->u.p.count;
1802         /* need to verify init > 0 in case of overflow in the above
1803          * calculation */
1804         if ((status = (init > 0 && init <= trip)) != 0) {
1805           limit = init + chunkspec - 1;
1806
1807           if ((last = (limit >= trip)) != 0)
1808             limit = trip;
1809         }
1810         break;
1811       } else {
1812 /* use exponential-style scheduling */
1813 /* The following check is to workaround the lack of long double precision on
1814    Windows* OS.
1815    This check works around the possible effect that init != 0 for chunkIdx == 0.
1816  */
1817 #if KMP_USE_X87CONTROL
1818         /* If we haven't already done so, save original
1819            FPCW and set precision to 64-bit, as Windows* OS
1820            on IA-32 architecture defaults to 53-bit */
1821         if (!fpcwSet) {
1822           oldFpcw = _control87(0, 0);
1823           _control87(_PC_64, _MCW_PC);
1824           fpcwSet = 0x30000;
1825         }
1826 #endif
1827         if (chunkIdx) {
1828           init = __kmp_dispatch_guided_remaining<T>(
1829               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1830           KMP_DEBUG_ASSERT(init);
1831           init = trip - init;
1832         } else
1833           init = 0;
1834         limit = trip - __kmp_dispatch_guided_remaining<T>(
1835                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1836         KMP_ASSERT(init <= limit);
1837         if (init < limit) {
1838           KMP_DEBUG_ASSERT(limit <= trip);
1839           --limit;
1840           status = 1;
1841           break;
1842         } // if
1843       } // if
1844     } // while (1)
1845 #if KMP_USE_X87CONTROL
1846     /* restore FPCW if necessary
1847        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1848     */
1849     if (fpcwSet && (oldFpcw & fpcwSet))
1850       _control87(oldFpcw, _MCW_PC);
1851 #endif
1852     if (status != 0) {
1853       start = pr->u.p.lb;
1854       incr = pr->u.p.st;
1855       if (p_st != NULL)
1856         *p_st = incr;
1857       *p_lb = start + init * incr;
1858       *p_ub = start + limit * incr;
1859       if (pr->flags.ordered) {
1860         pr->u.p.ordered_lower = init;
1861         pr->u.p.ordered_upper = limit;
1862       }
1863     } else {
1864       *p_lb = 0;
1865       *p_ub = 0;
1866       if (p_st != NULL)
1867         *p_st = 0;
1868     }
1869   } // case
1870   break;
1871
1872   case kmp_sch_trapezoidal: {
1873     UT index;
1874     T parm2 = pr->u.p.parm2;
1875     T parm3 = pr->u.p.parm3;
1876     T parm4 = pr->u.p.parm4;
1877     KD_TRACE(100,
1878              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1879               gtid));
1880
1881     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1882
1883     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1884     trip = pr->u.p.tc - 1;
1885
1886     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1887       *p_lb = 0;
1888       *p_ub = 0;
1889       if (p_st != NULL)
1890         *p_st = 0;
1891     } else {
1892       start = pr->u.p.lb;
1893       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1894       incr = pr->u.p.st;
1895
1896       if ((last = (limit >= trip)) != 0)
1897         limit = trip;
1898
1899       if (p_st != NULL)
1900         *p_st = incr;
1901
1902       if (incr == 1) {
1903         *p_lb = start + init;
1904         *p_ub = start + limit;
1905       } else {
1906         *p_lb = start + init * incr;
1907         *p_ub = start + limit * incr;
1908       }
1909
1910       if (pr->flags.ordered) {
1911         pr->u.p.ordered_lower = init;
1912         pr->u.p.ordered_upper = limit;
1913       } // if
1914     } // if
1915   } // case
1916   break;
1917   default: {
1918     status = 0; // to avoid complaints on uninitialized variable use
1919     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1920                 KMP_HNT(GetNewerLibrary), // Hint
1921                 __kmp_msg_null // Variadic argument list terminator
1922     );
1923   } break;
1924   } // switch
1925   if (p_last)
1926     *p_last = last;
1927 #ifdef KMP_DEBUG
1928   if (pr->flags.ordered) {
1929     char *buff;
1930     // create format specifiers before the debug output
1931     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1932                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1933                             traits_t<UT>::spec, traits_t<UT>::spec);
1934     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1935     __kmp_str_free(&buff);
1936   }
1937   {
1938     char *buff;
1939     // create format specifiers before the debug output
1940     buff = __kmp_str_format(
1941         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1942         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1943         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1944     KMP_DEBUG_ASSERT(p_last);
1945     KMP_DEBUG_ASSERT(p_st);
1946     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1947     __kmp_str_free(&buff);
1948   }
1949 #endif
1950   return status;
1951 }
1952
1953 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1954    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1955    is not called. */
1956 #if OMPT_SUPPORT && OMPT_OPTIONAL
1957 #define OMPT_LOOP_END                                                          \
1958   if (status == 0) {                                                           \
1959     if (ompt_enabled.ompt_callback_work) {                                     \
1960       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1961       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1962       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1963           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1964           &(task_info->task_data), 0, codeptr);                                \
1965     }                                                                          \
1966   }
1967 #define OMPT_LOOP_DISPATCH(lb, ub, st, status)                                 \
1968   if (ompt_enabled.ompt_callback_dispatch && status) {                         \
1969     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);                \
1970     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);              \
1971     ompt_dispatch_chunk_t chunk;                                               \
1972     ompt_data_t instance = ompt_data_none;                                     \
1973     OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st);                                \
1974     instance.ptr = &chunk;                                                     \
1975     ompt_callbacks.ompt_callback(ompt_callback_dispatch)(                      \
1976         &(team_info->parallel_data), &(task_info->task_data),                  \
1977         ompt_dispatch_ws_loop_chunk, instance);                                \
1978   }
1979 // TODO: implement count
1980 #else
1981 #define OMPT_LOOP_END // no-op
1982 #define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op
1983 #endif
1984
1985 #if KMP_STATS_ENABLED
1986 #define KMP_STATS_LOOP_END                                                     \
1987   {                                                                            \
1988     kmp_int64 u, l, t, i;                                                      \
1989     l = (kmp_int64)(*p_lb);                                                    \
1990     u = (kmp_int64)(*p_ub);                                                    \
1991     i = (kmp_int64)(pr->u.p.st);                                               \
1992     if (status == 0) {                                                         \
1993       t = 0;                                                                   \
1994       KMP_POP_PARTITIONED_TIMER();                                             \
1995     } else if (i == 1) {                                                       \
1996       if (u >= l)                                                              \
1997         t = u - l + 1;                                                         \
1998       else                                                                     \
1999         t = 0;                                                                 \
2000     } else if (i < 0) {                                                        \
2001       if (l >= u)                                                              \
2002         t = (l - u) / (-i) + 1;                                                \
2003       else                                                                     \
2004         t = 0;                                                                 \
2005     } else {                                                                   \
2006       if (u >= l)                                                              \
2007         t = (u - l) / i + 1;                                                   \
2008       else                                                                     \
2009         t = 0;                                                                 \
2010     }                                                                          \
2011     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
2012   }
2013 #else
2014 #define KMP_STATS_LOOP_END /* Nothing */
2015 #endif
2016
2017 template <typename T>
2018 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2019                                T *p_lb, T *p_ub,
2020                                typename traits_t<T>::signed_t *p_st
2021 #if OMPT_SUPPORT && OMPT_OPTIONAL
2022                                ,
2023                                void *codeptr
2024 #endif
2025 ) {
2026
2027   typedef typename traits_t<T>::unsigned_t UT;
2028   typedef typename traits_t<T>::signed_t ST;
2029   // This is potentially slightly misleading, schedule(runtime) will appear here
2030   // even if the actual runtime schedule is static. (Which points out a
2031   // disadvantage of schedule(runtime): even when static scheduling is used it
2032   // costs more than a compile time choice to use static scheduling would.)
2033   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2034
2035   int status;
2036   dispatch_private_info_template<T> *pr;
2037   __kmp_assert_valid_gtid(gtid);
2038   kmp_info_t *th = __kmp_threads[gtid];
2039   kmp_team_t *team = th->th.th_team;
2040
2041   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2042   KD_TRACE(
2043       1000,
2044       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2045        gtid, p_lb, p_ub, p_st, p_last));
2046
2047   if (team->t.t_serialized) {
2048     /* NOTE: serialize this dispatch because we are not at the active level */
2049     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2050         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2051     KMP_DEBUG_ASSERT(pr);
2052
2053     if ((status = (pr->u.p.tc != 0)) == 0) {
2054       *p_lb = 0;
2055       *p_ub = 0;
2056       //            if ( p_last != NULL )
2057       //                *p_last = 0;
2058       if (p_st != NULL)
2059         *p_st = 0;
2060       if (__kmp_env_consistency_check) {
2061         if (pr->pushed_ws != ct_none) {
2062           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2063         }
2064       }
2065     } else if (pr->flags.nomerge) {
2066       kmp_int32 last;
2067       T start;
2068       UT limit, trip, init;
2069       ST incr;
2070       T chunk = pr->u.p.parm1;
2071
2072       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2073                      gtid));
2074
2075       init = chunk * pr->u.p.count++;
2076       trip = pr->u.p.tc - 1;
2077
2078       if ((status = (init <= trip)) == 0) {
2079         *p_lb = 0;
2080         *p_ub = 0;
2081         //                if ( p_last != NULL )
2082         //                    *p_last = 0;
2083         if (p_st != NULL)
2084           *p_st = 0;
2085         if (__kmp_env_consistency_check) {
2086           if (pr->pushed_ws != ct_none) {
2087             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2088           }
2089         }
2090       } else {
2091         start = pr->u.p.lb;
2092         limit = chunk + init - 1;
2093         incr = pr->u.p.st;
2094
2095         if ((last = (limit >= trip)) != 0) {
2096           limit = trip;
2097 #if KMP_OS_WINDOWS
2098           pr->u.p.last_upper = pr->u.p.ub;
2099 #endif /* KMP_OS_WINDOWS */
2100         }
2101         if (p_last != NULL)
2102           *p_last = last;
2103         if (p_st != NULL)
2104           *p_st = incr;
2105         if (incr == 1) {
2106           *p_lb = start + init;
2107           *p_ub = start + limit;
2108         } else {
2109           *p_lb = start + init * incr;
2110           *p_ub = start + limit * incr;
2111         }
2112
2113         if (pr->flags.ordered) {
2114           pr->u.p.ordered_lower = init;
2115           pr->u.p.ordered_upper = limit;
2116 #ifdef KMP_DEBUG
2117           {
2118             char *buff;
2119             // create format specifiers before the debug output
2120             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2121                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
2122                                     traits_t<UT>::spec, traits_t<UT>::spec);
2123             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2124                             pr->u.p.ordered_upper));
2125             __kmp_str_free(&buff);
2126           }
2127 #endif
2128         } // if
2129       } // if
2130     } else {
2131       pr->u.p.tc = 0;
2132       *p_lb = pr->u.p.lb;
2133       *p_ub = pr->u.p.ub;
2134 #if KMP_OS_WINDOWS
2135       pr->u.p.last_upper = *p_ub;
2136 #endif /* KMP_OS_WINDOWS */
2137       if (p_last != NULL)
2138         *p_last = TRUE;
2139       if (p_st != NULL)
2140         *p_st = pr->u.p.st;
2141     } // if
2142 #ifdef KMP_DEBUG
2143     {
2144       char *buff;
2145       // create format specifiers before the debug output
2146       buff = __kmp_str_format(
2147           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2148           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2149           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2150       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2151                     (p_last ? *p_last : 0), status));
2152       __kmp_str_free(&buff);
2153     }
2154 #endif
2155 #if INCLUDE_SSC_MARKS
2156     SSC_MARK_DISPATCH_NEXT();
2157 #endif
2158     OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2159     OMPT_LOOP_END;
2160     KMP_STATS_LOOP_END;
2161     return status;
2162   } else {
2163     kmp_int32 last = 0;
2164     dispatch_shared_info_template<T> volatile *sh;
2165
2166     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2167                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2168
2169     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2170         th->th.th_dispatch->th_dispatch_pr_current);
2171     KMP_DEBUG_ASSERT(pr);
2172     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2173         th->th.th_dispatch->th_dispatch_sh_current);
2174     KMP_DEBUG_ASSERT(sh);
2175
2176 #if KMP_USE_HIER_SCHED
2177     if (pr->flags.use_hier)
2178       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2179     else
2180 #endif // KMP_USE_HIER_SCHED
2181       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2182                                                 p_st, th->th.th_team_nproc,
2183                                                 th->th.th_info.ds.ds_tid);
2184     // status == 0: no more iterations to execute
2185     if (status == 0) {
2186       ST num_done;
2187       num_done = test_then_inc<ST>(&sh->u.s.num_done);
2188 #ifdef KMP_DEBUG
2189       {
2190         char *buff;
2191         // create format specifiers before the debug output
2192         buff = __kmp_str_format(
2193             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2194             traits_t<ST>::spec);
2195         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2196         __kmp_str_free(&buff);
2197       }
2198 #endif
2199
2200 #if KMP_USE_HIER_SCHED
2201       pr->flags.use_hier = FALSE;
2202 #endif
2203       if (num_done == th->th.th_team_nproc - 1) {
2204 #if KMP_STATIC_STEAL_ENABLED
2205         if (pr->schedule == kmp_sch_static_steal) {
2206           int i;
2207           int idx = (th->th.th_dispatch->th_disp_index - 1) %
2208                     __kmp_dispatch_num_buffers; // current loop index
2209           // loop complete, safe to destroy locks used for stealing
2210           for (i = 0; i < th->th.th_team_nproc; ++i) {
2211             dispatch_private_info_template<T> *buf =
2212                 reinterpret_cast<dispatch_private_info_template<T> *>(
2213                     &team->t.t_dispatch[i].th_disp_buffer[idx]);
2214             KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2215             KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2216             if (traits_t<T>::type_size > 4) {
2217               // destroy locks used for stealing
2218               kmp_lock_t *lck = buf->u.p.steal_lock;
2219               KMP_ASSERT(lck != NULL);
2220               __kmp_destroy_lock(lck);
2221               __kmp_free(lck);
2222               buf->u.p.steal_lock = NULL;
2223             }
2224           }
2225         }
2226 #endif
2227         /* NOTE: release shared buffer to be reused */
2228
2229         KMP_MB(); /* Flush all pending memory write invalidates.  */
2230
2231         sh->u.s.num_done = 0;
2232         sh->u.s.iteration = 0;
2233
2234         /* TODO replace with general release procedure? */
2235         if (pr->flags.ordered) {
2236           sh->u.s.ordered_iteration = 0;
2237         }
2238
2239         sh->buffer_index += __kmp_dispatch_num_buffers;
2240         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2241                        gtid, sh->buffer_index));
2242
2243         KMP_MB(); /* Flush all pending memory write invalidates.  */
2244
2245       } // if
2246       if (__kmp_env_consistency_check) {
2247         if (pr->pushed_ws != ct_none) {
2248           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2249         }
2250       }
2251
2252       th->th.th_dispatch->th_deo_fcn = NULL;
2253       th->th.th_dispatch->th_dxo_fcn = NULL;
2254       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2255       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2256     } // if (status == 0)
2257 #if KMP_OS_WINDOWS
2258     else if (last) {
2259       pr->u.p.last_upper = pr->u.p.ub;
2260     }
2261 #endif /* KMP_OS_WINDOWS */
2262     if (p_last != NULL && status != 0)
2263       *p_last = last;
2264   } // if
2265
2266 #ifdef KMP_DEBUG
2267   {
2268     char *buff;
2269     // create format specifiers before the debug output
2270     buff = __kmp_str_format(
2271         "__kmp_dispatch_next: T#%%d normal case: "
2272         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2273         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2274     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2275                   (p_last ? *p_last : 0), status));
2276     __kmp_str_free(&buff);
2277   }
2278 #endif
2279 #if INCLUDE_SSC_MARKS
2280   SSC_MARK_DISPATCH_NEXT();
2281 #endif
2282   OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2283   OMPT_LOOP_END;
2284   KMP_STATS_LOOP_END;
2285   return status;
2286 }
2287
2288 /*!
2289 @ingroup WORK_SHARING
2290 @param loc  source location information
2291 @param global_tid  global thread number
2292 @return Zero if the parallel region is not active and this thread should execute
2293 all sections, non-zero otherwise.
2294
2295 Beginning of sections construct.
2296 There are no implicit barriers in the "sections" calls, rather the compiler
2297 should introduce an explicit barrier if it is required.
2298
2299 This implementation is based on __kmp_dispatch_init, using same constructs for
2300 shared data (we can't have sections nested directly in omp for loop, there
2301 should be a parallel region in between)
2302 */
2303 kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {
2304
2305   int active;
2306   kmp_info_t *th;
2307   kmp_team_t *team;
2308   kmp_uint32 my_buffer_index;
2309   dispatch_shared_info_template<kmp_int32> volatile *sh;
2310
2311   KMP_DEBUG_ASSERT(__kmp_init_serial);
2312
2313   if (!TCR_4(__kmp_init_parallel))
2314     __kmp_parallel_initialize();
2315   __kmp_resume_if_soft_paused();
2316
2317   /* setup data */
2318   th = __kmp_threads[gtid];
2319   team = th->th.th_team;
2320   active = !team->t.t_serialized;
2321   th->th.th_ident = loc;
2322
2323   KMP_COUNT_BLOCK(OMP_SECTIONS);
2324   KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));
2325
2326   if (active) {
2327     // Setup sections in the same way as dynamic scheduled loops.
2328     // We need one shared data: which section is to execute next.
2329     // (in case parallel is not active, all sections will be executed on the
2330     // same thread)
2331     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2332                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2333
2334     my_buffer_index = th->th.th_dispatch->th_disp_index++;
2335
2336     // reuse shared data structures from dynamic sched loops:
2337     sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2338         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
2339     KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
2340                   my_buffer_index));
2341
2342     th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
2343     th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
2344
2345     KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
2346                    "sh->buffer_index:%d\n",
2347                    gtid, my_buffer_index, sh->buffer_index));
2348     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
2349                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
2350     // Note: KMP_WAIT() cannot be used there: buffer index and
2351     // my_buffer_index are *always* 32-bit integers.
2352     KMP_MB();
2353     KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
2354                    "sh->buffer_index:%d\n",
2355                    gtid, my_buffer_index, sh->buffer_index));
2356
2357     th->th.th_dispatch->th_dispatch_pr_current =
2358         nullptr; // sections construct doesn't need private data
2359     th->th.th_dispatch->th_dispatch_sh_current =
2360         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
2361   }
2362
2363 #if OMPT_SUPPORT && OMPT_OPTIONAL
2364   if (ompt_enabled.ompt_callback_work) {
2365     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2366     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2367     ompt_callbacks.ompt_callback(ompt_callback_work)(
2368         ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
2369         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2370   }
2371 #endif
2372   KMP_PUSH_PARTITIONED_TIMER(OMP_sections);
2373
2374   return active;
2375 }
2376
2377 /*!
2378 @ingroup WORK_SHARING
2379 @param loc  source location information
2380 @param global_tid  global thread number
2381 @param numberOfSections  number of sections in the 'sections' construct
2382 @return unsigned [from 0 to n) - number (id) of the section to execute next on
2383 this thread. n (or any other number not in range) - nothing to execute on this
2384 thread
2385 */
2386
2387 kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
2388                               kmp_int32 numberOfSections) {
2389
2390   KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);
2391
2392   kmp_info_t *th = __kmp_threads[gtid];
2393 #ifdef KMP_DEBUG
2394   kmp_team_t *team = th->th.th_team;
2395 #endif
2396
2397   KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
2398                   numberOfSections));
2399
2400   // For serialized case we should not call this function:
2401   KMP_DEBUG_ASSERT(!team->t.t_serialized);
2402
2403   dispatch_shared_info_template<kmp_int32> volatile *sh;
2404
2405   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2406                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2407
2408   KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
2409   sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2410       th->th.th_dispatch->th_dispatch_sh_current);
2411   KMP_DEBUG_ASSERT(sh);
2412
2413   kmp_int32 sectionIndex = 0;
2414   bool moreSectionsToExecute = true;
2415
2416   // Find section to execute:
2417   sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
2418   if (sectionIndex >= numberOfSections) {
2419     moreSectionsToExecute = false;
2420   }
2421
2422   // status == 0: no more sections to execute;
2423   // OMPTODO: __kmpc_end_sections could be bypassed?
2424   if (!moreSectionsToExecute) {
2425     kmp_int32 num_done;
2426
2427     num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));
2428
2429     if (num_done == th->th.th_team_nproc - 1) {
2430       /* NOTE: release this buffer to be reused */
2431
2432       KMP_MB(); /* Flush all pending memory write invalidates.  */
2433
2434       sh->u.s.num_done = 0;
2435       sh->u.s.iteration = 0;
2436
2437       KMP_MB(); /* Flush all pending memory write invalidates.  */
2438
2439       sh->buffer_index += __kmp_dispatch_num_buffers;
2440       KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
2441                      sh->buffer_index));
2442
2443       KMP_MB(); /* Flush all pending memory write invalidates.  */
2444
2445     } // if
2446
2447     th->th.th_dispatch->th_deo_fcn = NULL;
2448     th->th.th_dispatch->th_dxo_fcn = NULL;
2449     th->th.th_dispatch->th_dispatch_sh_current = NULL;
2450     th->th.th_dispatch->th_dispatch_pr_current = NULL;
2451
2452 #if OMPT_SUPPORT && OMPT_OPTIONAL
2453     if (ompt_enabled.ompt_callback_dispatch) {
2454       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2455       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2456       ompt_data_t instance = ompt_data_none;
2457       instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
2458       ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
2459           &(team_info->parallel_data), &(task_info->task_data),
2460           ompt_dispatch_section, instance);
2461     }
2462 #endif
2463   }
2464
2465   return sectionIndex;
2466 }
2467
2468 /*!
2469 @ingroup WORK_SHARING
2470 @param loc  source location information
2471 @param global_tid  global thread number
2472
2473 End of "sections" construct.
2474 Don't need to wait here: barrier is added separately when needed.
2475 */
2476 void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {
2477
2478   kmp_info_t *th = __kmp_threads[gtid];
2479   int active = !th->th.th_team->t.t_serialized;
2480
2481   KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));
2482
2483   if (!active) {
2484     // In active case call finalization is done in __kmpc_next_section
2485 #if OMPT_SUPPORT && OMPT_OPTIONAL
2486     if (ompt_enabled.ompt_callback_work) {
2487       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2488       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2489       ompt_callbacks.ompt_callback(ompt_callback_work)(
2490           ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
2491           &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2492     }
2493 #endif
2494   }
2495
2496   KMP_POP_PARTITIONED_TIMER();
2497   KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
2498 }
2499
2500 template <typename T>
2501 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2502                                   kmp_int32 *plastiter, T *plower, T *pupper,
2503                                   typename traits_t<T>::signed_t incr) {
2504   typedef typename traits_t<T>::unsigned_t UT;
2505   kmp_uint32 team_id;
2506   kmp_uint32 nteams;
2507   UT trip_count;
2508   kmp_team_t *team;
2509   kmp_info_t *th;
2510
2511   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2512   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2513 #ifdef KMP_DEBUG
2514   typedef typename traits_t<T>::signed_t ST;
2515   {
2516     char *buff;
2517     // create format specifiers before the debug output
2518     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2519                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2520                             traits_t<T>::spec, traits_t<T>::spec,
2521                             traits_t<ST>::spec, traits_t<T>::spec);
2522     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2523     __kmp_str_free(&buff);
2524   }
2525 #endif
2526
2527   if (__kmp_env_consistency_check) {
2528     if (incr == 0) {
2529       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2530                             loc);
2531     }
2532     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2533       // The loop is illegal.
2534       // Some zero-trip loops maintained by compiler, e.g.:
2535       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2536       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2537       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2538       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2539       // Compiler does not check the following illegal loops:
2540       //   for(i=0;i<10;i+=incr) // where incr<0
2541       //   for(i=10;i>0;i-=incr) // where incr<0
2542       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2543     }
2544   }
2545   __kmp_assert_valid_gtid(gtid);
2546   th = __kmp_threads[gtid];
2547   team = th->th.th_team;
2548   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2549   nteams = th->th.th_teams_size.nteams;
2550   team_id = team->t.t_master_tid;
2551   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2552
2553   // compute global trip count
2554   if (incr == 1) {
2555     trip_count = *pupper - *plower + 1;
2556   } else if (incr == -1) {
2557     trip_count = *plower - *pupper + 1;
2558   } else if (incr > 0) {
2559     // upper-lower can exceed the limit of signed type
2560     trip_count = (UT)(*pupper - *plower) / incr + 1;
2561   } else {
2562     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2563   }
2564
2565   if (trip_count <= nteams) {
2566     KMP_DEBUG_ASSERT(
2567         __kmp_static == kmp_sch_static_greedy ||
2568         __kmp_static ==
2569             kmp_sch_static_balanced); // Unknown static scheduling type.
2570     // only some teams get single iteration, others get nothing
2571     if (team_id < trip_count) {
2572       *pupper = *plower = *plower + team_id * incr;
2573     } else {
2574       *plower = *pupper + incr; // zero-trip loop
2575     }
2576     if (plastiter != NULL)
2577       *plastiter = (team_id == trip_count - 1);
2578   } else {
2579     if (__kmp_static == kmp_sch_static_balanced) {
2580       UT chunk = trip_count / nteams;
2581       UT extras = trip_count % nteams;
2582       *plower +=
2583           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2584       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2585       if (plastiter != NULL)
2586         *plastiter = (team_id == nteams - 1);
2587     } else {
2588       T chunk_inc_count =
2589           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2590       T upper = *pupper;
2591       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2592       // Unknown static scheduling type.
2593       *plower += team_id * chunk_inc_count;
2594       *pupper = *plower + chunk_inc_count - incr;
2595       // Check/correct bounds if needed
2596       if (incr > 0) {
2597         if (*pupper < *plower)
2598           *pupper = traits_t<T>::max_value;
2599         if (plastiter != NULL)
2600           *plastiter = *plower <= upper && *pupper > upper - incr;
2601         if (*pupper > upper)
2602           *pupper = upper; // tracker C73258
2603       } else {
2604         if (*pupper > *plower)
2605           *pupper = traits_t<T>::min_value;
2606         if (plastiter != NULL)
2607           *plastiter = *plower >= upper && *pupper < upper - incr;
2608         if (*pupper < upper)
2609           *pupper = upper; // tracker C73258
2610       }
2611     }
2612   }
2613 }
2614
2615 //-----------------------------------------------------------------------------
2616 // Dispatch routines
2617 //    Transfer call to template< type T >
2618 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2619 //                         T lb, T ub, ST st, ST chunk )
2620 extern "C" {
2621
2622 /*!
2623 @ingroup WORK_SHARING
2624 @{
2625 @param loc Source location
2626 @param gtid Global thread id
2627 @param schedule Schedule type
2628 @param lb  Lower bound
2629 @param ub  Upper bound
2630 @param st  Step (or increment if you prefer)
2631 @param chunk The chunk size to block with
2632
2633 This function prepares the runtime to start a dynamically scheduled for loop,
2634 saving the loop arguments.
2635 These functions are all identical apart from the types of the arguments.
2636 */
2637
2638 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2639                             enum sched_type schedule, kmp_int32 lb,
2640                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2641   KMP_DEBUG_ASSERT(__kmp_init_serial);
2642 #if OMPT_SUPPORT && OMPT_OPTIONAL
2643   OMPT_STORE_RETURN_ADDRESS(gtid);
2644 #endif
2645   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2646 }
2647 /*!
2648 See @ref __kmpc_dispatch_init_4
2649 */
2650 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2651                              enum sched_type schedule, kmp_uint32 lb,
2652                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2653   KMP_DEBUG_ASSERT(__kmp_init_serial);
2654 #if OMPT_SUPPORT && OMPT_OPTIONAL
2655   OMPT_STORE_RETURN_ADDRESS(gtid);
2656 #endif
2657   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2658 }
2659
2660 /*!
2661 See @ref __kmpc_dispatch_init_4
2662 */
2663 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2664                             enum sched_type schedule, kmp_int64 lb,
2665                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2666   KMP_DEBUG_ASSERT(__kmp_init_serial);
2667 #if OMPT_SUPPORT && OMPT_OPTIONAL
2668   OMPT_STORE_RETURN_ADDRESS(gtid);
2669 #endif
2670   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2671 }
2672
2673 /*!
2674 See @ref __kmpc_dispatch_init_4
2675 */
2676 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2677                              enum sched_type schedule, kmp_uint64 lb,
2678                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2679   KMP_DEBUG_ASSERT(__kmp_init_serial);
2680 #if OMPT_SUPPORT && OMPT_OPTIONAL
2681   OMPT_STORE_RETURN_ADDRESS(gtid);
2682 #endif
2683   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2684 }
2685
2686 /*!
2687 See @ref __kmpc_dispatch_init_4
2688
2689 Difference from __kmpc_dispatch_init set of functions is these functions
2690 are called for composite distribute parallel for construct. Thus before
2691 regular iterations dispatching we need to calc per-team iteration space.
2692
2693 These functions are all identical apart from the types of the arguments.
2694 */
2695 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2696                                  enum sched_type schedule, kmp_int32 *p_last,
2697                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2698                                  kmp_int32 chunk) {
2699   KMP_DEBUG_ASSERT(__kmp_init_serial);
2700 #if OMPT_SUPPORT && OMPT_OPTIONAL
2701   OMPT_STORE_RETURN_ADDRESS(gtid);
2702 #endif
2703   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2704   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2705 }
2706
2707 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2708                                   enum sched_type schedule, kmp_int32 *p_last,
2709                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2710                                   kmp_int32 chunk) {
2711   KMP_DEBUG_ASSERT(__kmp_init_serial);
2712 #if OMPT_SUPPORT && OMPT_OPTIONAL
2713   OMPT_STORE_RETURN_ADDRESS(gtid);
2714 #endif
2715   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2716   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2717 }
2718
2719 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2720                                  enum sched_type schedule, kmp_int32 *p_last,
2721                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2722                                  kmp_int64 chunk) {
2723   KMP_DEBUG_ASSERT(__kmp_init_serial);
2724 #if OMPT_SUPPORT && OMPT_OPTIONAL
2725   OMPT_STORE_RETURN_ADDRESS(gtid);
2726 #endif
2727   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2728   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2729 }
2730
2731 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2732                                   enum sched_type schedule, kmp_int32 *p_last,
2733                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2734                                   kmp_int64 chunk) {
2735   KMP_DEBUG_ASSERT(__kmp_init_serial);
2736 #if OMPT_SUPPORT && OMPT_OPTIONAL
2737   OMPT_STORE_RETURN_ADDRESS(gtid);
2738 #endif
2739   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2740   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2741 }
2742
2743 /*!
2744 @param loc Source code location
2745 @param gtid Global thread id
2746 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2747 otherwise
2748 @param p_lb   Pointer to the lower bound for the next chunk of work
2749 @param p_ub   Pointer to the upper bound for the next chunk of work
2750 @param p_st   Pointer to the stride for the next chunk of work
2751 @return one if there is work to be done, zero otherwise
2752
2753 Get the next dynamically allocated chunk of work for this thread.
2754 If there is no more work, then the lb,ub and stride need not be modified.
2755 */
2756 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2757                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2758 #if OMPT_SUPPORT && OMPT_OPTIONAL
2759   OMPT_STORE_RETURN_ADDRESS(gtid);
2760 #endif
2761   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2762 #if OMPT_SUPPORT && OMPT_OPTIONAL
2763                                         ,
2764                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2765 #endif
2766   );
2767 }
2768
2769 /*!
2770 See @ref __kmpc_dispatch_next_4
2771 */
2772 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2773                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2774                             kmp_int32 *p_st) {
2775 #if OMPT_SUPPORT && OMPT_OPTIONAL
2776   OMPT_STORE_RETURN_ADDRESS(gtid);
2777 #endif
2778   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2779 #if OMPT_SUPPORT && OMPT_OPTIONAL
2780                                          ,
2781                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2782 #endif
2783   );
2784 }
2785
2786 /*!
2787 See @ref __kmpc_dispatch_next_4
2788 */
2789 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2790                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2791 #if OMPT_SUPPORT && OMPT_OPTIONAL
2792   OMPT_STORE_RETURN_ADDRESS(gtid);
2793 #endif
2794   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2795 #if OMPT_SUPPORT && OMPT_OPTIONAL
2796                                         ,
2797                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2798 #endif
2799   );
2800 }
2801
2802 /*!
2803 See @ref __kmpc_dispatch_next_4
2804 */
2805 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2806                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2807                             kmp_int64 *p_st) {
2808 #if OMPT_SUPPORT && OMPT_OPTIONAL
2809   OMPT_STORE_RETURN_ADDRESS(gtid);
2810 #endif
2811   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2812 #if OMPT_SUPPORT && OMPT_OPTIONAL
2813                                          ,
2814                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2815 #endif
2816   );
2817 }
2818
2819 /*!
2820 @param loc Source code location
2821 @param gtid Global thread id
2822
2823 Mark the end of a dynamic loop.
2824 */
2825 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2826   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2827 }
2828
2829 /*!
2830 See @ref __kmpc_dispatch_fini_4
2831 */
2832 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2833   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2834 }
2835
2836 /*!
2837 See @ref __kmpc_dispatch_fini_4
2838 */
2839 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2840   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2841 }
2842
2843 /*!
2844 See @ref __kmpc_dispatch_fini_4
2845 */
2846 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2847   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2848 }
2849 /*! @} */
2850
2851 //-----------------------------------------------------------------------------
2852 // Non-template routines from kmp_dispatch.cpp used in other sources
2853
2854 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2855   return value == checker;
2856 }
2857
2858 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2859   return value != checker;
2860 }
2861
2862 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2863   return value < checker;
2864 }
2865
2866 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2867   return value >= checker;
2868 }
2869
2870 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2871   return value <= checker;
2872 }
2873
2874 kmp_uint32
2875 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2876              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2877              void *obj // Higher-level synchronization object, or NULL.
2878 ) {
2879   // note: we may not belong to a team at this point
2880   volatile kmp_uint32 *spin = spinner;
2881   kmp_uint32 check = checker;
2882   kmp_uint32 spins;
2883   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2884   kmp_uint32 r;
2885   kmp_uint64 time;
2886
2887   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2888   KMP_INIT_YIELD(spins);
2889   KMP_INIT_BACKOFF(time);
2890   // main wait spin loop
2891   while (!f(r = TCR_4(*spin), check)) {
2892     KMP_FSYNC_SPIN_PREPARE(obj);
2893     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2894        split. It causes problems with infinite recursion because of exit lock */
2895     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2896         __kmp_abort_thread(); */
2897     KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
2898   }
2899   KMP_FSYNC_SPIN_ACQUIRED(obj);
2900   return r;
2901 }
2902
2903 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2904                       kmp_uint32 (*pred)(void *, kmp_uint32),
2905                       void *obj // Higher-level synchronization object, or NULL.
2906 ) {
2907   // note: we may not belong to a team at this point
2908   void *spin = spinner;
2909   kmp_uint32 check = checker;
2910   kmp_uint32 spins;
2911   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2912   kmp_uint64 time;
2913
2914   KMP_FSYNC_SPIN_INIT(obj, spin);
2915   KMP_INIT_YIELD(spins);
2916   KMP_INIT_BACKOFF(time);
2917   // main wait spin loop
2918   while (!f(spin, check)) {
2919     KMP_FSYNC_SPIN_PREPARE(obj);
2920     /* if we have waited a bit, or are noversubscribed, yield */
2921     /* pause is in the following code */
2922     KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
2923   }
2924   KMP_FSYNC_SPIN_ACQUIRED(obj);
2925 }
2926
2927 } // extern "C"
2928
2929 #ifdef KMP_GOMP_COMPAT
2930
2931 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2932                                enum sched_type schedule, kmp_int32 lb,
2933                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2934                                int push_ws) {
2935   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2936                                  push_ws);
2937 }
2938
2939 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2940                                 enum sched_type schedule, kmp_uint32 lb,
2941                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2942                                 int push_ws) {
2943   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2944                                   push_ws);
2945 }
2946
2947 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2948                                enum sched_type schedule, kmp_int64 lb,
2949                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2950                                int push_ws) {
2951   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2952                                  push_ws);
2953 }
2954
2955 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2956                                 enum sched_type schedule, kmp_uint64 lb,
2957                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2958                                 int push_ws) {
2959   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2960                                   push_ws);
2961 }
2962
2963 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2964   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2965 }
2966
2967 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2968   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2969 }
2970
2971 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2972   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2973 }
2974
2975 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2976   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2977 }
2978
2979 #endif /* KMP_GOMP_COMPAT */
2980
2981 /* ------------------------------------------------------------------------ */