/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions. __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}
// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type
  int monotonicity;
  // default to monotonic
  monotonicity = SCHEDULE_MONOTONIC;
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;
  return monotonicity;
}
// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule,chunk. The loop description is found in lb (lower bound),
// ub (upper bound), and st (stride). nproc is the number of threads relevant
// to the scheduling (often the number of threads in a team, but not always if
// hierarchical scheduling is used). tid is the id of the thread calling
// the function within the group of nproc threads. It will have a value
// between 0 and nproc - 1. This is often just the thread id within a team, but
// is not necessarily the case when using hierarchical scheduling.
// loc is the source file location of the corresponding loop
// gtid is the global thread id
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  int monotonicity;
  bool use_hier;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif

#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
#else
  use_hier = false;
#endif
  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }
  // Ordered overrides nonmonotonic
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
  }
  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#if KMP_STATIC_STEAL_ENABLED
    // map nonmonotonic:dynamic to static steal
    if (schedule == kmp_sch_dynamic_chunked) {
      if (monotonicity == SCHEDULE_NONMONOTONIC)
        schedule = kmp_sch_static_steal;
    }
#endif
    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
            " chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");
  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }
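  // For example, the loop "for (i = 4; i <= 13; i += 3)" arrives here with
  // lb=4, ub=13, st=3, giving tc = (13 - 4) / 3 + 1 = 4 iterations
  // (i = 4, 7, 10, 13).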
#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */
  /* NOTE: only the active parallel region(s) has active ordered sections */

  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }
  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
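      // Example: tc=100 iterations with chunk=3 gives ntc=34 chunks; with
      // nproc=4, small_chunk=8 and extras=2, so threads 0..3 initially own
      // the chunk ranges [0,9), [9,18), [18,26), [26,34) -- the first
      // 'extras' threads each get one extra chunk.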
      pr->u.p.parm2 = lb;
      // parm3 is the number of times to attempt stealing which is
      // proportional to the number of chunks per thread up until
      // the maximum value of nproc.
      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      pr->u.p.st = st;
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-thread lock,
        // free memory in __kmp_dispatch_next when status==0.
        KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
        pr->u.p.th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.th_steal_lock);
      }
      break;
    } else {
      /* too few chunks: switching to kmp_sch_dynamic_chunked */
      schedule = kmp_sch_dynamic_chunked;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
                     "kmp_sch_dynamic_chunked\n",
                     gtid));
      if (pr->u.p.parm1 <= 0)
        pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
      break;
    } // if
  } // case
#endif
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
      // it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to multiple of simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
  case kmp_sch_guided_simd:
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / nproc; // may occupy parm3 and parm4
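        // With the default K = guided_int_param = 2 (and guided_flt_param its
        // reciprocal fraction), each grab in __kmp_dispatch_next_algorithm
        // takes roughly remaining/(K*nproc) iterations, degrading to a plain
        // dynamic schedule once fewer than K*nproc*(chunk+1) iterations
        // remain.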
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_USE_X87CONTROL
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
           instead of the default 53-bit. Even though long double doesn't work
           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
           expected to impact the correctness of the algorithm, but this has not
           been mathematically proven. */
        // save original FPCW and set precision to 64-bit, as
        // Windows* OS on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = (long double)1.0 - (long double)0.5 / nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
          // long)natural_alignment );
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);
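        // At this point cross is the smallest chunk index i with
        // x^i <= target, i.e. the first chunk whose guided (exponentially
        // shrinking) size would drop to the requested chunk or below; chunks
        // from index cross onward are dealt out dynamically with fixed size.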
        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

// C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle.  Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;
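    /* Illustrative example (assumed values, not from the source): tc = 48,
       nproc = 2, chunk = 2 gives F = parm2 = 12, L = parm1 = 2,
       N = parm3 = (96 + 13) / 14 = 7 cycles, and sigma = parm4 = 10 / 6 = 1,
       so successive chunks shrink as 12, 11, 10, 9, 8, 7, 6. */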
    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //    parm4 = 0;
    // }

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;
  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  pr->schedule = schedule;
}
#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // type does not matter here so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif
// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));
  __kmp_assert_valid_gtid(gtid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing
  // when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  } else {
    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  }

#if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable.
  // Hierarchical scheduling does not work with ordered, so if ordered is
  // detected, then revert back to threaded scheduling.
  bool ordered;
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
    my_sched =
        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
  ordered = (kmp_ord_lower & my_sched);
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
    }
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use hierarchical for ordered parallel loops and don't
    // use the runtime hierarchy if one was specified in the program
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    /* What happens when number of threads changes, need to resize buffer? */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
  }

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
#if USE_ITT_BUILD
                                &cur_chunk,
#endif
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (active) {
    if (pr->flags.ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
  }
  if (active) {
    /* The name of this buffer should be my_buffer_index when it's free to use
     * it */

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used there: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB(); /* is this necessary? */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
      case kmp_sch_guided_simd:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 values were the same, a bad case could still exist, such as
  // reusing 0 and 1 rather than a program life-time increment, so a dedicated
  // variable is required. The 'static_steal_counter' is used.
  if (pr->schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // since then.
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}
/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}
#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // int cid;
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to "
           "zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
//!!!!! TODO check if the inc should be unsigned, or signed???
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    //  }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */
template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  kmp_int32 last = 0;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif

  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }
  switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use lock for 8-byte and CAS for 4-byte induction
      // variable. TODO (optional): check and use 16-byte CAS
      kmp_lock_t *lck = pr->u.p.th_steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = pr->u.p.parm3;
        int while_index = 0;
        T id = pr->u.p.static_steal_counter; // loop id
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        // TODO: algorithm of searching for a victim
        // should be cleaned up and measured
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *victim;
          T remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(victim);
          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(victim);
          }
          if (victim == pr || id != victim->u.p.static_steal_counter) {
            continue; // try once more (nproc attempts in total)
            // no victim is ready yet to participate in stealing
            // because no victim passed kmp_init_dispatch yet
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, goto next victim
          }

          lck = victim->u.p.th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
            continue; // not enough chunks to steal
          }
          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
          // by 1
          if (remaining > 3) {
            // steal 1/4 of remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            // steal 1 chunk of 2 or 3 remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember victim to steal from
          status = 1;
          while_index = 0;
          // now update own count and ub with stolen range but init chunk
          __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
      // All operations on 'count' or 'ub' must be combined atomically
      // together.
      {
        union_i4 vold, vnew;
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew = vold;
        vnew.p.count++;
        while (!KMP_COMPARE_AND_STORE_ACQ64(
            (volatile kmp_int64 *)&pr->u.p.count,
            *VOLATILE_CAST(kmp_int64 *) & vold.b,
            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
          KMP_CPU_PAUSE();
          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
          vnew = vold;
          vnew.p.count++;
        }
        vnew = vold;
        init = vnew.p.count;
        status = (init < (UT)vnew.p.ub);
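        // Packing count and ub into one 64-bit word means a single CAS
        // detects interference from either side: a concurrent thief shrinks
        // ub with the same 8-byte CAS, so this increment of count fails and
        // retries if either field changed in the meantime.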
      }

      if (!status) {
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = pr->u.p.parm3;
        int while_index = 0;
        T id = pr->u.p.static_steal_counter; // loop id
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        // TODO: algorithm of searching for a victim
        // should be cleaned up and measured
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *victim;
          union_i4 vold, vnew;
          kmp_int32 remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(victim);
          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(victim);
          }
          if (victim == pr || id != victim->u.p.static_steal_counter) {
            continue; // try once more (nproc attempts in total)
            // no victim is ready yet to participate in stealing
            // because no victim passed kmp_init_dispatch yet
          }
          pr->u.p.parm4 = victimIdx; // new victim found
          while (1) { // CAS loop if victim has enough chunks to steal
            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
            vnew = vold;

            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (vnew.p.count >= (UT)vnew.p.ub ||
                (remaining = vnew.p.ub - vnew.p.count) < 2) {
              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
              break; // not enough chunks to steal, goto next victim
            }
            if (remaining > 3) {
              vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
            }
            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            // TODO: Should this be acquire or release?
            if (KMP_COMPARE_AND_STORE_ACQ64(
                    (volatile kmp_int64 *)&victim->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              while_index = 0;
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              break;
            } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.parm2;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
    break;
  } // case
#endif // ( KMP_STATIC_STEAL_ENABLED )
  case kmp_sch_static_balanced: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iteration to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = pr->u.p.parm1;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));

    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;

    if ((status = (init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = chunk + init - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining <
          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      limit = init +
              (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
  case kmp_sch_guided_simd: {
    // same as iterative but curr-chunk adjusted to be multiple of given
    // chunk
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        status = 0; // nothing to do, don't try atomic op
        break;
      }
      KMP_DEBUG_ASSERT(init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunk) {
            limit = init + chunk - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      // divide by K*nproc
      UT span = remaining * (*(double *)&pr->u.p.parm3);
      UT rem = span % chunk;
      if (rem) // adjust so that span%chunk == 0
        span += chunk - rem;
      limit = init + span;
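      // e.g. with chunk (the simd width) 8 and a computed span of 21, rem is
      // 5, so span is rounded up to 24; every chunk boundary thus stays
      // aligned to a multiple of the simd width.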
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_USE_X87CONTROL
    /* for storing original FPCW value for Windows* OS on
       IA-32 architecture 8-byte version */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

    while (1) { /* this while loop is a safeguard against unexpected zero
                   chunk sizes */
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        --trip;
        /* use dynamic-style scheduling */
        init = chunkIdx * chunkspec + pr->u.p.count;
        /* need to verify init > 0 in case of overflow in the above
         * calculation */
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;
        }
        break;
      } else {
        /* use exponential-style scheduling */
        /* The following check is to workaround the lack of long double
           precision on Windows* OS.
           This check works around the possible effect that init != 0 for
           chunkIdx == 0. */
#if KMP_USE_X87CONTROL
        /* If we haven't already done so, save original
           FPCW and set precision to 64-bit, as Windows* OS
           on IA-32 architecture defaults to 53-bit */
        if (!fpcwSet) {
          oldFpcw = _control87(0, 0);
          _control87(_PC_64, _MCW_PC);
          fpcwSet = 0x30000;
        }
#endif
        if (chunkIdx) {
          init = __kmp_dispatch_guided_remaining<T>(
              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
          KMP_DEBUG_ASSERT(init);
          init = trip - init;
        } else
          init = 0;
        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);
        if (init < limit) {
          KMP_DEBUG_ASSERT(limit <= trip);
          --limit;
          status = 1;
          break;
        } // if
      } // if
    } // while (1)
#if KMP_USE_X87CONTROL
    /* restore FPCW if necessary
       AC: check fpcwSet flag first because oldFpcw can be uninitialized here
    */
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    }
  } // case
  break;
  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
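    // init is the total size of the first 'index' chunks: chunk k has size
    // parm2 - k*parm4, so the partial sum is the arithmetic series
    // index*(2*parm2 - (index-1)*parm4)/2.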
1768 trip = pr->u.p.tc - 1;
1770 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1771 *p_lb = 0;
1772 *p_ub = 0;
1773 if (p_st != NULL)
1774 *p_st = 0;
1775 } else {
1776 start = pr->u.p.lb;
1777 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1778 incr = pr->u.p.st;
1780 if ((last = (limit >= trip)) != 0)
1781 limit = trip;
1783 if (p_st != NULL)
1784 *p_st = incr;
1786 if (incr == 1) {
1787 *p_lb = start + init;
1788 *p_ub = start + limit;
1789 } else {
1790 *p_lb = start + init * incr;
1791 *p_ub = start + limit * incr;
1792 }
1794 if (pr->flags.ordered) {
1795 pr->u.p.ordered_lower = init;
1796 pr->u.p.ordered_upper = limit;
1797 } // if
1798 } // if
1799 } // case
1800 break;
1801 default: {
1802 status = 0; // to avoid complaints on uninitialized variable use
1803 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1804 KMP_HNT(GetNewerLibrary), // Hint
1805 __kmp_msg_null // Variadic argument list terminator
1806 );
1807 } break;
1808 } // switch
1809 if (p_last)
1810 *p_last = last;
1811 #ifdef KMP_DEBUG
1812 if (pr->flags.ordered) {
1813 char *buff;
1814 // create format specifiers before the debug output
1815 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1816 "ordered_lower:%%%s ordered_upper:%%%s\n",
1817 traits_t<UT>::spec, traits_t<UT>::spec);
1818 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1819 __kmp_str_free(&buff);
1820 }
1821 {
1822 char *buff;
1823 // create format specifiers before the debug output
1824 buff = __kmp_str_format(
1825 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1826 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1827 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1828 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1829 __kmp_str_free(&buff);
1830 }
1831 #endif
1832 return status;
1833 }
1835 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1836 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1837 is not called. */
1838 #if OMPT_SUPPORT && OMPT_OPTIONAL
1839 #define OMPT_LOOP_END \
1840 if (status == 0) { \
1841 if (ompt_enabled.ompt_callback_work) { \
1842 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1843 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1844 ompt_callbacks.ompt_callback(ompt_callback_work)( \
1845 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1846 &(task_info->task_data), 0, codeptr); \
1847 } \
1848 }
1849 // TODO: implement count
1850 #else
1851 #define OMPT_LOOP_END // no-op
1852 #endif
1854 #if KMP_STATS_ENABLED
1855 #define KMP_STATS_LOOP_END \
1856 { \
1857 kmp_int64 u, l, t, i; \
1858 l = (kmp_int64)(*p_lb); \
1859 u = (kmp_int64)(*p_ub); \
1860 i = (kmp_int64)(pr->u.p.st); \
1861 if (status == 0) { \
1862 t = 0; \
1863 KMP_POP_PARTITIONED_TIMER(); \
1864 } else if (i == 1) { \
1865 if (u >= l) \
1866 t = u - l + 1; \
1867 else \
1868 t = 0; \
1869 } else if (i < 0) { \
1870 if (l >= u) \
1871 t = (l - u) / (-i) + 1; \
1872 else \
1873 t = 0; \
1874 } else { \
1875 if (u >= l) \
1876 t = (u - l) / i + 1; \
1877 else \
1878 t = 0; \
1879 } \
1880 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1881 }
1882 #else
1883 #define KMP_STATS_LOOP_END /* Nothing */
1884 #endif
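// The trip-count arithmetic in KMP_STATS_LOOP_END is the usual closed form.
// For example (hypothetical bounds): l = 0, u = 9, i = 2 gives
// t = (9 - 0) / 2 + 1 = 5 iterations, while l = 9, u = 0, i = -3 gives
// t = (9 - 0) / 3 + 1 = 4 iterations (9, 6, 3, 0).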
1886 template <typename T>
1887 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1888 T *p_lb, T *p_ub,
1889 typename traits_t<T>::signed_t *p_st
1890 #if OMPT_SUPPORT && OMPT_OPTIONAL
1891 ,
1892 void *codeptr
1893 #endif
1894 ) {
1896 typedef typename traits_t<T>::unsigned_t UT;
1897 typedef typename traits_t<T>::signed_t ST;
1898 // This is potentially slightly misleading: schedule(runtime) will appear here
1899 // even if the actual runtime schedule is static. (Which points out a
1900 // disadvantage of schedule(runtime): even when static scheduling is used it
1901 // costs more than a compile-time choice to use static scheduling would.)
1902 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1904 int status;
1905 dispatch_private_info_template<T> *pr;
1906 __kmp_assert_valid_gtid(gtid);
1907 kmp_info_t *th = __kmp_threads[gtid];
1908 kmp_team_t *team = th->th.th_team;
1910 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1911 KD_TRACE(
1912 1000,
1913 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1914 gtid, p_lb, p_ub, p_st, p_last));
1916 if (team->t.t_serialized) {
1917 /* NOTE: serialize this dispatch because we are not at the active level */
1918 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1919 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1920 KMP_DEBUG_ASSERT(pr);
1922 if ((status = (pr->u.p.tc != 0)) == 0) {
1923 *p_lb = 0;
1924 *p_ub = 0;
1925 // if ( p_last != NULL )
1926 // *p_last = 0;
1927 if (p_st != NULL)
1928 *p_st = 0;
1929 if (__kmp_env_consistency_check) {
1930 if (pr->pushed_ws != ct_none) {
1931 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1932 }
1933 }
1934 } else if (pr->flags.nomerge) {
1935 kmp_int32 last;
1936 T start;
1937 UT limit, trip, init;
1938 ST incr;
1939 T chunk = pr->u.p.parm1;
1941 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1942 gtid));
1944 init = chunk * pr->u.p.count++;
1945 trip = pr->u.p.tc - 1;
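// E.g. (hypothetical value) chunk = 4: successive calls hand out chunks
// starting at iterations 0, 4, 8, ... until init exceeds trip.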
1947 if ((status = (init <= trip)) == 0) {
1948 *p_lb = 0;
1949 *p_ub = 0;
1950 // if ( p_last != NULL )
1951 // *p_last = 0;
1952 if (p_st != NULL)
1953 *p_st = 0;
1954 if (__kmp_env_consistency_check) {
1955 if (pr->pushed_ws != ct_none) {
1956 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1957 }
1958 }
1959 } else {
1960 start = pr->u.p.lb;
1961 limit = chunk + init - 1;
1962 incr = pr->u.p.st;
1964 if ((last = (limit >= trip)) != 0) {
1965 limit = trip;
1966 #if KMP_OS_WINDOWS
1967 pr->u.p.last_upper = pr->u.p.ub;
1968 #endif /* KMP_OS_WINDOWS */
1969 }
1970 if (p_last != NULL)
1971 *p_last = last;
1972 if (p_st != NULL)
1973 *p_st = incr;
1974 if (incr == 1) {
1975 *p_lb = start + init;
1976 *p_ub = start + limit;
1977 } else {
1978 *p_lb = start + init * incr;
1979 *p_ub = start + limit * incr;
1980 }
1982 if (pr->flags.ordered) {
1983 pr->u.p.ordered_lower = init;
1984 pr->u.p.ordered_upper = limit;
1985 #ifdef KMP_DEBUG
1986 {
1987 char *buff;
1988 // create format specifiers before the debug output
1989 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1990 "ordered_lower:%%%s ordered_upper:%%%s\n",
1991 traits_t<UT>::spec, traits_t<UT>::spec);
1992 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1993 pr->u.p.ordered_upper));
1994 __kmp_str_free(&buff);
1995 }
1996 #endif
1997 } // if
1998 } // if
1999 } else {
2000 pr->u.p.tc = 0;
2001 *p_lb = pr->u.p.lb;
2002 *p_ub = pr->u.p.ub;
2003 #if KMP_OS_WINDOWS
2004 pr->u.p.last_upper = *p_ub;
2005 #endif /* KMP_OS_WINDOWS */
2006 if (p_last != NULL)
2007 *p_last = TRUE;
2008 if (p_st != NULL)
2009 *p_st = pr->u.p.st;
2010 } // if
2011 #ifdef KMP_DEBUG
2012 {
2013 char *buff;
2014 // create format specifiers before the debug output
2015 buff = __kmp_str_format(
2016 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2017 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2018 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2019 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2020 (p_last ? *p_last : 0), status));
2021 __kmp_str_free(&buff);
2022 }
2023 #endif
2024 #if INCLUDE_SSC_MARKS
2025 SSC_MARK_DISPATCH_NEXT();
2026 #endif
2027 OMPT_LOOP_END;
2028 KMP_STATS_LOOP_END;
2029 return status;
2030 } else {
2031 kmp_int32 last = 0;
2032 dispatch_shared_info_template<T> volatile *sh;
2034 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2035 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2037 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2038 th->th.th_dispatch->th_dispatch_pr_current);
2039 KMP_DEBUG_ASSERT(pr);
2040 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2041 th->th.th_dispatch->th_dispatch_sh_current);
2042 KMP_DEBUG_ASSERT(sh);
2044 #if KMP_USE_HIER_SCHED
2045 if (pr->flags.use_hier)
2046 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2047 else
2048 #endif // KMP_USE_HIER_SCHED
2049 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2050 p_st, th->th.th_team_nproc,
2051 th->th.th_info.ds.ds_tid);
2052 // status == 0: no more iterations to execute
2053 if (status == 0) {
2054 UT num_done;
2056 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2057 #ifdef KMP_DEBUG
2058 {
2059 char *buff;
2060 // create format specifiers before the debug output
2061 buff = __kmp_str_format(
2062 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2063 traits_t<UT>::spec);
2064 KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2065 __kmp_str_free(&buff);
2066 }
2067 #endif
2069 #if KMP_USE_HIER_SCHED
2070 pr->flags.use_hier = FALSE;
2071 #endif
2072 if ((ST)num_done == th->th.th_team_nproc - 1) {
2073 #if (KMP_STATIC_STEAL_ENABLED)
2074 if (pr->schedule == kmp_sch_static_steal &&
2075 traits_t<T>::type_size > 4) {
2076 int i;
2077 int idx = (th->th.th_dispatch->th_disp_index - 1) %
2078 __kmp_dispatch_num_buffers; // current loop index
2079 kmp_info_t **other_threads = team->t.t_threads;
2080 // loop complete, safe to destroy locks used for stealing
2081 for (i = 0; i < th->th.th_team_nproc; ++i) {
2082 dispatch_private_info_template<T> *buf =
2083 reinterpret_cast<dispatch_private_info_template<T> *>(
2084 &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
2085 kmp_lock_t *lck = buf->u.p.th_steal_lock;
2086 KMP_ASSERT(lck != NULL);
2087 __kmp_destroy_lock(lck);
2088 __kmp_free(lck);
2089 buf->u.p.th_steal_lock = NULL;
2090 }
2091 }
2092 #endif
2093 /* NOTE: release this buffer to be reused */
2095 KMP_MB(); /* Flush all pending memory write invalidates. */
2097 sh->u.s.num_done = 0;
2098 sh->u.s.iteration = 0;
2100 /* TODO replace with general release procedure? */
2101 if (pr->flags.ordered) {
2102 sh->u.s.ordered_iteration = 0;
2103 }
2105 KMP_MB(); /* Flush all pending memory write invalidates. */
2107 sh->buffer_index += __kmp_dispatch_num_buffers;
2108 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2109 gtid, sh->buffer_index));
2111 KMP_MB(); /* Flush all pending memory write invalidates. */
2113 } // if
2114 if (__kmp_env_consistency_check) {
2115 if (pr->pushed_ws != ct_none) {
2116 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2117 }
2118 }
2120 th->th.th_dispatch->th_deo_fcn = NULL;
2121 th->th.th_dispatch->th_dxo_fcn = NULL;
2122 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2123 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2124 } // if (status == 0)
2125 #if KMP_OS_WINDOWS
2126 else if (last) {
2127 pr->u.p.last_upper = pr->u.p.ub;
2128 }
2129 #endif /* KMP_OS_WINDOWS */
2130 if (p_last != NULL && status != 0)
2131 *p_last = last;
2132 } // if
2134 #ifdef KMP_DEBUG
2135 {
2136 char *buff;
2137 // create format specifiers before the debug output
2138 buff = __kmp_str_format(
2139 "__kmp_dispatch_next: T#%%d normal case: "
2140 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2141 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2142 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2143 (p_last ? *p_last : 0), status));
2144 __kmp_str_free(&buff);
2145 }
2146 #endif
2147 #if INCLUDE_SSC_MARKS
2148 SSC_MARK_DISPATCH_NEXT();
2149 #endif
2150 OMPT_LOOP_END;
2151 KMP_STATS_LOOP_END;
2152 return status;
2153 }
2155 template <typename T>
2156 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2157 kmp_int32 *plastiter, T *plower, T *pupper,
2158 typename traits_t<T>::signed_t incr) {
2159 typedef typename traits_t<T>::unsigned_t UT;
2160 kmp_uint32 team_id;
2161 kmp_uint32 nteams;
2162 UT trip_count;
2163 kmp_team_t *team;
2164 kmp_info_t *th;
2166 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2167 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2168 #ifdef KMP_DEBUG
2169 typedef typename traits_t<T>::signed_t ST;
2170 {
2171 char *buff;
2172 // create format specifiers before the debug output
2173 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2174 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2175 traits_t<T>::spec, traits_t<T>::spec,
2176 traits_t<ST>::spec, traits_t<T>::spec);
2177 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2178 __kmp_str_free(&buff);
2179 }
2180 #endif
2182 if (__kmp_env_consistency_check) {
2183 if (incr == 0) {
2184 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2185 loc);
2186 }
2187 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2188 // The loop is illegal.
2189 // Some zero-trip loops maintained by compiler, e.g.:
2190 // for(i=10;i<0;++i) // lower >= upper - run-time check
2191 // for(i=0;i>10;--i) // lower <= upper - run-time check
2192 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2193 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2194 // Compiler does not check the following illegal loops:
2195 // for(i=0;i<10;i+=incr) // where incr<0
2196 // for(i=10;i>0;i-=incr) // where incr<0
2197 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2198 }
2199 }
2200 __kmp_assert_valid_gtid(gtid);
2201 th = __kmp_threads[gtid];
2202 team = th->th.th_team;
2203 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2204 nteams = th->th.th_teams_size.nteams;
2205 team_id = team->t.t_master_tid;
2206 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2208 // compute global trip count
2209 if (incr == 1) {
2210 trip_count = *pupper - *plower + 1;
2211 } else if (incr == -1) {
2212 trip_count = *plower - *pupper + 1;
2213 } else if (incr > 0) {
2214 // upper-lower can exceed the limit of signed type
2215 trip_count = (UT)(*pupper - *plower) / incr + 1;
2216 } else {
2217 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2218 }
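// For example (hypothetical bounds): *plower = 0, *pupper = 9, incr = 2
// gives trip_count = (9 - 0) / 2 + 1 = 5.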
2220 if (trip_count <= nteams) {
2221 KMP_DEBUG_ASSERT(
2222 __kmp_static == kmp_sch_static_greedy ||
2223 __kmp_static ==
2224 kmp_sch_static_balanced); // Unknown static scheduling type.
2225 // only some teams get single iteration, others get nothing
2226 if (team_id < trip_count) {
2227 *pupper = *plower = *plower + team_id * incr;
2228 } else {
2229 *plower = *pupper + incr; // zero-trip loop
2230 }
2231 if (plastiter != NULL)
2232 *plastiter = (team_id == trip_count - 1);
2233 } else {
2234 if (__kmp_static == kmp_sch_static_balanced) {
2235 UT chunk = trip_count / nteams;
2236 UT extras = trip_count % nteams;
2237 *plower +=
2238 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2239 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
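// E.g. (hypothetical values) trip_count = 10, nteams = 3, incr = 1:
// chunk = 3, extras = 1, so team 0 gets 4 iterations and teams 1 and 2 get
// 3 each (4 + 3 + 3 = 10).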
2240 if (plastiter != NULL)
2241 *plastiter = (team_id == nteams - 1);
2242 } else {
2243 T chunk_inc_count =
2244 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2245 T upper = *pupper;
2246 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2247 // Unknown static scheduling type.
2248 *plower += team_id * chunk_inc_count;
2249 *pupper = *plower + chunk_inc_count - incr;
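// E.g. (hypothetical values) trip_count = 10, nteams = 3, incr = 1:
// chunk_inc_count = 4, so teams 0..2 initially get [0,3], [4,7] and [8,11];
// the bounds check below then clamps team 2 to [8,9].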
2250 // Check/correct bounds if needed
2251 if (incr > 0) {
2252 if (*pupper < *plower)
2253 *pupper = traits_t<T>::max_value;
2254 if (plastiter != NULL)
2255 *plastiter = *plower <= upper && *pupper > upper - incr;
2256 if (*pupper > upper)
2257 *pupper = upper; // tracker C73258
2258 } else {
2259 if (*pupper > *plower)
2260 *pupper = traits_t<T>::min_value;
2261 if (plastiter != NULL)
2262 *plastiter = *plower >= upper && *pupper < upper - incr;
2263 if (*pupper < upper)
2264 *pupper = upper; // tracker C73258
2265 }
2266 }
2267 }
2268 }
2270 //-----------------------------------------------------------------------------
2271 // Dispatch routines
2272 // Transfer call to template< type T >
2273 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2274 // T lb, T ub, ST st, ST chunk )
2275 extern "C" {
2277 /*!
2278 @ingroup WORK_SHARING
2279 @{
2280 @param loc Source location
2281 @param gtid Global thread id
2282 @param schedule Schedule type
2283 @param lb Lower bound
2284 @param ub Upper bound
2285 @param st Step (or increment if you prefer)
2286 @param chunk The chunk size to block with
2288 This function prepares the runtime to start a dynamically scheduled for loop,
2289 saving the loop arguments.
2290 These functions are all identical apart from the types of the arguments.
2291 */
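// Illustrative compiler-generated usage (a sketch only; n and body are
// placeholders, not part of this file). For
//   #pragma omp for schedule(dynamic, 4)
// over iterations 0..n-1, the outlined code does roughly:
//   kmp_int32 lb, ub, st, last;
//   __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4);
//   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
//     for (kmp_int32 i = lb; i <= ub; i += st)
//       body(i);
//   }
// (__kmpc_dispatch_fini_4, below, is additionally emitted for ordered loops.)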
2293 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2294 enum sched_type schedule, kmp_int32 lb,
2295 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2296 KMP_DEBUG_ASSERT(__kmp_init_serial);
2297 #if OMPT_SUPPORT && OMPT_OPTIONAL
2298 OMPT_STORE_RETURN_ADDRESS(gtid);
2299 #endif
2300 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2301 }
2302 /*!
2303 See @ref __kmpc_dispatch_init_4
2304 */
2305 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2306 enum sched_type schedule, kmp_uint32 lb,
2307 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2308 KMP_DEBUG_ASSERT(__kmp_init_serial);
2309 #if OMPT_SUPPORT && OMPT_OPTIONAL
2310 OMPT_STORE_RETURN_ADDRESS(gtid);
2311 #endif
2312 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2313 }
2315 /*!
2316 See @ref __kmpc_dispatch_init_4
2317 */
2318 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2319 enum sched_type schedule, kmp_int64 lb,
2320 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2321 KMP_DEBUG_ASSERT(__kmp_init_serial);
2322 #if OMPT_SUPPORT && OMPT_OPTIONAL
2323 OMPT_STORE_RETURN_ADDRESS(gtid);
2324 #endif
2325 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2326 }
2328 /*!
2329 See @ref __kmpc_dispatch_init_4
2330 */
2331 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2332 enum sched_type schedule, kmp_uint64 lb,
2333 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2334 KMP_DEBUG_ASSERT(__kmp_init_serial);
2335 #if OMPT_SUPPORT && OMPT_OPTIONAL
2336 OMPT_STORE_RETURN_ADDRESS(gtid);
2337 #endif
2338 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2339 }
2341 /*!
2342 See @ref __kmpc_dispatch_init_4
2344 The difference from the __kmpc_dispatch_init set of functions is that these
2345 functions are called for the composite distribute parallel for construct.
2346 Thus, before dispatching the regular iterations, we need to compute the per-team iteration space.
2348 These functions are all identical apart from the types of the arguments.
2349 */
2350 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2351 enum sched_type schedule, kmp_int32 *p_last,
2352 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2353 kmp_int32 chunk) {
2354 KMP_DEBUG_ASSERT(__kmp_init_serial);
2355 #if OMPT_SUPPORT && OMPT_OPTIONAL
2356 OMPT_STORE_RETURN_ADDRESS(gtid);
2357 #endif
2358 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2359 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2360 }
2362 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2363 enum sched_type schedule, kmp_int32 *p_last,
2364 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2365 kmp_int32 chunk) {
2366 KMP_DEBUG_ASSERT(__kmp_init_serial);
2367 #if OMPT_SUPPORT && OMPT_OPTIONAL
2368 OMPT_STORE_RETURN_ADDRESS(gtid);
2369 #endif
2370 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2371 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2372 }
2374 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2375 enum sched_type schedule, kmp_int32 *p_last,
2376 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2377 kmp_int64 chunk) {
2378 KMP_DEBUG_ASSERT(__kmp_init_serial);
2379 #if OMPT_SUPPORT && OMPT_OPTIONAL
2380 OMPT_STORE_RETURN_ADDRESS(gtid);
2381 #endif
2382 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2383 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2384 }
2386 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2387 enum sched_type schedule, kmp_int32 *p_last,
2388 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2389 kmp_int64 chunk) {
2390 KMP_DEBUG_ASSERT(__kmp_init_serial);
2391 #if OMPT_SUPPORT && OMPT_OPTIONAL
2392 OMPT_STORE_RETURN_ADDRESS(gtid);
2393 #endif
2394 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2395 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2396 }
2398 /*!
2399 @param loc Source code location
2400 @param gtid Global thread id
2401 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2402 otherwise
2403 @param p_lb Pointer to the lower bound for the next chunk of work
2404 @param p_ub Pointer to the upper bound for the next chunk of work
2405 @param p_st Pointer to the stride for the next chunk of work
2406 @return one if there is work to be done, zero otherwise
2408 Get the next dynamically allocated chunk of work for this thread.
2409 If there is no more work, then lb, ub and stride need not be modified.
2410 */
2411 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2412 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2413 #if OMPT_SUPPORT && OMPT_OPTIONAL
2414 OMPT_STORE_RETURN_ADDRESS(gtid);
2415 #endif
2416 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2417 #if OMPT_SUPPORT && OMPT_OPTIONAL
2418 ,
2419 OMPT_LOAD_RETURN_ADDRESS(gtid)
2420 #endif
2421 );
2422 }
2424 /*!
2425 See @ref __kmpc_dispatch_next_4
2426 */
2427 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2428 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2429 kmp_int32 *p_st) {
2430 #if OMPT_SUPPORT && OMPT_OPTIONAL
2431 OMPT_STORE_RETURN_ADDRESS(gtid);
2432 #endif
2433 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2434 #if OMPT_SUPPORT && OMPT_OPTIONAL
2435 ,
2436 OMPT_LOAD_RETURN_ADDRESS(gtid)
2437 #endif
2438 );
2439 }
2441 /*!
2442 See @ref __kmpc_dispatch_next_4
2443 */
2444 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2445 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2446 #if OMPT_SUPPORT && OMPT_OPTIONAL
2447 OMPT_STORE_RETURN_ADDRESS(gtid);
2448 #endif
2449 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2450 #if OMPT_SUPPORT && OMPT_OPTIONAL
2451 ,
2452 OMPT_LOAD_RETURN_ADDRESS(gtid)
2453 #endif
2454 );
2455 }
2457 /*!
2458 See @ref __kmpc_dispatch_next_4
2459 */
2460 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2461 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2462 kmp_int64 *p_st) {
2463 #if OMPT_SUPPORT && OMPT_OPTIONAL
2464 OMPT_STORE_RETURN_ADDRESS(gtid);
2465 #endif
2466 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2467 #if OMPT_SUPPORT && OMPT_OPTIONAL
2468 ,
2469 OMPT_LOAD_RETURN_ADDRESS(gtid)
2470 #endif
2471 );
2472 }
2474 /*!
2475 @param loc Source code location
2476 @param gtid Global thread id
2478 Mark the end of a dynamic loop.
2479 */
2480 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2481 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2482 }
2484 /*!
2485 See @ref __kmpc_dispatch_fini_4
2486 */
2487 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2488 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2489 }
2491 /*!
2492 See @ref __kmpc_dispatch_fini_4
2493 */
2494 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2495 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2496 }
2498 /*!
2499 See @ref __kmpc_dispatch_fini_4
2500 */
2501 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2502 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2503 }
2504 /*! @} */
2506 //-----------------------------------------------------------------------------
2507 // Non-template routines from kmp_dispatch.cpp used in other sources
2509 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2510 return value == checker;
2511 }
2513 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2514 return value != checker;
2515 }
2517 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2518 return value < checker;
2519 }
2521 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2522 return value >= checker;
2523 }
2525 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2526 return value <= checker;
2527 }
2529 kmp_uint32
2530 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2531 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2532 void *obj // Higher-level synchronization object, or NULL.
2533 ) {
2534 // note: we may not belong to a team at this point
2535 volatile kmp_uint32 *spin = spinner;
2536 kmp_uint32 check = checker;
2537 kmp_uint32 spins;
2538 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2539 kmp_uint32 r;
2541 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2542 KMP_INIT_YIELD(spins);
2543 // main wait spin loop
2544 while (!f(r = TCR_4(*spin), check)) {
2545 KMP_FSYNC_SPIN_PREPARE(obj);
2546 /* GEH - remove this since it was accidentally introduced when kmp_wait was
2547 split. It causes problems with infinite recursion because of exit lock */
2548 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2549 __kmp_abort_thread(); */
2550 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2551 }
2552 KMP_FSYNC_SPIN_ACQUIRED(obj);
2553 return r;
2554 }
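// Illustrative call (hypothetical flag, not a call site in this file):
// spin, yielding when oversubscribed, until done becomes 1:
//   kmp_uint32 done = 0;
//   ...
//   __kmp_wait_4(&done, 1, __kmp_eq_4, NULL);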
2556 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2557 kmp_uint32 (*pred)(void *, kmp_uint32),
2558 void *obj // Higher-level synchronization object, or NULL.
2559 ) {
2560 // note: we may not belong to a team at this point
2561 void *spin = spinner;
2562 kmp_uint32 check = checker;
2563 kmp_uint32 spins;
2564 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2566 KMP_FSYNC_SPIN_INIT(obj, spin);
2567 KMP_INIT_YIELD(spins);
2568 // main wait spin loop
2569 while (!f(spin, check)) {
2570 KMP_FSYNC_SPIN_PREPARE(obj);
2571 /* if we have waited a bit, or are oversubscribed, yield */
2572 /* pause is in the following code */
2573 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2574 }
2575 KMP_FSYNC_SPIN_ACQUIRED(obj);
2576 }
2578 } // extern "C"
2580 #ifdef KMP_GOMP_COMPAT
2582 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2583 enum sched_type schedule, kmp_int32 lb,
2584 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2585 int push_ws) {
2586 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2587 push_ws);
2588 }
2590 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2591 enum sched_type schedule, kmp_uint32 lb,
2592 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2593 int push_ws) {
2594 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2595 push_ws);
2596 }
2598 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2599 enum sched_type schedule, kmp_int64 lb,
2600 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2601 int push_ws) {
2602 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2603 push_ws);
2604 }
2606 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2607 enum sched_type schedule, kmp_uint64 lb,
2608 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2609 int push_ws) {
2610 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2611 push_ws);
2612 }
2614 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2615 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2616 }
2618 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2619 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2620 }
2622 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2623 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2624 }
2626 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2627 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2628 }
2630 #endif /* KMP_GOMP_COMPAT */
2632 /* ------------------------------------------------------------------------ */