openmp/runtime/src/kmp_dispatch.h

   1 /*
   2  * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
   3  */
   4
   5 //===----------------------------------------------------------------------===//
   6 //
   7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   8 // See https://llvm.org/LICENSE.txt for license information.
   9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #ifndef KMP_DISPATCH_H
  14 #define KMP_DISPATCH_H
  15
  16 /* ------------------------------------------------------------------------ */
  17 /* ------------------------------------------------------------------------ */
  18
  19 #include "kmp.h"
  20 #include "kmp_error.h"
  21 #include "kmp_i18n.h"
  22 #include "kmp_itt.h"
  23 #include "kmp_stats.h"
  24 #include "kmp_str.h"
  25 #if KMP_OS_WINDOWS && KMP_ARCH_X86
  26 #include <float.h>
  27 #endif
  28
  29 #if OMPT_SUPPORT
  30 #include "ompt-internal.h"
  31 #include "ompt-specific.h"
  32 #endif
  33
  34 /* ------------------------------------------------------------------------ */
  35 /* ------------------------------------------------------------------------ */
  36 #if KMP_USE_HIER_SCHED
  37 // Forward declarations of some hierarchical scheduling data structures
  38 template <typename T> struct kmp_hier_t;
  39 template <typename T> struct kmp_hier_top_unit_t;
  40 #endif // KMP_USE_HIER_SCHED
  41
  42 template <typename T> struct dispatch_shared_info_template;
  43 template <typename T> struct dispatch_private_info_template;
  44
  45 template <typename T>
  46 extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
  47                                           dispatch_private_info_template<T> *pr,
  48                                           enum sched_type schedule, T lb, T ub,
  49                                           typename traits_t<T>::signed_t st,
  50 #if USE_ITT_BUILD
  51                                           kmp_uint64 *cur_chunk,
  52 #endif
  53                                           typename traits_t<T>::signed_t chunk,
  54                                           T nproc, T unit_id);
  55 template <typename T>
  56 extern int __kmp_dispatch_next_algorithm(
  57     int gtid, dispatch_private_info_template<T> *pr,
  58     dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
  59     T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);
  60
  61 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
  62 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
  63
  64 #if KMP_STATIC_STEAL_ENABLED
  65
  66 // replaces dispatch_private_info{32,64} structures and
  67 // dispatch_private_info{32,64}_t types
  68 template <typename T> struct dispatch_private_infoXX_template {
  69   typedef typename traits_t<T>::unsigned_t UT;
  70   typedef typename traits_t<T>::signed_t ST;
  71   UT count; // unsigned
  72   T ub;
  73   /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  74   T lb;
  75   ST st; // signed
  76   UT tc; // unsigned
  77   kmp_lock_t *steal_lock; // lock used for chunk stealing
  78   /* parm[1-4] are used in different ways by different scheduling algorithms */
  79
  80   // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
  81   //    a) parm3 is properly aligned and
  82   //    b) all parm1-4 are in the same cache line.
  83   // Because of parm1-4 are used together, performance seems to be better
  84   // if they are in the same line (not measured though).
  85
  86   struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
  87     T parm1;
  88     T parm2;
  89     T parm3;
  90     T parm4;
  91   };
  92
  93   UT ordered_lower; // unsigned
  94   UT ordered_upper; // unsigned
  95 #if KMP_OS_WINDOWS
  96   T last_upper;
  97 #endif /* KMP_OS_WINDOWS */
  98 };
  99
 100 #else /* KMP_STATIC_STEAL_ENABLED */
 101
 102 // replaces dispatch_private_info{32,64} structures and
 103 // dispatch_private_info{32,64}_t types
 104 template <typename T> struct dispatch_private_infoXX_template {
 105   typedef typename traits_t<T>::unsigned_t UT;
 106   typedef typename traits_t<T>::signed_t ST;
 107   T lb;
 108   T ub;
 109   ST st; // signed
 110   UT tc; // unsigned
 111
 112   T parm1;
 113   T parm2;
 114   T parm3;
 115   T parm4;
 116
 117   UT count; // unsigned
 118
 119   UT ordered_lower; // unsigned
 120   UT ordered_upper; // unsigned
 121 #if KMP_OS_WINDOWS
 122   T last_upper;
 123 #endif /* KMP_OS_WINDOWS */
 124 };
 125 #endif /* KMP_STATIC_STEAL_ENABLED */
 126
 127 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
 128   // duplicate alignment here, otherwise size of structure is not correct in our
 129   // compiler
 130   union KMP_ALIGN_CACHE private_info_tmpl {
 131     dispatch_private_infoXX_template<T> p;
 132     dispatch_private_info64_t p64;
 133   } u;
 134   enum sched_type schedule; /* scheduling algorithm */
 135   kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
 136   std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
 137   kmp_uint32 ordered_bumped;
 138   dispatch_private_info *next; /* stack of buffers for nest of serial regions */
 139   kmp_uint32 type_size;
 140 #if KMP_USE_HIER_SCHED
 141   kmp_int32 hier_id;
 142   kmp_hier_top_unit_t<T> *hier_parent;
 143   // member functions
 144   kmp_int32 get_hier_id() const { return hier_id; }
 145   kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
 146 #endif
 147   enum cons_type pushed_ws;
 148 };
 149
 150 // replaces dispatch_shared_info{32,64} structures and
 151 // dispatch_shared_info{32,64}_t types
 152 template <typename T> struct dispatch_shared_infoXX_template {
 153   typedef typename traits_t<T>::unsigned_t UT;
 154   typedef typename traits_t<T>::signed_t ST;
 155   /* chunk index under dynamic, number of idle threads under static-steal;
 156      iteration index otherwise */
 157   volatile UT iteration;
 158   volatile ST num_done;
 159   volatile UT ordered_iteration;
 160   // to retain the structure size making ordered_iteration scalar
 161   UT ordered_dummy[KMP_MAX_ORDERED - 3];
 162 };
 163
 164 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
 165 template <typename T> struct dispatch_shared_info_template {
 166   typedef typename traits_t<T>::unsigned_t UT;
 167   // we need union here to keep the structure size
 168   union shared_info_tmpl {
 169     dispatch_shared_infoXX_template<UT> s;
 170     dispatch_shared_info64_t s64;
 171   } u;
 172   volatile kmp_uint32 buffer_index;
 173   volatile kmp_int32 doacross_buf_idx; // teamwise index
 174   kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
 175   kmp_int32 doacross_num_done; // count finished threads
 176 #if KMP_USE_HIER_SCHED
 177   kmp_hier_t<T> *hier;
 178 #endif
 179 #if KMP_USE_HWLOC
 180   // When linking with libhwloc, the ORDERED EPCC test slowsdown on big
 181   // machines (> 48 cores). Performance analysis showed that a cache thrash
 182   // was occurring and this padding helps alleviate the problem.
 183   char padding[64];
 184 #endif
 185 };
 186
 187 /* ------------------------------------------------------------------------ */
 188 /* ------------------------------------------------------------------------ */
 189
 190 #undef USE_TEST_LOCKS
 191
 192 // test_then_add template (general template should NOT be used)
 193 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
 194
 195 template <>
 196 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
 197                                                  kmp_int32 d) {
 198   kmp_int32 r;
 199   r = KMP_TEST_THEN_ADD32(p, d);
 200   return r;
 201 }
 202
 203 template <>
 204 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
 205                                                  kmp_int64 d) {
 206   kmp_int64 r;
 207   r = KMP_TEST_THEN_ADD64(p, d);
 208   return r;
 209 }
 210
 211 // test_then_inc_acq template (general template should NOT be used)
 212 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
 213
 214 template <>
 215 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
 216   kmp_int32 r;
 217   r = KMP_TEST_THEN_INC_ACQ32(p);
 218   return r;
 219 }
 220
 221 template <>
 222 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
 223   kmp_int64 r;
 224   r = KMP_TEST_THEN_INC_ACQ64(p);
 225   return r;
 226 }
 227
 228 // test_then_inc template (general template should NOT be used)
 229 template <typename T> static __forceinline T test_then_inc(volatile T *p);
 230
 231 template <>
 232 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
 233   kmp_int32 r;
 234   r = KMP_TEST_THEN_INC32(p);
 235   return r;
 236 }
 237
 238 template <>
 239 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
 240   kmp_int64 r;
 241   r = KMP_TEST_THEN_INC64(p);
 242   return r;
 243 }
 244
 245 // compare_and_swap template (general template should NOT be used)
 246 template <typename T>
 247 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
 248
 249 template <>
 250 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
 251                                                     kmp_int32 c, kmp_int32 s) {
 252   return KMP_COMPARE_AND_STORE_REL32(p, c, s);
 253 }
 254
 255 template <>
 256 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
 257                                                     kmp_int64 c, kmp_int64 s) {
 258   return KMP_COMPARE_AND_STORE_REL64(p, c, s);
 259 }
 260
 261 template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
 262   return value >= checker;
 263 }
 264 template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
 265   return value == checker;
 266 }
 267
 268 /*
 269     Spin wait loop that pauses between checks.
 270     Waits until function returns non-zero when called with *spinner and check.
 271     Does NOT put threads to sleep.
 272     Arguments:
 273         UT is unsigned 4- or 8-byte type
 274         spinner - memory location to check value
 275         checker - value which spinner is >, <, ==, etc.
 276         pred - predicate function to perform binary comparison of some sort
 277 #if USE_ITT_BUILD
 278         obj -- is higher-level synchronization object to report to ittnotify. It
 279         is used to report locks consistently. For example, if lock is acquired
 280         immediately, its address is reported to ittnotify via
 281         KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired immediately
 282         and lock routine calls to KMP_WAIT(), the later should report the
 283         same address, not an address of low-level spinner.
 284 #endif // USE_ITT_BUILD
 285     TODO: make inline function (move to header file for icl)
 286 */
 287 template <typename UT>
 288 static UT __kmp_wait(volatile UT *spinner, UT checker,
 289                      kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
 290   // note: we may not belong to a team at this point
 291   volatile UT *spin = spinner;
 292   UT check = checker;
 293   kmp_uint32 spins;
 294   kmp_uint32 (*f)(UT, UT) = pred;
 295   kmp_uint64 time;
 296   UT r;
 297
 298   KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
 299   KMP_INIT_YIELD(spins);
 300   KMP_INIT_BACKOFF(time);
 301   // main wait spin loop
 302   while (!f(r = *spin, check)) {
 303     KMP_FSYNC_SPIN_PREPARE(obj);
 304     /* GEH - remove this since it was accidentally introduced when kmp_wait was
 305        split.
 306        It causes problems with infinite recursion because of exit lock */
 307     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
 308         __kmp_abort_thread(); */
 309     // If oversubscribed, or have waited a bit then yield.
 310     KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
 311   }
 312   KMP_FSYNC_SPIN_ACQUIRED(obj);
 313   return r;
 314 }
 315
 316 /* ------------------------------------------------------------------------ */
 317 /* ------------------------------------------------------------------------ */
 318
 319 template <typename UT>
 320 void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
 321   dispatch_private_info_template<UT> *pr;
 322
 323   int gtid = *gtid_ref;
 324   //    int  cid = *cid_ref;
 325   kmp_info_t *th = __kmp_threads[gtid];
 326   KMP_DEBUG_ASSERT(th->th.th_dispatch);
 327
 328   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
 329   if (__kmp_env_consistency_check) {
 330     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
 331         th->th.th_dispatch->th_dispatch_pr_current);
 332     if (pr->pushed_ws != ct_none) {
 333 #if KMP_USE_DYNAMIC_LOCK
 334       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
 335 #else
 336       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
 337 #endif
 338     }
 339   }
 340
 341   if (!th->th.th_team->t.t_serialized) {
 342     dispatch_shared_info_template<UT> *sh =
 343         reinterpret_cast<dispatch_shared_info_template<UT> *>(
 344             th->th.th_dispatch->th_dispatch_sh_current);
 345     UT lower;
 346
 347     if (!__kmp_env_consistency_check) {
 348       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
 349           th->th.th_dispatch->th_dispatch_pr_current);
 350     }
 351     lower = pr->u.p.ordered_lower;
 352
 353 #if !defined(KMP_GOMP_COMPAT)
 354     if (__kmp_env_consistency_check) {
 355       if (pr->ordered_bumped) {
 356         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
 357         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
 358                                ct_ordered_in_pdo, loc_ref,
 359                                &p->stack_data[p->w_top]);
 360       }
 361     }
 362 #endif /* !defined(KMP_GOMP_COMPAT) */
 363
 364     KMP_MB();
 365 #ifdef KMP_DEBUG
 366     {
 367       char *buff;
 368       // create format specifiers before the debug output
 369       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
 370                               "ordered_iter:%%%s lower:%%%s\n",
 371                               traits_t<UT>::spec, traits_t<UT>::spec);
 372       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
 373       __kmp_str_free(&buff);
 374     }
 375 #endif
 376     __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
 377                    __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
 378     KMP_MB(); /* is this necessary? */
 379 #ifdef KMP_DEBUG
 380     {
 381       char *buff;
 382       // create format specifiers before the debug output
 383       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
 384                               "ordered_iter:%%%s lower:%%%s\n",
 385                               traits_t<UT>::spec, traits_t<UT>::spec);
 386       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
 387       __kmp_str_free(&buff);
 388     }
 389 #endif
 390   }
 391   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
 392 }
 393
 394 template <typename UT>
 395 void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
 396   typedef typename traits_t<UT>::signed_t ST;
 397   dispatch_private_info_template<UT> *pr;
 398
 399   int gtid = *gtid_ref;
 400   //    int  cid = *cid_ref;
 401   kmp_info_t *th = __kmp_threads[gtid];
 402   KMP_DEBUG_ASSERT(th->th.th_dispatch);
 403
 404   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
 405   if (__kmp_env_consistency_check) {
 406     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
 407         th->th.th_dispatch->th_dispatch_pr_current);
 408     if (pr->pushed_ws != ct_none) {
 409       __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
 410     }
 411   }
 412
 413   if (!th->th.th_team->t.t_serialized) {
 414     dispatch_shared_info_template<UT> *sh =
 415         reinterpret_cast<dispatch_shared_info_template<UT> *>(
 416             th->th.th_dispatch->th_dispatch_sh_current);
 417
 418     if (!__kmp_env_consistency_check) {
 419       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
 420           th->th.th_dispatch->th_dispatch_pr_current);
 421     }
 422
 423     KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
 424 #if !defined(KMP_GOMP_COMPAT)
 425     if (__kmp_env_consistency_check) {
 426       if (pr->ordered_bumped != 0) {
 427         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
 428         /* How to test it? - OM */
 429         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
 430                                ct_ordered_in_pdo, loc_ref,
 431                                &p->stack_data[p->w_top]);
 432       }
 433     }
 434 #endif /* !defined(KMP_GOMP_COMPAT) */
 435
 436     KMP_MB(); /* Flush all pending memory write invalidates.  */
 437
 438     pr->ordered_bumped += 1;
 439
 440     KD_TRACE(1000,
 441              ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
 442               gtid, pr->ordered_bumped));
 443
 444     KMP_MB(); /* Flush all pending memory write invalidates.  */
 445
 446     /* TODO use general release procedure? */
 447     test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
 448
 449     KMP_MB(); /* Flush all pending memory write invalidates.  */
 450   }
 451   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
 452 }
 453
 454 /* Computes and returns x to the power of y, where y must a non-negative integer
 455  */
 456 template <typename UT>
 457 static __forceinline long double __kmp_pow(long double x, UT y) {
 458   long double s = 1.0L;
 459
 460   KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
 461   // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
 462   while (y) {
 463     if (y & 1)
 464       s *= x;
 465     x *= x;
 466     y >>= 1;
 467   }
 468   return s;
 469 }
 470
 471 /* Computes and returns the number of unassigned iterations after idx chunks
 472    have been assigned
 473    (the total number of unassigned iterations in chunks with index greater than
 474    or equal to idx).
 475    __forceinline seems to be broken so that if we __forceinline this function,
 476    the behavior is wrong
 477    (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
 478 */
 479 template <typename T>
 480 static __inline typename traits_t<T>::unsigned_t
 481 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
 482                                 typename traits_t<T>::unsigned_t idx) {
 483   /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
 484      least for ICL 8.1, long double arithmetic may not really have
 485      long double precision, even with /Qlong_double.  Currently, we
 486      workaround that in the caller code, by manipulating the FPCW for
 487      Windows* OS on IA-32 architecture.  The lack of precision is not
 488      expected to be a correctness issue, though.
 489   */
 490   typedef typename traits_t<T>::unsigned_t UT;
 491
 492   long double x = tc * __kmp_pow<UT>(base, idx);
 493   UT r = (UT)x;
 494   if (x == r)
 495     return r;
 496   return r + 1;
 497 }
 498
 499 // Parameters of the guided-iterative algorithm:
 500 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
 501 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
 502 // by default n = 2. For example with n = 3 the chunks distribution will be more
 503 // flat.
 504 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
 505 static const int guided_int_param = 2;
 506 static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
 507 #endif // KMP_DISPATCH_H