openmp/runtime/src/kmp_dispatch.h

   1 /*
   2  * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
   3  */
   4
   5 //===----------------------------------------------------------------------===//
   6 //
   7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   8 // See https://llvm.org/LICENSE.txt for license information.
   9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #ifndef KMP_DISPATCH_H
  14 #define KMP_DISPATCH_H
  15
  16 /* ------------------------------------------------------------------------ */
  17 /* ------------------------------------------------------------------------ */
  18
  19 #include "kmp.h"
  20 #include "kmp_error.h"
  21 #include "kmp_i18n.h"
  22 #include "kmp_itt.h"
  23 #include "kmp_stats.h"
  24 #include "kmp_str.h"
  25 #if KMP_OS_WINDOWS && KMP_ARCH_X86
  26 #include <float.h>
  27 #endif
  28
  29 #if OMPT_SUPPORT
  30 #include "ompt-internal.h"
  31 #include "ompt-specific.h"
  32 #endif
  33
  34 /* ------------------------------------------------------------------------ */
  35 /* ------------------------------------------------------------------------ */
  36 #if KMP_USE_HIER_SCHED
  37 // Forward declarations of some hierarchical scheduling data structures
  38 template <typename T> struct kmp_hier_t;
  39 template <typename T> struct kmp_hier_top_unit_t;
  40 #endif // KMP_USE_HIER_SCHED
  41
  42 template <typename T> struct dispatch_shared_info_template;
  43 template <typename T> struct dispatch_private_info_template;
  44
  45 template <typename T>
  46 extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
  47                                           dispatch_private_info_template<T> *pr,
  48                                           enum sched_type schedule, T lb, T ub,
  49                                           typename traits_t<T>::signed_t st,
  50 #if USE_ITT_BUILD
  51                                           kmp_uint64 *cur_chunk,
  52 #endif
  53                                           typename traits_t<T>::signed_t chunk,
  54                                           T nproc, T unit_id);
  55 template <typename T>
  56 extern int __kmp_dispatch_next_algorithm(
  57     int gtid, dispatch_private_info_template<T> *pr,
  58     dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
  59     T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);
  60
  61 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
  62 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
  63
  64 #if KMP_STATIC_STEAL_ENABLED
  65
  66 // replaces dispatch_private_info{32,64} structures and
  67 // dispatch_private_info{32,64}_t types
  68 template <typename T> struct dispatch_private_infoXX_template {
  69   typedef typename traits_t<T>::unsigned_t UT;
  70   typedef typename traits_t<T>::signed_t ST;
  71   UT count; // unsigned
  72   T ub;
  73   /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  74   T lb;
  75   ST st; // signed
  76   UT tc; // unsigned
  77   kmp_lock_t *steal_lock; // lock used for chunk stealing
  78
  79   UT ordered_lower; // unsigned
  80   UT ordered_upper; // unsigned
  81
  82   /* parm[1-4] are used in different ways by different scheduling algorithms */
  83
  84   // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on )
  85   //    a) parm3 is properly aligned and
  86   //    b) all parm1-4 are in the same cache line.
  87   // Because of parm1-4 are used together, performance seems to be better
  88   // if they are in the same line (not measured though).
  89   struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
  90     T parm1;
  91     T parm2;
  92     T parm3;
  93     T parm4;
  94   };
  95
  96 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
  97   UT pchunks; // total number of chunks for processes with p-core
  98   UT num_procs_with_pcore; // number of threads with p-core
  99   T first_thread_with_ecore;
 100 #endif
 101 #if KMP_OS_WINDOWS
 102   T last_upper;
 103 #endif /* KMP_OS_WINDOWS */
 104 };
 105
 106 #else /* KMP_STATIC_STEAL_ENABLED */
 107
 108 // replaces dispatch_private_info{32,64} structures and
 109 // dispatch_private_info{32,64}_t types
 110 template <typename T> struct dispatch_private_infoXX_template {
 111   typedef typename traits_t<T>::unsigned_t UT;
 112   typedef typename traits_t<T>::signed_t ST;
 113   T lb;
 114   T ub;
 115   ST st; // signed
 116   UT tc; // unsigned
 117
 118   T parm1;
 119   T parm2;
 120   T parm3;
 121   T parm4;
 122
 123   UT count; // unsigned
 124
 125   UT ordered_lower; // unsigned
 126   UT ordered_upper; // unsigned
 127 #if KMP_OS_WINDOWS
 128   T last_upper;
 129 #endif /* KMP_OS_WINDOWS */
 130 };
 131 #endif /* KMP_STATIC_STEAL_ENABLED */
 132
 133 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
 134   // duplicate alignment here, otherwise size of structure is not correct in our
 135   // compiler
 136   union KMP_ALIGN_CACHE private_info_tmpl {
 137     dispatch_private_infoXX_template<T> p;
 138     dispatch_private_info64_t p64;
 139   } u;
 140   enum sched_type schedule; /* scheduling algorithm */
 141   kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
 142   std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
 143   kmp_uint32 ordered_bumped;
 144   dispatch_private_info *next; /* stack of buffers for nest of serial regions */
 145   kmp_uint32 type_size;
 146 #if KMP_USE_HIER_SCHED
 147   kmp_int32 hier_id;
 148   kmp_hier_top_unit_t<T> *hier_parent;
 149   // member functions
 150   kmp_int32 get_hier_id() const { return hier_id; }
 151   kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
 152 #endif
 153   enum cons_type pushed_ws;
 154 };
 155
 156 // replaces dispatch_shared_info{32,64} structures and
 157 // dispatch_shared_info{32,64}_t types
 158 template <typename T> struct dispatch_shared_infoXX_template {
 159   typedef typename traits_t<T>::unsigned_t UT;
 160   typedef typename traits_t<T>::signed_t ST;
 161   /* chunk index under dynamic, number of idle threads under static-steal;
 162      iteration index otherwise */
 163   volatile UT iteration;
 164   volatile ST num_done;
 165   volatile UT ordered_iteration;
 166   // to retain the structure size making ordered_iteration scalar
 167   UT ordered_dummy[KMP_MAX_ORDERED - 3];
 168 };
 169
 170 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
 171 template <typename T> struct dispatch_shared_info_template {
 172   typedef typename traits_t<T>::unsigned_t UT;
 173   // we need union here to keep the structure size
 174   union shared_info_tmpl {
 175     dispatch_shared_infoXX_template<UT> s;
 176     dispatch_shared_info64_t s64;
 177   } u;
 178   volatile kmp_uint32 buffer_index;
 179   volatile kmp_int32 doacross_buf_idx; // teamwise index
 180   kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
 181   kmp_int32 doacross_num_done; // count finished threads
 182 #if KMP_USE_HIER_SCHED
 183   kmp_hier_t<T> *hier;
 184 #endif
 185 #if KMP_USE_HWLOC
 186   // When linking with libhwloc, the ORDERED EPCC test slowsdown on big
 187   // machines (> 48 cores). Performance analysis showed that a cache thrash
 188   // was occurring and this padding helps alleviate the problem.
 189   char padding[64];
 190 #endif
 191 };
 192
 193 /* ------------------------------------------------------------------------ */
 194 /* ------------------------------------------------------------------------ */
 195
 196 #undef USE_TEST_LOCKS
 197
 198 // test_then_add template (general template should NOT be used)
 199 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
 200
 201 template <>
 202 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
 203                                                  kmp_int32 d) {
 204   kmp_int32 r;
 205   r = KMP_TEST_THEN_ADD32(p, d);
 206   return r;
 207 }
 208
 209 template <>
 210 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
 211                                                  kmp_int64 d) {
 212   kmp_int64 r;
 213   r = KMP_TEST_THEN_ADD64(p, d);
 214   return r;
 215 }
 216
 217 // test_then_inc_acq template (general template should NOT be used)
 218 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
 219
 220 template <>
 221 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
 222   kmp_int32 r;
 223   r = KMP_TEST_THEN_INC_ACQ32(p);
 224   return r;
 225 }
 226
 227 template <>
 228 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
 229   kmp_int64 r;
 230   r = KMP_TEST_THEN_INC_ACQ64(p);
 231   return r;
 232 }
 233
 234 // test_then_inc template (general template should NOT be used)
 235 template <typename T> static __forceinline T test_then_inc(volatile T *p);
 236
 237 template <>
 238 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
 239   kmp_int32 r;
 240   r = KMP_TEST_THEN_INC32(p);
 241   return r;
 242 }
 243
 244 template <>
 245 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
 246   kmp_int64 r;
 247   r = KMP_TEST_THEN_INC64(p);
 248   return r;
 249 }
 250
 251 // compare_and_swap template (general template should NOT be used)
 252 template <typename T>
 253 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
 254
 255 template <>
 256 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
 257                                                     kmp_int32 c, kmp_int32 s) {
 258   return KMP_COMPARE_AND_STORE_REL32(p, c, s);
 259 }
 260
 261 template <>
 262 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
 263                                                     kmp_int64 c, kmp_int64 s) {
 264   return KMP_COMPARE_AND_STORE_REL64(p, c, s);
 265 }
 266
 267 template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
 268   return value >= checker;
 269 }
 270 template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
 271   return value == checker;
 272 }
 273
 274 /*
 275     Spin wait loop that pauses between checks.
 276     Waits until function returns non-zero when called with *spinner and check.
 277     Does NOT put threads to sleep.
 278     Arguments:
 279         UT is unsigned 4- or 8-byte type
 280         spinner - memory location to check value
 281         checker - value which spinner is >, <, ==, etc.
 282         pred - predicate function to perform binary comparison of some sort
 283 #if USE_ITT_BUILD
 284         obj -- is higher-level synchronization object to report to ittnotify. It
 285         is used to report locks consistently. For example, if lock is acquired
 286         immediately, its address is reported to ittnotify via
 287         KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired immediately
 288         and lock routine calls to KMP_WAIT(), the later should report the
 289         same address, not an address of low-level spinner.
 290 #endif // USE_ITT_BUILD
 291     TODO: make inline function (move to header file for icl)
 292 */
 293 template <typename UT>
 294 static UT __kmp_wait(volatile UT *spinner, UT checker,
 295                      kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
 296   // note: we may not belong to a team at this point
 297   volatile UT *spin = spinner;
 298   UT check = checker;
 299   kmp_uint32 spins;
 300   kmp_uint32 (*f)(UT, UT) = pred;
 301   kmp_uint64 time;
 302   UT r;
 303
 304   KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
 305   KMP_INIT_YIELD(spins);
 306   KMP_INIT_BACKOFF(time);
 307   // main wait spin loop
 308   while (!f(r = *spin, check)) {
 309     KMP_FSYNC_SPIN_PREPARE(obj);
 310     /* GEH - remove this since it was accidentally introduced when kmp_wait was
 311        split.
 312        It causes problems with infinite recursion because of exit lock */
 313     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
 314         __kmp_abort_thread(); */
 315     // If oversubscribed, or have waited a bit then yield.
 316     KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
 317   }
 318   KMP_FSYNC_SPIN_ACQUIRED(obj);
 319   return r;
 320 }
 321
 322 /* ------------------------------------------------------------------------ */
 323 /* ------------------------------------------------------------------------ */
 324
 325 template <typename UT>
 326 void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
 327   dispatch_private_info_template<UT> *pr;
 328
 329   int gtid = *gtid_ref;
 330   //    int  cid = *cid_ref;
 331   kmp_info_t *th = __kmp_threads[gtid];
 332   KMP_DEBUG_ASSERT(th->th.th_dispatch);
 333
 334   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
 335   if (__kmp_env_consistency_check) {
 336     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
 337         th->th.th_dispatch->th_dispatch_pr_current);
 338     if (pr->pushed_ws != ct_none) {
 339 #if KMP_USE_DYNAMIC_LOCK
 340       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
 341 #else
 342       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
 343 #endif
 344     }
 345   }
 346
 347   if (!th->th.th_team->t.t_serialized) {
 348     dispatch_shared_info_template<UT> *sh =
 349         reinterpret_cast<dispatch_shared_info_template<UT> *>(
 350             th->th.th_dispatch->th_dispatch_sh_current);
 351     UT lower;
 352
 353     if (!__kmp_env_consistency_check) {
 354       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
 355           th->th.th_dispatch->th_dispatch_pr_current);
 356     }
 357     lower = pr->u.p.ordered_lower;
 358
 359 #if !defined(KMP_GOMP_COMPAT)
 360     if (__kmp_env_consistency_check) {
 361       if (pr->ordered_bumped) {
 362         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
 363         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
 364                                ct_ordered_in_pdo, loc_ref,
 365                                &p->stack_data[p->w_top]);
 366       }
 367     }
 368 #endif /* !defined(KMP_GOMP_COMPAT) */
 369
 370     KMP_MB();
 371 #ifdef KMP_DEBUG
 372     {
 373       char *buff;
 374       // create format specifiers before the debug output
 375       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
 376                               "ordered_iter:%%%s lower:%%%s\n",
 377                               traits_t<UT>::spec, traits_t<UT>::spec);
 378       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
 379       __kmp_str_free(&buff);
 380     }
 381 #endif
 382     __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
 383                    __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
 384     KMP_MB(); /* is this necessary? */
 385 #ifdef KMP_DEBUG
 386     {
 387       char *buff;
 388       // create format specifiers before the debug output
 389       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
 390                               "ordered_iter:%%%s lower:%%%s\n",
 391                               traits_t<UT>::spec, traits_t<UT>::spec);
 392       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
 393       __kmp_str_free(&buff);
 394     }
 395 #endif
 396   }
 397   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
 398 }
 399
 400 template <typename UT>
 401 void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
 402   typedef typename traits_t<UT>::signed_t ST;
 403   dispatch_private_info_template<UT> *pr;
 404
 405   int gtid = *gtid_ref;
 406   //    int  cid = *cid_ref;
 407   kmp_info_t *th = __kmp_threads[gtid];
 408   KMP_DEBUG_ASSERT(th->th.th_dispatch);
 409
 410   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
 411   if (__kmp_env_consistency_check) {
 412     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
 413         th->th.th_dispatch->th_dispatch_pr_current);
 414     if (pr->pushed_ws != ct_none) {
 415       __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
 416     }
 417   }
 418
 419   if (!th->th.th_team->t.t_serialized) {
 420     dispatch_shared_info_template<UT> *sh =
 421         reinterpret_cast<dispatch_shared_info_template<UT> *>(
 422             th->th.th_dispatch->th_dispatch_sh_current);
 423
 424     if (!__kmp_env_consistency_check) {
 425       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
 426           th->th.th_dispatch->th_dispatch_pr_current);
 427     }
 428
 429     KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
 430 #if !defined(KMP_GOMP_COMPAT)
 431     if (__kmp_env_consistency_check) {
 432       if (pr->ordered_bumped != 0) {
 433         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
 434         /* How to test it? - OM */
 435         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
 436                                ct_ordered_in_pdo, loc_ref,
 437                                &p->stack_data[p->w_top]);
 438       }
 439     }
 440 #endif /* !defined(KMP_GOMP_COMPAT) */
 441
 442     KMP_MB(); /* Flush all pending memory write invalidates.  */
 443
 444     pr->ordered_bumped += 1;
 445
 446     KD_TRACE(1000,
 447              ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
 448               gtid, pr->ordered_bumped));
 449
 450     KMP_MB(); /* Flush all pending memory write invalidates.  */
 451
 452     /* TODO use general release procedure? */
 453     test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
 454
 455     KMP_MB(); /* Flush all pending memory write invalidates.  */
 456   }
 457   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
 458 }
 459
 460 /* Computes and returns x to the power of y, where y must a non-negative integer
 461  */
 462 template <typename UT>
 463 static __forceinline long double __kmp_pow(long double x, UT y) {
 464   long double s = 1.0L;
 465
 466   KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
 467   // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
 468   while (y) {
 469     if (y & 1)
 470       s *= x;
 471     x *= x;
 472     y >>= 1;
 473   }
 474   return s;
 475 }
 476
 477 /* Computes and returns the number of unassigned iterations after idx chunks
 478    have been assigned
 479    (the total number of unassigned iterations in chunks with index greater than
 480    or equal to idx).
 481    __forceinline seems to be broken so that if we __forceinline this function,
 482    the behavior is wrong
 483    (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
 484 */
 485 template <typename T>
 486 static __inline typename traits_t<T>::unsigned_t
 487 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
 488                                 typename traits_t<T>::unsigned_t idx) {
 489   /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
 490      least for ICL 8.1, long double arithmetic may not really have
 491      long double precision, even with /Qlong_double.  Currently, we
 492      workaround that in the caller code, by manipulating the FPCW for
 493      Windows* OS on IA-32 architecture.  The lack of precision is not
 494      expected to be a correctness issue, though.
 495   */
 496   typedef typename traits_t<T>::unsigned_t UT;
 497
 498   long double x = tc * __kmp_pow<UT>(base, idx);
 499   UT r = (UT)x;
 500   if (x == r)
 501     return r;
 502   return r + 1;
 503 }
 504
 505 // Parameters of the guided-iterative algorithm:
 506 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
 507 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
 508 // by default n = 2. For example with n = 3 the chunks distribution will be more
 509 // flat.
 510 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
 511 static const int guided_int_param = 2;
 512 static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
 513 #endif // KMP_DISPATCH_H