//===----- Workshare.cpp -  OpenMP workshare implementation ------ C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

struct DynamicScheduleTracker {
  int64_t Chunk;
  int64_t LoopUpperBound;
  int64_t NextLowerBound;
  int64_t Stride;
  kmp_sched_t ScheduleType;
  DynamicScheduleTracker *NextDST;
};

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

#pragma omp declare target

// TODO: This variable is a hack inherited from the old runtime.
static uint64_t SHARED(Cnt);

template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling with chunk

  // Generic implementation of OMP loop scheduling with static policy
  /*! \brief Calculate initial bounds for static loop and stride
   *  @param[in] loc location in code of the call (not used here)
   *  @param[in] global_tid global thread id
   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
   *  @param[in] plastiter pointer to last iteration
   *  @param[in,out] pointer to loop lower bound. It will contain the value of
   *  the lower bound of the first chunk
   *  @param[in,out] pointer to loop upper bound. It will contain the value of
   *  the upper bound of the first chunk
   *  @param[in,out] pointer to loop stride. It will contain the value of the
   *  stride between two successive chunks executed by the same thread
   *  @param[in] loop increment bump
   *  @param[in] chunk size
   */

  // helper function for static chunk
  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
                             T entityId, T numberOfEntities) {
    // Each thread executes multiple chunks, all of the same size, except
    // possibly the last one.
    // distance between two successive chunks
    stride = numberOfEntities * chunk;
    lb = lb + entityId * chunk;
    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a
    // lower bound plus a multiple of the increment equal to ub' is
    // the last one.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
  }
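
  // Worked example (illustrative): with lb = 0, ub = 31, chunk = 8 and
  // numberOfEntities = 4, stride becomes 32; entity 2 gets [16, 23], the last
  // chunk starts at 31 - (31 % 8) = 24, so only entity 3 (lb = 24) reports
  // last = 1.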

  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling without chunk

  // helper function for static no chunk
  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
                               T entityId, T numberOfEntities) {
    // No chunk size specified. Each thread or warp gets at most one
    // chunk; chunks are all almost of equal size
    T loopSize = ub - lb + 1;

    chunk = loopSize / numberOfEntities;
    T leftOver = loopSize - chunk * numberOfEntities;

    if (entityId < leftOver) {
      chunk++;
      lb = lb + entityId * chunk;
    } else {
      lb = lb + entityId * chunk + leftOver;
    }

    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per warp
  }
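
  // Worked example (illustrative): a 10-iteration loop over 4 entities gives
  // chunk = 2 and leftOver = 2, so entities 0 and 1 take 3 iterations each
  // and entities 2 and 3 take 2 each; entity 3 covers the original upper
  // bound and therefore reports last = 1.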

  ////////////////////////////////////////////////////////////////////////////////
  // Support for Static Init

  static void for_static_init(int32_t, int32_t schedtype,
                              int32_t *plastiter, T *plower, T *pupper,
                              ST *pstride, ST chunk, bool IsSPMDExecutionMode) {
    int32_t gtid = omp_get_thread_num();
    int numberOfActiveOMPThreads = omp_get_num_threads();

    // All warps that are in excess of the maximum requested, do
    // not execute the loop
    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
            "current thread is not needed here; error");

    // copy
    int lastiter = 0;
    T lb = *plower;
    T ub = *pupper;
    ST stride = *pstride;

    // init
    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
    case kmp_sched_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        break;
      }
    } // note: if chunk <=0, use nochunk
    case kmp_sched_static_balanced_chunk: {
      if (chunk > 0) {
        // round up to make sure the chunk is enough to cover all iterations
        T tripCount = ub - lb + 1; // +1 because ub is inclusive
        T span = (tripCount + numberOfActiveOMPThreads - 1) /
                 numberOfActiveOMPThreads;
        // perform chunk adjustment
        chunk = (span + chunk - 1) & ~(chunk - 1);
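        // The expression above rounds span up to the next multiple of chunk,
        // e.g. span = 13 with chunk = 4 gives (13 + 3) & ~3 = 16; the bit
        // trick relies on chunk being a power of two.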

        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
        T oldUb = ub;
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        if (ub > oldUb)
          ub = oldUb;
        break;
      }
    } // note: if chunk <=0, use nochunk
    case kmp_sched_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
      break;
    }
    case kmp_sched_distr_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
        break;
      } // note: if chunk <=0, use nochunk
    }
    case kmp_sched_distr_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
      break;
    }
    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
      ForStaticChunk(lastiter, lb, ub, stride, chunk,
                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
                     omp_get_num_teams() * numberOfActiveOMPThreads);
      break;
    }
    default: {
      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                     numberOfActiveOMPThreads);
      break;
    }
    }
    // copy back
    *plastiter = lastiter;
    *plower = lb;
    *pupper = ub;
    *pstride = stride;
  }
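
  // On return from for_static_init, *plower/*pupper hold the calling
  // thread's (or team's) first chunk and *pstride the distance to its next
  // chunk, as described in the Doxygen comment above; the generated code
  // keeps advancing by *pstride until the bounds pass the loop's upper bound.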

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch Init

  static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  static void dispatch_init(IdentTy *loc, int32_t threadId,
                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
                            DynamicScheduleTracker *DST) {
    int tid = mapping::getThreadIdInBlock();
    T tnum = omp_get_num_threads();
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway).
     * When it is, we'll want to look at them somewhere here and use that
     * information to add to our schedule choice. We shouldn't need to pass
     * them on; they merely affect which schedule we can legally choose for
     * various dynamic cases (in particular, whether or not a stealing scheme
     * is allowed).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime schedule
      omp_sched_t rtSched;
      int ChunkInt;
      omp_get_schedule(&rtSched, &ChunkInt);
      chunk = ChunkInt;
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
    } else if (schedule == kmp_sched_auto) {
      schedule = kmp_sched_static_chunk;
      chunk = 1;
    } else {
      // ASSERT(LT_FUSSY,
      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
      //        (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // round up to make sure the chunk is enough to cover all iterations
      T span = (tripCount + tnum - 1) / tnum;
      // perform chunk adjustment
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      DST->ScheduleType = schedule;
      if (chunk < 1)
        chunk = 1;
      DST->Chunk = chunk;
      DST->LoopUpperBound = ub;
      DST->NextLowerBound = lb;
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        Cnt = 0;
        fence::team(__ATOMIC_SEQ_CST);
      }
      __kmpc_barrier(loc, threadId);
    }
  }
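
  // After dispatch_init the tracker describes the whole loop: NextLowerBound
  // is the first unclaimed iteration, Chunk/Stride say how the bounds
  // advance, and for dynamic/guided schedules the shared counter Cnt has been
  // reset so that NextIter() can hand out chunk indices.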

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

  static uint64_t NextIter() {
    __kmpc_impl_lanemask_t active = mapping::activemask();
    uint32_t leader = utils::ffs(active) - 1;
    uint32_t change = utils::popc(active);
    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
    unsigned int rank = utils::popc(active & lane_mask_lt);
    uint64_t warp_res;
    if (rank == 0) {
      warp_res = atomic::add(&Cnt, change, __ATOMIC_SEQ_CST);
    }
    warp_res = utils::shuffle(active, warp_res, leader);
    return warp_res + rank;
  }
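
  // NextIter is a warp-aggregated atomic: the lowest active lane adds the
  // number of active lanes to Cnt once, shuffles the old value to the rest of
  // the warp, and every lane adds its rank among the active lanes. E.g. with
  // three active lanes and Cnt = 10, the lanes get 10, 11 and 12 while Cnt
  // becomes 13.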

  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
                              T loopUpperBound) {
    T N = NextIter();
    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1; // Clang uses i <= ub

    // 3 result cases:
    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
    //     LAST_CHUNK
    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
    // a.
    if (lb <= loopUpperBound && ub < loopUpperBound) {
      return NOT_FINISHED;
    }
    // b.
    if (lb <= loopUpperBound) {
      ub = loopUpperBound;
      return LAST_CHUNK;
    }
    // c. if we are here, we are in case 'c'
    lb = loopUpperBound + 2;
    ub = loopUpperBound + 1;
    return FINISHED;
  }
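
  // In case 'c' the returned bounds form an empty range (lb > ub), so a
  // caller iterating with i <= ub executes no iterations for that chunk.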

  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
                           T *plower, T *pupper, ST *pstride,
                           DynamicScheduleTracker *DST) {
    // ID of a thread in its own warp

    // automatically selects thread or warp ID based on selected implementation
    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
            "current thread is not needed here; error");

    kmp_sched_t schedule = DST->ScheduleType;

    // Static schedules: hand out this thread's next saved chunk.
    if (schedule == kmp_sched_static_chunk ||
        schedule == kmp_sched_static_nochunk) {
      T myLb = DST->NextLowerBound;
      T ub = DST->LoopUpperBound;
      // finished?
      if (myLb > ub) {
        return DISPATCH_FINISHED;
      }
      // not finished, save current bounds
      ST chunk = DST->Chunk;
      *plower = myLb;
      T myUb = myLb + chunk - 1; // Clang uses i <= ub
      if (myUb > ub)
        myUb = ub;
      *pupper = myUb;
      *plast = (int32_t)(myUb == ub);

      // increment next lower bound by the stride
      ST stride = DST->Stride;
      DST->NextLowerBound = myLb + stride;
      return DISPATCH_NOTFINISHED;
    }

    ASSERT0(LT_FUSSY,
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
                                    DST->LoopUpperBound);

    if (finished == FINISHED)
      return DISPATCH_FINISHED;

    // not finished (either not finished or last chunk)
    *plast = (int32_t)(finished == LAST_CHUNK);
    *plower = myLb;
    *pupper = myUb;
    *pstride = 1;

    return DISPATCH_NOTFINISHED;
  }

  static void dispatch_fini() {
    // nothing
  }

  ////////////////////////////////////////////////////////////////////////////////
  // end of template class that encapsulates all the helper functions
  ////////////////////////////////////////////////////////////////////////////////
};

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////
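
// Typical (simplified) call sequence emitted by the compiler for a loop with
// '#pragma omp for schedule(dynamic, chunk)'; names are illustrative:
//
//   __kmpc_dispatch_init_4(loc, tid, sched, lb, ub, st, chunk);
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st))
//     for (int32_t i = lb; i <= ub; i += st)
//       body(i);
//   __kmpc_dispatch_fini_4(loc, tid);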

// TODO: This is a stopgap. We probably want to expand the dispatch API to take
// a DST pointer which can then be allocated properly without malloc.
DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr);

// Create a new DST, link the current one, and define the new as current.
static DynamicScheduleTracker *pushDST() {
  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
  *NewDST = DynamicScheduleTracker({0});
  NewDST->NextDST = ThreadDSTPtr;
  ThreadDSTPtr = NewDST;
  return ThreadDSTPtr;
}

// Return the current DST.
static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }

// Pop the current DST and restore the last one.
static void popDST() {
  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
  memory::freeGlobal(ThreadDSTPtr, "remove DST");
  ThreadDSTPtr = OldDST;
}
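
// Together these helpers keep a per-thread stack of trackers so that nested
// dispatch regions do not clobber each other: every __kmpc_dispatch_init_*
// pushes a fresh DST, __kmpc_dispatch_next_* peeks at the current one, and
// __kmpc_dispatch_fini_* pops it again.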

extern "C" {

void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint32_t lb, uint32_t ub, int32_t st,
                             int32_t chunk) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint64_t lb, uint64_t ub, int64_t st,
                             int64_t chunk) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
  popDST();
}

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////
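
// For static schedules the runtime only computes the first chunk and the
// stride; the generated code then walks the remaining chunks itself, e.g.
// (illustrative sketch):
//
//   __kmpc_for_static_init_4(loc, tid, /*schedtype=*/kmp_sched_static_chunk,
//                            &last, &lb, &ub, &stride, /*incr=*/1, chunk);
//   for (int32_t i = lb; i <= ub; ++i)
//     body(i);                     // first chunk
//   // the next chunk for this thread starts at lb + stride
//   __kmpc_for_static_fini(loc, tid);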

void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int32_t *plower, int32_t *pupper,
                              int32_t *pstride, int32_t incr, int32_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint32_t *plower, uint32_t *pupper,
                               int32_t *pstride, int32_t incr, int32_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int64_t *plower, int64_t *pupper,
                              int64_t *pstride, int64_t incr, int64_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint64_t *plower, uint64_t *pupper,
                               int64_t *pstride, int64_t incr, int64_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {
  FunctionTracingRAII();
}

void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {
  FunctionTracingRAII();
}
}

#pragma omp end declare target