//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//

#include "Interface.h"
#include "Synchronization.h"

struct DynamicScheduleTracker {
  int64_t Chunk;
  int64_t LoopUpperBound;
  int64_t NextLowerBound;
  int64_t Stride;
  kmp_sched_t ScheduleType;
  DynamicScheduleTracker *NextDST;
};

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

#pragma omp begin declare target device_type(nohost)

// TODO: This variable is a hack inherited from the old runtime.
static uint64_t SHARED(Cnt);

template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling with chunk

  // Generic implementation of OMP loop scheduling with static policy
  /*! \brief Calculate initial bounds for static loop and stride
   *  @param[in] loc location in code of the call (not used here)
   *  @param[in] global_tid global thread id
   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
   *  @param[in] plastiter pointer to last iteration
   *  @param[in,out] pointer to loop lower bound; it will contain the value of
   *  the lower bound of the first chunk
   *  @param[in,out] pointer to loop upper bound; it will contain the value of
   *  the upper bound of the first chunk
   *  @param[in,out] pointer to loop stride; it will contain the value of the
   *  stride between two successive chunks executed by the same thread
   *  @param[in] loop increment bump
   *  @param[in] chunk size
   */

  // helper function for static chunk
  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
                             T entityId, T numberOfEntities) {
    // Each thread executes multiple chunks, all of the same size, except
    // possibly the last one.
    // Distance between two successive chunks executed by the same thread.
    stride = numberOfEntities * chunk;
    lb = lb + entityId * chunk;
    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a
    // lower bound plus a multiple of the increment equal to ub' is
    // the one that executes the last chunk.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
  }
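
  // Illustrative example (not from the original source): with lb=0, ub=9,
  // chunk=2 and numberOfEntities=2, the stride is 2*2=4. Entity 0 gets the
  // first chunk [0,1] and will later also cover [4,5] and [8,9]; entity 1
  // gets [2,3] and later [6,7]. The last chunk starts at 9-(9%2)=8, which is
  // a whole number of strides away from entity 0's lower bound, so only
  // entity 0 reports last=1.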

  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling without chunk

  // helper function for static no chunk
  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
                               T entityId, T numberOfEntities) {
    // No chunk size specified. Each thread or warp gets at most one
    // chunk; chunks are all of almost equal size.
    T loopSize = ub - lb + 1;

    chunk = loopSize / numberOfEntities;
    T leftOver = loopSize - chunk * numberOfEntities;

    if (entityId < leftOver) {
      chunk++;
      lb = lb + entityId * chunk;
    } else {
      lb = lb + entityId * chunk + leftOver;
    }

    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per warp
  }
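
  // Illustrative example (not from the original source): with lb=0, ub=9 and
  // numberOfEntities=3, loopSize=10, chunk=10/3=3 and leftOver=1. Entity 0
  // (id < leftOver) grows its chunk to 4 and covers [0,3]; entity 1 covers
  // [4,6]; entity 2 covers [7,9] and reports last=1 because the original
  // upper bound falls inside its range. The stride of loopSize guarantees a
  // single chunk per entity.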

  ////////////////////////////////////////////////////////////////////////////////
  // Support for Static Init

  static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter,
                              T *plower, T *pupper, ST *pstride, ST chunk,
                              bool IsSPMDExecutionMode) {
    int32_t gtid = omp_get_thread_num();
    int numberOfActiveOMPThreads = omp_get_num_threads();

    // All warps that are in excess of the maximum requested do
    // not execute the loop.
    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
            "current thread is not needed here; error");

    // copy
    int lastiter = 0;
    T lb = *plower;
    T ub = *pupper;
    ST stride = *pstride;

    // init
    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
    case kmp_sched_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        break;
      }
    } // note: if chunk <=0, use nochunk
    case kmp_sched_static_balanced_chunk: {
      if (chunk > 0) {
        // round up to make sure the chunk is enough to cover all iterations
        T tripCount = ub - lb + 1; // +1 because ub is inclusive
        T span = (tripCount + numberOfActiveOMPThreads - 1) /
                 numberOfActiveOMPThreads;
        // perform chunk adjustment
        chunk = (span + chunk - 1) & ~(chunk - 1);

        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
        T oldUb = ub;
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        if (ub > oldUb)
          ub = oldUb;
        break;
      }
    } // note: if chunk <=0, use nochunk
    case kmp_sched_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
      break;
    }
    case kmp_sched_distr_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
        break;
      }
    } // note: if chunk <=0, use nochunk
    case kmp_sched_distr_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
      break;
    }
    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
      ForStaticChunk(lastiter, lb, ub, stride, chunk,
                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
                     omp_get_num_teams() * numberOfActiveOMPThreads);
      break;
    }
    default: {
      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                     numberOfActiveOMPThreads);
      break;
    }
    }

    // copy back
    *plastiter = lastiter;
    *plower = lb;
    *pupper = ub;
    *pstride = stride;
  }
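
  // Illustrative note (not from the original source): the balanced-chunk
  // adjustment `chunk = (span + chunk - 1) & ~(chunk - 1)` rounds span up to
  // a multiple of chunk and relies on chunk being a power of two. For
  // example, with a trip count of 100, 8 threads and chunk=4, span is
  // (100+7)/8 = 13 and the adjusted chunk becomes (13+3) & ~3 = 16.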

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch Init

  static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  static void dispatch_init(IdentTy *loc, int32_t threadId,
                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
                            DynamicScheduleTracker *DST) {
    int tid = mapping::getThreadIdInBlock();
    T tnum = omp_get_num_threads();
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway). When it is, we'll
     * want to look at them somewhere here and use that information to add
     * to our schedule choice. We shouldn't need to pass them on; they
     * merely affect which schedule we can legally choose for various
     * dynamic cases (in particular, whether or not a stealing scheme is
     * legal).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    // Process schedule.
    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime
      omp_sched_t rtSched;
      int ChunkInt;
      omp_get_schedule(&rtSched, &ChunkInt);
      chunk = ChunkInt;
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
    } else if (schedule == kmp_sched_auto) {
      schedule = kmp_sched_static_chunk;
      chunk = 1;
    } else {
      // ASSERT(LT_FUSSY,
      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
      //        (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // round up to make sure the chunk is enough to cover all iterations
      T span = (tripCount + tnum - 1) / tnum;
      // perform chunk adjustment
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      DST->ScheduleType = schedule;
      if (chunk < 1)
        chunk = 1;
      DST->Chunk = chunk;
      DST->LoopUpperBound = ub;
      DST->NextLowerBound = lb;
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        Cnt = 0;
        fence::team(atomic::seq_cst);
      }
      __kmpc_barrier(loc, threadId);
    }
  }
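
  // Illustrative example (not from the original source): for
  // schedule(dynamic, 4) over iterations 0..99, every thread stores Chunk=4,
  // NextLowerBound=0 and LoopUpperBound=99 in its DST, and thread 0 resets
  // the shared counter Cnt between the two barriers; dispatch_next then hands
  // out chunks through Cnt.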

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

  // Warp-aggregated ticket counter: one atomic increment of Cnt per warp,
  // from which each active lane derives its own chunk index.
  static uint64_t NextIter() {
    __kmpc_impl_lanemask_t active = mapping::activemask();
    uint32_t leader = utils::ffs(active) - 1;
    uint32_t change = utils::popc(active);
    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
    unsigned int rank = utils::popc(active & lane_mask_lt);
    uint64_t warp_res = 0;
    if (rank == 0) {
      // The lowest active lane reserves `change` tickets for the whole warp.
      warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
    }
    warp_res = utils::shuffle(active, warp_res, leader);
    return warp_res + rank;
  }

  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
                              T loopUpperBound) {
    T N = NextIter();
    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1; // Clang uses i <= ub

    // 3 result cases:
    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
    //     LAST_CHUNK
    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
    // a.
    if (lb <= loopUpperBound && ub < loopUpperBound) {
      return NOT_FINISHED;
    }
    // b.
    if (lb <= loopUpperBound) {
      ub = loopUpperBound;
      return LAST_CHUNK;
    }
    // c. if we are here, we are in case 'c'
    lb = loopUpperBound + 2;
    ub = loopUpperBound + 1;
    return FINISHED;
  }
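
  // Illustrative example (not from the original source): with chunkSize=4
  // over iterations 0..9, ticket N=2 yields lb=8/ub=11, which is clipped to
  // ub=9 and reported as LAST_CHUNK; ticket N=3 yields lb=12, past the upper
  // bound, and is reported as FINISHED with an empty (lb > ub) range.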

  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
                           T *plower, T *pupper, ST *pstride,
                           DynamicScheduleTracker *DST) {
    // ID of a thread in its own warp;
    // automatically selects thread or warp ID based on selected implementation
    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
            "current thread is not needed here; error");
    // retrieve schedule
    kmp_sched_t schedule = DST->ScheduleType;

    if (schedule == kmp_sched_static_chunk ||
        schedule == kmp_sched_static_nochunk) {
      T myLb = DST->NextLowerBound;
      T ub = DST->LoopUpperBound;
      // finished?
      if (myLb > ub) {
        return DISPATCH_FINISHED;
      }
      // not finished, save current bounds
      ST chunk = DST->Chunk;
      *plower = myLb;
      T myUb = myLb + chunk - 1; // Clang uses i <= ub
      if (myUb > ub)
        myUb = ub;
      *pupper = myUb;
      *plast = (int32_t)(myUb == ub);

      // increment next lower bound by the stride
      ST stride = DST->Stride;
      DST->NextLowerBound = myLb + stride;
      return DISPATCH_NOTFINISHED;
    }

    ASSERT0(LT_FUSSY,
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk,
                                    DST->NextLowerBound, DST->LoopUpperBound);

    if (finished == FINISHED)
      return DISPATCH_FINISHED;

    // not finished (either not finished or last chunk)
    *plast = (int32_t)(finished == LAST_CHUNK);
    *plower = myLb;
    *pupper = myUb;
    *pstride = 1;

    return DISPATCH_NOTFINISHED;
  }

  static void dispatch_fini() {
    // nothing
  }

  ////////////////////////////////////////////////////////////////////////////////
  // end of template class that encapsulates all the helper functions
  ////////////////////////////////////////////////////////////////////////////////
};

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////

// TODO: This is a stopgap. We probably want to expand the dispatch API to take
//       a DST pointer which can then be allocated properly without malloc.
static DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr);

// Create a new DST, link the current one, and define the new as current.
static DynamicScheduleTracker *pushDST() {
  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
  *NewDST = DynamicScheduleTracker({0});
  NewDST->NextDST = ThreadDSTPtr;
  ThreadDSTPtr = NewDST;
  return ThreadDSTPtr;
}

// Return the current DST.
static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }

// Pop the current DST and restore the last one.
static void popDST() {
  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
  memory::freeGlobal(ThreadDSTPtr, "remove DST");
  ThreadDSTPtr = OldDST;
}

extern "C" {

// init
void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint32_t lb, uint32_t ub, int32_t st,
                             int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint64_t lb, uint64_t ub, int64_t st,
                             int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
  popDST();
}
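
// Illustrative sketch (not from the original source) of how compiler-generated
// device code is expected to drive this interface for a loop such as
// `#pragma omp for schedule(dynamic, 4)` over i = 0..N-1:
//
//   __kmpc_dispatch_init_4(loc, tid, /*schedule=*/kmp_sched_dynamic,
//                          /*lb=*/0, /*ub=*/N - 1, /*st=*/1, /*chunk=*/4);
//   int32_t last, lb, ub, st;
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st) ==
//          DISPATCH_NOTFINISHED)
//     for (int32_t i = lb; i <= ub; ++i)
//       ; // loop body
//   __kmpc_dispatch_fini_4(loc, tid);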

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////

void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int32_t *plower, int32_t *pupper,
                              int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint32_t *plower, uint32_t *pupper,
                               int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int64_t *plower, int64_t *pupper,
                              int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint64_t *plower, uint64_t *pupper,
                               int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}
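
// Illustrative sketch (not from the original source) of the static
// counterpart, e.g. `#pragma omp for schedule(static, 2)` over i = 0..N-1:
//
//   int32_t last = 0, lb = 0, ub = N - 1, stride = 1;
//   __kmpc_for_static_init_4(loc, tid, kmp_sched_static_chunk, &last, &lb,
//                            &ub, &stride, /*incr=*/1, /*chunk=*/2);
//   // lb/ub now describe this thread's first chunk; subsequent chunks start
//   // at lb + stride, lb + 2*stride, ... up to the original upper bound.
//   __kmpc_for_static_fini(loc, tid);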

void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}

void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
}

#pragma omp end declare target