//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//     __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//    The reason we don't exec end_parallel for the threads not included
//    in the parallel loop is that for each barrier in the parallel
//    region, these non-included threads will cycle through the
//    syncthread A. Thus they must preserve their current threadId, which
//    is larger than the number of threads in the team.
//
//    To make a long story short...
//
//===----------------------------------------------------------------------===//
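
//
// Illustration only (a hedged sketch, not part of this file): the worker side
// of the pattern above is the compiler-/runtime-emitted state machine. Using
// the entry points defined below, a worker loop could look roughly like:
//
//   ParallelRegionFnTy WorkFn = nullptr;
//   do {
//     synchronize::threads(atomic::seq_cst);   // syncthreads A
//     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
//     if (!WorkFn)                              // termination signal
//       return;
//     if (IsActive) {
//       ((void (*)(uint32_t, void *))WorkFn)(0, nullptr);
//       __kmpc_kernel_end_parallel();
//     }
//     synchronize::threads(atomic::seq_cst);   // signal the master back
//   } while (true);
//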
#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

#pragma omp begin declare target device_type(nohost)

namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getMaxTeamThreads();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

  // SPMD mode allows any number of threads, for generic mode we round down to
  // a multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (mapping::isSPMDMode())
    return NumThreads;

  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}
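
// Worked example for the generic-mode rounding above, assuming a warp size of
// 32: a num_threads(47) request becomes 47 & ~31 == 32, while num_threads(20)
// (less than one warp) becomes 1.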

// Invoke an outlined parallel function unwrapping arguments (up to 32).
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
                                              int32_t bound_tid, void *fn,
                                              void **args, int64_t nargs) {
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

} // namespace

extern "C" {

[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
                                                   int32_t num_threads,
                                                   void *fn, void **args,
                                                   const int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();
  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t PTeamSize =
      NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
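  // For illustration (values assumed): with a 1024-thread team limit, an
  // absent num_threads clause or num_threads(1024) yields PTeamSize == 0, the
  // "whole team" encoding, while num_threads(256) yields PTeamSize == 256.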
  // Avoid the race between the read of the `icv::Level` above and the write
  // below by synchronizing all threads here.
  synchronize::threadsAligned(atomic::seq_cst);
  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, TId == 0, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                               /*ForceTeamState=*/true);

    // Synchronize all threads after the main thread (TId == 0) set up the
    // team state properly.
    synchronize::threadsAligned(atomic::acq_rel);

    state::ParallelTeamSize.assert_eq(PTeamSize, ident,
                                      /*ForceTeamState=*/true);
    icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
    icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);

    // Ensure we synchronize before we run user code to avoid invalidating the
    // assumptions above.
    synchronize::threadsAligned(atomic::relaxed);

    if (!PTeamSize || TId < PTeamSize)
      invokeMicrotask(TId, 0, fn, args, nargs);

    // Synchronize all threads at the end of a parallel region.
    synchronize::threadsAligned(atomic::seq_cst);
  }

  // Synchronize all threads to make sure every thread exits the scope above;
  // otherwise the following assertions and the assumption in
  // __kmpc_target_deinit may not hold.
  synchronize::threadsAligned(atomic::acq_rel);

  state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
  icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
  icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);

  // Ensure we synchronize to create an aligned region around the assumptions.
  synchronize::threadsAligned(atomic::relaxed);
}

[[clang::always_inline]] void
__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                   int32_t num_threads, int proc_bind, void *fn,
                   void *wrapper_fn, void **args, int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();

  // Assert the parallelism level is zero if disabled by the user.
  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
         "nested parallelism while disabled");

  // Handle the serialized case first, same for SPMD/non-SPMD:
  // 1) if-clause(0)
  // 2) parallel in task or other thread state inducing construct
  // 3) nested parallel regions
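  // For example, case 1) corresponds to user code such as
  //   #pragma omp parallel if(0)
  // where OpenMP requires the region to execute with a team of one thread.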
  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
                   (config::mayUseNestedParallelism() && icv::Level))) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  // From this point forward we know that there is no thread state used.
  ASSERT(state::HasThreadState == false, nullptr);

  if (mapping::isSPMDMode()) {
    // This was moved to its own routine so it could be called directly
    // in certain situations to avoid resource consumption of unused
    // logic in parallel_51.
    __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);
    return;
  }

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially an
  // active-level set, but they do not have individual ThreadStates yet. If
  // they ever modify the ICVs beyond this point a ThreadState will be
  // allocated.
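  // Illustrative example (not from this file): a thread calling
  // omp_set_num_threads(4) inside the region would modify its nthreads-var
  // ICV and thereby trigger the allocation of a ThreadState for it.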

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
    }
  }

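  // Design note: the unrolled switch above trades code size for speed on the
  // common small argument counts; each case falls through a fixed sequence of
  // copies instead of running a data-dependent loop, while the default: path
  // keeps larger argument counts correct.
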
  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                               /*ForceTeamState=*/true);

    // Master signals work to activate workers.
    synchronize::threads(atomic::seq_cst);
    // Master waits for workers to signal.
    synchronize::threads(atomic::seq_cst);
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}

[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
  return ThreadIsActive;
}
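
// Usage note (hedged): state::getEffectivePTeamSize() is assumed to resolve
// the "0 means whole team" encoding of ParallelTeamSize, so a worker whose
// TId is beyond the requested team size returns false here, skips the work
// function, and only cycles through the state machine barriers.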

[[clang::noinline]] void __kmpc_kernel_end_parallel() {
  // In case we have modified an ICV for this thread before a ThreadState was
  // created, we drop it now to not contaminate the next parallel region.
  ASSERT(!mapping::isSPMDMode(), nullptr);
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode(), nullptr);
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }

int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }

void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}

} // extern "C"

#pragma omp end declare target