//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation in the GPU. Here is the pattern:
//
//    while (not finished) {
//
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//      __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//    }
//
// The reason we don't exec end_parallel for the threads not included
// in the parallel loop is that for each barrier in the parallel
// region, these non-included threads will cycle through the
// syncthread A. Thus they must preserve their current threadId, which
// is larger than the number of threads in the team.
//
// To make a long story short...
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace ompx;

#pragma omp begin declare target device_type(nohost)

namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getMaxTeamThreads();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

  // SPMD mode allows any number of threads; for generic mode we round down to
  // a multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (mapping::isSPMDMode())
    return NumThreads;

  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}
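
// For illustration (hypothetical values, assuming a warp size of 32): in
// generic mode a `num_threads(50)` clause is rounded down to 50 & ~31 == 32
// threads, while `num_threads(20)` yields a single thread since 20 is below
// the warp size. In SPMD mode both values are used as given.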

// Invoke an outlined parallel function unwrapping arguments (up to 32).
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
                                              int32_t bound_tid, void *fn,
                                              void **args, int64_t nargs) {
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}
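
// A sketch (illustrative, not the verbatim generated file) of the shape of a
// case included from generated_microtask_cases.gen, assuming the generator
// emits one cast-and-call per argument count:
//
//   case 2:
//     ((void (*)(int32_t *, int32_t *, void *, void *))fn)(
//         &global_tid, &bound_tid, args[0], args[1]);
//     break;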

} // namespace

extern "C" {

[[clang::always_inline]] void
__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                   int32_t num_threads, int proc_bind, void *fn,
                   void *wrapper_fn, void **args, int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();

  // Assert the parallelism level is zero if disabled by the user.
  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
         "nested parallelism while disabled");

  // Handle the serialized case first, same for SPMD/non-SPMD:
  // 1) if-clause(0)
  // 2) parallel in task or other thread state inducing construct
  // 3) nested parallel regions
  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
                   (config::mayUseNestedParallelism() && icv::Level))) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }
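
  // For illustration (hypothetical user code): both of the following take the
  // serialized path above, assuming nested parallelism stays disabled:
  //
  //   #pragma omp parallel if(0)   // 1) if-clause evaluates to false
  //   { ... }
  //
  //   #pragma omp parallel         // outer region runs in parallel ...
  //   #pragma omp parallel         // 3) ... this nested one is serialized
  //   { ... }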

  // From this point forward we know that there is no thread state used.
  ASSERT(state::HasThreadState == false, nullptr);

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
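
  // A PTeamSize of 0 encodes "the full team"; the `!PTeamSize || TId <
  // PTeamSize` guard below then lets every thread of the team execute the
  // region.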
  if (mapping::isSPMDMode()) {
    // Avoid the race between the read of the `icv::Level` above and the write
    // below by synchronizing all threads here.
    synchronize::threadsAligned(atomic::seq_cst);
    {
      // Note that the order here is important. `icv::Level` has to be updated
      // last or the other updates will cause a thread specific state to be
      // created.
      state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                            1u, TId == 0, ident,
                                            /* ForceTeamState */ true);
      state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0,
                                       ident, /* ForceTeamState */ true);
      state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                                 /* ForceTeamState */ true);

      // Synchronize all threads after the main thread (TId == 0) set up the
      // team state properly.
      synchronize::threadsAligned(atomic::acq_rel);

      state::ParallelTeamSize.assert_eq(PTeamSize, ident,
                                        /* ForceTeamState */ true);
      icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true);
      icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);

      // Ensure we synchronize before we run user code to avoid invalidating
      // the assumptions above.
      synchronize::threadsAligned(atomic::relaxed);

      if (!PTeamSize || TId < PTeamSize)
        invokeMicrotask(TId, 0, fn, args, nargs);

      // Synchronize all threads at the end of a parallel region.
      synchronize::threadsAligned(atomic::seq_cst);
    }

    // Synchronize all threads to make sure every thread exits the scope above;
    // otherwise the following assertions and the assumption in
    // __kmpc_target_deinit may not hold.
    synchronize::threadsAligned(atomic::acq_rel);

    state::ParallelTeamSize.assert_eq(1u, ident, /* ForceTeamState */ true);
    icv::ActiveLevel.assert_eq(0u, ident, /* ForceTeamState */ true);
    icv::Level.assert_eq(0u, ident, /* ForceTeamState */ true);

    // Ensure we synchronize to create an aligned region around the
    // assumptions.
    synchronize::threadsAligned(atomic::relaxed);

    return;
  }

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially an
  // active-level set, but they do not have individual ThreadStates yet. If
  // they ever modify the ICVs beyond this point, a ThreadState will be
  // allocated.

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
      [[fallthrough]];
    case 0:
      break;
    }
  }
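
  // Design note: the descending fallthrough chain above copies exactly the
  // first `nargs` pointers without a loop for the common small counts; only
  // more than 16 arguments fall back to the generic loop in the default case.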

  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, true, ident,
                                          /* ForceTeamState */ true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident,
                                          /* ForceTeamState */ true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /* ForceTeamState */ true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                               /* ForceTeamState */ true);

    // Master signals work to activate workers.
    synchronize::threads(atomic::seq_cst);
    // Master waits for workers to signal.
    synchronize::threads(atomic::seq_cst);
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}
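
// For illustration (hypothetical, heavily simplified compiler output): a user
// construct such as
//
//   #pragma omp parallel num_threads(128)
//   { body(); }
//
// is lowered by Clang to a call of roughly this shape:
//
//   void *args[1] = {&captured};
//   __kmpc_parallel_51(ident, gtid, /* if_expr */ 1, /* num_threads */ 128,
//                      proc_bind_default, outlined_fn, wrapper_fn, args,
//                      /* nargs */ 1);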

[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
  return ThreadIsActive;
}
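
// A sketch (under stated assumptions, not the actual runtime source) of the
// generic-mode worker state machine that drives the two entry points above
// and below:
//
//   for (;;) {
//     ParallelRegionFnTy WorkFn = nullptr;
//     synchronize::threads(atomic::seq_cst);  // wait for the master's signal
//     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
//     if (!WorkFn)                            // nullptr: kernel is done
//       return;
//     if (IsActive) {                         // thread is part of the team
//       ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId);
//       __kmpc_kernel_end_parallel();
//     }
//     synchronize::threads(atomic::seq_cst);  // signal the master back
//   }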

[[clang::noinline]] void __kmpc_kernel_end_parallel() {
  // In case an ICV was modified for this thread, a ThreadState was created;
  // drop it now so it does not contaminate the next parallel region.
  ASSERT(!mapping::isSPMDMode(), nullptr);
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode(), nullptr);
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }

int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }

void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
}

#pragma omp end declare target