//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//      if (master) {
//        sequential code, decide which parallel loop to do, or if finished
//        __kmpc_kernel_prepare_parallel() // exec by master only
//      }
//      syncthreads // A
//      __kmpc_kernel_parallel() // exec by all
//      if (this thread is included in the parallel) {
//        switch () for all parallel loops
//        __kmpc_kernel_end_parallel() // exec only by threads in parallel
//      }
//    }
//
//    The reason we don't execute end_parallel for the threads not included
//    in the parallel loop is that, for each barrier in the parallel
//    region, these non-included threads will cycle through the
//    syncthreads A. Thus they must preserve their current threadId, which
//    is larger than the number of threads in the team.
//
//    To make a long story short...
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

#pragma omp declare target

namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getBlockSize();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

  // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
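  // E.g., with a 32-lane warp, a request for 47 threads yields 32 below, and a
  // request smaller than the warp size falls back to a single thread.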
  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}

// Invoke an outlined parallel function unwrapping arguments (up to 32).
void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
                     void **args, int64_t nargs) {
  DebugEntryRAII Entry(__FILE__, __LINE__, "<OpenMP Outlined Function>");
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

} // namespace

extern "C" {

void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                        int32_t num_threads, int proc_bind, void *fn,
                        void *wrapper_fn, void **args, int64_t nargs) {
  FunctionTracingRAII();

  uint32_t TId = mapping::getThreadIdInBlock();
  // Handle the serialized case first, same for SPMD/non-SPMD.
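  // A parallel region with an if-clause evaluating to false, or one nested
  // inside an already active level, runs serialized on the encountering
  // thread with an implicit team of one.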
  if (OMP_UNLIKELY(!if_expr || icv::Level)) {
    state::enterDataEnvironment();
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    state::exitDataEnvironment();
    return;
  }

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  if (mapping::isSPMDMode()) {
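    // In SPMD mode every thread of the block already executes this call site,
    // so the team state is set up cooperatively here instead of going through
    // the generic-mode worker state machine further below.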
    // Avoid the race between the read of the `icv::Level` above and the write
    // below by synchronizing all threads here.
    synchronize::threadsAligned();
    {
      // Note that the order here is important. `icv::Level` has to be updated
      // last or the other updates will cause a thread specific state to be
      // created.
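      // Each ValueRAII swaps in the new value (here only from thread 0) and
      // restores the previous one when it goes out of scope.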
      state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                            1u, TId == 0);
      state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0);
      state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0);

      // Synchronize all threads after the main thread (TId == 0) set up the
      // team state properly.
      synchronize::threadsAligned();

      ASSERT(state::ParallelTeamSize == NumThreads);
      ASSERT(icv::ActiveLevel == 1u);
      ASSERT(icv::Level == 1u);

      if (TId < NumThreads)
        invokeMicrotask(TId, 0, fn, args, nargs);

      // Synchronize all threads at the end of a parallel region.
      synchronize::threadsAligned();
    }

    // Synchronize all threads to make sure every thread exits the scope above;
    // otherwise the following assertions and the assumption in
    // __kmpc_target_deinit may not hold.
    synchronize::threadsAligned();

    ASSERT(state::ParallelTeamSize == 1u);
    ASSERT(icv::ActiveLevel == 0u);
    ASSERT(icv::Level == 0u);
    return;
  }

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially active-level
  // set, but they do not have individual ThreadStates yet. If they ever modify
  // the ICVs beyond this point, a ThreadState will be allocated.
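  // For example, a call to omp_set_num_threads() inside the region would
  // trigger that lazy ThreadState allocation.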

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }
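
  // An active region in generic mode: share the outlined function's arguments
  // with the worker threads through storage visible to the whole team before
  // waking them up.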
  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    for (int I = 0; I < nargs; I++)
      GlobalArgs[I] = args[I];
  }
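
  // The scope below publishes the team size and the wrapper function for the
  // workers; the first barrier releases them into __kmpc_kernel_parallel, the
  // second waits until they are done.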
  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                          1u, true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true);

    // Master signals work to activate workers.
    synchronize::threads();
    // Master waits for workers to signal.
    synchronize::threads();
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}
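
// Called by the workers from the generic-mode state machine. Returns the
// current work function through *WorkFn and reports whether the calling
// thread takes part in the parallel region.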
__attribute__((noinline)) bool
__kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  FunctionTracingRAII();
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::ParallelTeamSize;
  return ThreadIsActive;
}
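
// Executed in generic (non-SPMD) mode only, by the threads that took part in
// the parallel region, once they are done with the work function.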
__attribute__((noinline)) void __kmpc_kernel_end_parallel() {
  FunctionTracingRAII();
  // In case we modified an ICV for this thread before a ThreadState was
  // created, drop it now so it does not contaminate the next parallel region.
  ASSERT(!mapping::isSPMDMode());
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode());
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) {
  FunctionTracingRAII();
  return omp_get_level();
}

int32_t __kmpc_global_thread_num(IdentTy *) {
  FunctionTracingRAII();
  return omp_get_thread_num();
}
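
// The num_teams, thread_limit, and proc_bind clauses are handled before the
// kernel is launched or have no effect on the device, so the entry points
// below are intentionally empty.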
void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {
  FunctionTracingRAII();
}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {
  FunctionTracingRAII();
}

} // extern "C"

#pragma omp end declare target