//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace ompx;
#pragma omp begin declare target device_type(nohost)
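// Reduce the values of all lanes of a fully active warp. Each round halves
// the shuffle offset until the warp master (lane 0) holds the combined value;
// the actual combine step is performed by the compiler-generated
// shuffle-and-reduce function shflFct.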
void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ mask, /*AlgoVersion=*/0);
  }
}

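// Reduce the values of a partially active warp in which only the first
// `size` contiguous lanes participate; `tid` is the calling lane's position
// within that group. The active range is halved each round until lane 0
// holds the result.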
void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
                               uint32_t size, uint32_t tid) {
  uint32_t curr_size = size;
  uint32_t mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

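// Reduce over the currently active, possibly non-contiguous, lanes of a warp.
// Only compiled for pre-Volta targets (see the __CUDA_ARCH__ guard below),
// where such dispersed lanes can occur in L2 parallel regions. Returns 1 in
// the lane that ends up holding the reduced value.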
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                          ShuffleReductFnTy shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
  do {
    Liveness = mapping::activemask();
    remote_id = utils::ffs(Liveness & lanemask_gt);
    size = utils::popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif

static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
                                            uint64_t reduce_size,
                                            void *reduce_data,
                                            ShuffleReductFnTy shflFct,
                                            InterWarpCopyFnTy cpyFct,
                                            bool isSPMDExecutionMode, bool) {
  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
    BlockThreadId = 0;
  uint32_t NumThreads = omp_get_num_threads();
  if (NumThreads == 1)
    return 1;

  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded =
      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
  uint32_t WarpId = mapping::getWarpIdInBlock();

  // Volta execution model:
  // For the Generic execution mode a parallel region either has 1 thread and
  // beyond that, always a multiple of 32. For the SPMD execution mode we may
  // have any number of threads.
  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());

  // When we have more than [mapping::getWarpSize()] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  if (Liveness == lanes::All) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/utils::popc(Liveness),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
  else { // Dispersed lanes. Only threads in L2
         // parallel region may enter here; return
         // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);
  }

  // When we have more than [mapping::getWarpSize()] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case of
  // an L2 parallel region.
  return TId == 0;
#endif // __CUDA_ARCH__ >= 700
}

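// Round `s` down to a multiple of the warp size; values smaller than a warp
// round to 1.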
uint32_t roundToWarpsize(uint32_t s) {
  if (s < mapping::getWarpSize())
    return 1;
  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}

uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }

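// KMPC entry point for the parallel reduction. It is called from
// compiler-generated code and returns 1 only in the thread that holds the
// reduced value. A rough, illustrative sketch of a call site (names are
// hypothetical, not actual clang codegen):
//
//   if (__kmpc_nvptx_parallel_reduce_nowait_v2(Loc, TId, /*num_vars=*/1,
//                                              sizeof(RedList), &RedList,
//                                              ShuffleFn, InterWarpCopyFn)) {
//     *OriginalVar = RedList.Var; // Winning thread folds the result back.
//     __kmpc_nvptx_end_reduce_nowait(TId);
//   }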
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
                                      shflFct, cpyFct, mapping::isSPMDMode(),
                                      false);
}

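// KMPC entry point for the teams reduction. Each team copies or accumulates
// its partial result into one of the `num_of_records` slots of GlobalBuffer;
// the last team to arrive reduces the buffer contents and returns 1 in a
// single thread, which then holds the final value.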
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
    ListGlobalFnTy glredFct) {
  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

  uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block progress for teams greater than the current upper
  // limit. We always only allow a number of teams less or equal
  // to the number of slots in the buffer.
  bool IsMaster = (ThreadId == 0);
  while (IsMaster) {
    Bound = atomic::load(&IterCnt, atomic::aquire);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBockId = TeamId % num_of_records;
    if (TeamId < num_of_records) {
      lgcpyFct(GlobalBuffer, ModBockId, reduce_data);
    } else
      lgredFct(GlobalBuffer, ModBockId, reduce_data);

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // BUFFER_SIZE chunk.
    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
                                 atomic::MemScopeTy::device);
  }

  if (mapping::isSPMDMode())
    synchronize::threadsAligned(atomic::acq_rel);
  else
    fence::kernel(atomic::acq_rel);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it in local memory:
  // local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  //     num_of_records
  //
  // local_data_reduce layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //      1. do reduction within each warp.
  //      2. do reduction across warps.
  //      3. write the final result to the main reduction variable
  //         by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(GlobalBuffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When we have more than [mapping::getWarpSize()] number of threads
      // a block reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > mapping::getWarpSize()) {
        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
                               mapping::getWarpSize();
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / mapping::getWarpSize();
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
    }
    return 1;
  }

  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow SIZE number of teams to proceed writing their
    // intermediate results to the global buffer.
    atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst);
  }

  return 0;
}

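// No-op entry points of the KMPC reduction interface.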
void __kmpc_nvptx_end_reduce(int32_t TId) {}

void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {}

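// Return the preallocated reduction scratch buffer provided via the kernel
// launch environment.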
void *__kmpc_reduction_get_fixed_buffer() {
  return state::getKernelLaunchEnvironment().ReductionBuffer;
}

#pragma omp end declare target