offload/DeviceRTL/src/Mapping.cpp

   1 //===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 //
  10 //===----------------------------------------------------------------------===//
  11
  12 #include "Mapping.h"
  13 #include "DeviceTypes.h"
  14 #include "DeviceUtils.h"
  15 #include "Interface.h"
  16 #include "State.h"
  17
  18 #pragma omp begin declare target device_type(nohost)
  19
  20 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
  21
  22 using namespace ompx;
  23
  24 namespace ompx {
  25 namespace impl {
  26
  27 // Forward declarations defined to be defined for AMDGCN and NVPTX.
  28 const llvm::omp::GV &getGridValue();
  29 LaneMaskTy activemask();
  30 LaneMaskTy lanemaskLT();
  31 LaneMaskTy lanemaskGT();
  32 uint32_t getThreadIdInWarp();
  33 uint32_t getThreadIdInBlock(int32_t Dim);
  34 uint32_t getNumberOfThreadsInBlock(int32_t Dim);
  35 uint32_t getNumberOfThreadsInKernel();
  36 uint32_t getBlockIdInKernel(int32_t Dim);
  37 uint32_t getNumberOfBlocksInKernel(int32_t Dim);
  38 uint32_t getWarpIdInBlock();
  39 uint32_t getNumberOfWarpsInBlock();
  40
  41 /// AMDGCN Implementation
  42 ///
  43 ///{
  44 #pragma omp begin declare variant match(device = {arch(amdgcn)})
  45
  46 const llvm::omp::GV &getGridValue() {
  47   return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
  48 }
  49
  50 uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
  51   switch (Dim) {
  52   case 0:
  53     return __builtin_amdgcn_workgroup_size_x();
  54   case 1:
  55     return __builtin_amdgcn_workgroup_size_y();
  56   case 2:
  57     return __builtin_amdgcn_workgroup_size_z();
  58   };
  59   UNREACHABLE("Dim outside range!");
  60 }
  61
  62 LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
  63
  64 LaneMaskTy lanemaskLT() {
  65   uint32_t Lane = mapping::getThreadIdInWarp();
  66   int64_t Ballot = mapping::activemask();
  67   uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
  68   return Mask & Ballot;
  69 }
  70
  71 LaneMaskTy lanemaskGT() {
  72   uint32_t Lane = mapping::getThreadIdInWarp();
  73   if (Lane == (mapping::getWarpSize() - 1))
  74     return 0;
  75   int64_t Ballot = mapping::activemask();
  76   uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
  77   return Mask & Ballot;
  78 }
  79
  80 uint32_t getThreadIdInWarp() {
  81   return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
  82 }
  83
  84 uint32_t getThreadIdInBlock(int32_t Dim) {
  85   switch (Dim) {
  86   case 0:
  87     return __builtin_amdgcn_workitem_id_x();
  88   case 1:
  89     return __builtin_amdgcn_workitem_id_y();
  90   case 2:
  91     return __builtin_amdgcn_workitem_id_z();
  92   };
  93   UNREACHABLE("Dim outside range!");
  94 }
  95
  96 uint32_t getNumberOfThreadsInKernel() {
  97   return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
  98          __builtin_amdgcn_grid_size_z();
  99 }
 100
 101 uint32_t getBlockIdInKernel(int32_t Dim) {
 102   switch (Dim) {
 103   case 0:
 104     return __builtin_amdgcn_workgroup_id_x();
 105   case 1:
 106     return __builtin_amdgcn_workgroup_id_y();
 107   case 2:
 108     return __builtin_amdgcn_workgroup_id_z();
 109   };
 110   UNREACHABLE("Dim outside range!");
 111 }
 112
 113 uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
 114   switch (Dim) {
 115   case 0:
 116     return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
 117   case 1:
 118     return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
 119   case 2:
 120     return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
 121   };
 122   UNREACHABLE("Dim outside range!");
 123 }
 124
 125 uint32_t getWarpIdInBlock() {
 126   return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
 127 }
 128
 129 uint32_t getNumberOfWarpsInBlock() {
 130   return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
 131 }
 132
 133 #pragma omp end declare variant
 134 ///}
 135
 136 /// NVPTX Implementation
 137 ///
 138 ///{
 139 #pragma omp begin declare variant match(                                       \
 140         device = {arch(nvptx, nvptx64)},                                       \
 141             implementation = {extension(match_any)})
 142
 143 uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
 144   switch (Dim) {
 145   case 0:
 146     return __nvvm_read_ptx_sreg_ntid_x();
 147   case 1:
 148     return __nvvm_read_ptx_sreg_ntid_y();
 149   case 2:
 150     return __nvvm_read_ptx_sreg_ntid_z();
 151   };
 152   UNREACHABLE("Dim outside range!");
 153 }
 154
 155 const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
 156
 157 LaneMaskTy activemask() { return __nvvm_activemask(); }
 158
 159 LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
 160
 161 LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
 162
 163 uint32_t getThreadIdInBlock(int32_t Dim) {
 164   switch (Dim) {
 165   case 0:
 166     return __nvvm_read_ptx_sreg_tid_x();
 167   case 1:
 168     return __nvvm_read_ptx_sreg_tid_y();
 169   case 2:
 170     return __nvvm_read_ptx_sreg_tid_z();
 171   };
 172   UNREACHABLE("Dim outside range!");
 173 }
 174
 175 uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
 176
 177 uint32_t getBlockIdInKernel(int32_t Dim) {
 178   switch (Dim) {
 179   case 0:
 180     return __nvvm_read_ptx_sreg_ctaid_x();
 181   case 1:
 182     return __nvvm_read_ptx_sreg_ctaid_y();
 183   case 2:
 184     return __nvvm_read_ptx_sreg_ctaid_z();
 185   };
 186   UNREACHABLE("Dim outside range!");
 187 }
 188
 189 uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
 190   switch (Dim) {
 191   case 0:
 192     return __nvvm_read_ptx_sreg_nctaid_x();
 193   case 1:
 194     return __nvvm_read_ptx_sreg_nctaid_y();
 195   case 2:
 196     return __nvvm_read_ptx_sreg_nctaid_z();
 197   };
 198   UNREACHABLE("Dim outside range!");
 199 }
 200
 201 uint32_t getNumberOfThreadsInKernel() {
 202   return impl::getNumberOfThreadsInBlock(0) *
 203          impl::getNumberOfBlocksInKernel(0) *
 204          impl::getNumberOfThreadsInBlock(1) *
 205          impl::getNumberOfBlocksInKernel(1) *
 206          impl::getNumberOfThreadsInBlock(2) *
 207          impl::getNumberOfBlocksInKernel(2);
 208 }
 209
 210 uint32_t getWarpIdInBlock() {
 211   return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
 212 }
 213
 214 uint32_t getNumberOfWarpsInBlock() {
 215   return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
 216          mapping::getWarpSize();
 217 }
 218
 219 #pragma omp end declare variant
 220 ///}
 221
 222 uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
 223
 224 } // namespace impl
 225 } // namespace ompx
 226
 227 /// We have to be deliberate about the distinction of `mapping::` and `impl::`
 228 /// below to avoid repeating assumptions or including irrelevant ones.
 229 ///{
 230
 231 static bool isInLastWarp() {
 232   uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
 233                      ~(mapping::getWarpSize() - 1);
 234   return mapping::getThreadIdInBlock() == MainTId;
 235 }
 236
 237 bool mapping::isMainThreadInGenericMode(bool IsSPMD) {
 238   if (IsSPMD || icv::Level)
 239     return false;
 240
 241   // Check if this is the last warp in the block.
 242   return isInLastWarp();
 243 }
 244
 245 bool mapping::isMainThreadInGenericMode() {
 246   return mapping::isMainThreadInGenericMode(mapping::isSPMDMode());
 247 }
 248
 249 bool mapping::isInitialThreadInLevel0(bool IsSPMD) {
 250   if (IsSPMD)
 251     return mapping::getThreadIdInBlock() == 0;
 252   return isInLastWarp();
 253 }
 254
 255 bool mapping::isLeaderInWarp() {
 256   __kmpc_impl_lanemask_t Active = mapping::activemask();
 257   __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
 258   return utils::popc(Active & LaneMaskLT) == 0;
 259 }
 260
 261 LaneMaskTy mapping::activemask() { return impl::activemask(); }
 262
 263 LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
 264
 265 LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
 266
 267 uint32_t mapping::getThreadIdInWarp() {
 268   uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
 269   ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
 270   return ThreadIdInWarp;
 271 }
 272
 273 uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
 274   uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
 275   return ThreadIdInBlock;
 276 }
 277
 278 uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
 279
 280 uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
 281   uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
 282   // If we are in SPMD mode, remove one warp.
 283   return BlockSize - (!IsSPMD * impl::getWarpSize());
 284 }
 285 uint32_t mapping::getMaxTeamThreads() {
 286   return mapping::getMaxTeamThreads(mapping::isSPMDMode());
 287 }
 288
 289 uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
 290   return impl::getNumberOfThreadsInBlock(Dim);
 291 }
 292
 293 uint32_t mapping::getNumberOfThreadsInKernel() {
 294   return impl::getNumberOfThreadsInKernel();
 295 }
 296
 297 uint32_t mapping::getWarpIdInBlock() {
 298   uint32_t WarpID = impl::getWarpIdInBlock();
 299   ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
 300   return WarpID;
 301 }
 302
 303 uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
 304   uint32_t BlockId = impl::getBlockIdInKernel(Dim);
 305   ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
 306   return BlockId;
 307 }
 308
 309 uint32_t mapping::getNumberOfWarpsInBlock() {
 310   uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
 311   ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
 312   return NumberOfWarpsInBlocks;
 313 }
 314
 315 uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
 316   uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
 317   ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
 318   return NumberOfBlocks;
 319 }
 320
 321 uint32_t mapping::getNumberOfProcessorElements() {
 322   return static_cast<uint32_t>(config::getHardwareParallelism());
 323 }
 324
 325 ///}
 326
 327 /// Execution mode
 328 ///
 329 ///{
 330
 331 // TODO: This is a workaround for initialization coming from kernels outside of
 332 //       the TU. We will need to solve this more correctly in the future.
 333 [[gnu::weak]] int SHARED(IsSPMDMode);
 334
 335 void mapping::init(bool IsSPMD) {
 336   if (mapping::isInitialThreadInLevel0(IsSPMD))
 337     IsSPMDMode = IsSPMD;
 338 }
 339
 340 bool mapping::isSPMDMode() { return IsSPMDMode; }
 341
 342 bool mapping::isGenericMode() { return !isSPMDMode(); }
 343 ///}
 344
 345 extern "C" {
 346 [[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() {
 347   return mapping::getThreadIdInBlock();
 348 }
 349
 350 [[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
 351   return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
 352 }
 353
 354 [[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
 355   return impl::getWarpSize();
 356 }
 357 }
 358
 359 #define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME)                                \
 360   extern "C" int ompx_##NAME(int Dim) { return mapping::MAPPER_NAME(Dim); }
 361
 362 _TGT_KERNEL_LANGUAGE(thread_id, getThreadIdInBlock)
 363 _TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
 364 _TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
 365 _TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
 366
 367 extern "C" {
 368 uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
 369   return utils::ballotSync(mask, pred);
 370 }
 371
 372 int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) {
 373   return utils::shuffleDown(mask, var, delta, width);
 374 }
 375
 376 float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta,
 377                             int width) {
 378   return utils::convertViaPun<float>(utils::shuffleDown(
 379       mask, utils::convertViaPun<int32_t>(var), delta, width));
 380 }
 381
 382 long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) {
 383   return utils::shuffleDown(mask, var, delta, width);
 384 }
 385
 386 double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
 387                              int width) {
 388   return utils::convertViaPun<double>(utils::shuffleDown(
 389       mask, utils::convertViaPun<int64_t>(var), delta, width));
 390 }
 391 }
 392
 393 #pragma omp end declare target