//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Definitions of target specific functions
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "target_impl.h"
#include "target_interface.h"

// Implementations initially derived from hcc

// Initialized with a 64-bit mask with bits set in positions less than the
// thread's lane number in the warp
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
  uint32_t lane = GetLaneId();
  int64_t ballot = __kmpc_impl_activemask();
  uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
  return mask & ballot;
}
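// For example, for lane == 3 the mask is 0b0111, so the result reports
// exactly the currently active lanes below this one: lanes 0, 1 and 2.
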
// Initialized with a 64-bit mask with bits set in positions greater than the
// thread's lane number in the warp
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
  uint32_t lane = GetLaneId();
  if (lane == (WARPSIZE - 1))
    return 0;
  uint64_t ballot = __kmpc_impl_activemask();
  uint64_t mask = (~((uint64_t)0)) << (lane + 1);
  return mask & ballot;
}
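// For example, for lane == 3 the mask covers bits 4..63. The early return
// for the last lane matters: it avoids shifting a 64-bit value by 64 bits,
// which is undefined behaviour in C++.
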
EXTERN double __kmpc_impl_get_wtick() { return ((double)1E-9); }

EXTERN double __kmpc_impl_get_wtime() {
  // The intrinsics for measuring time have an undocumented frequency. This
  // will probably need to be found by measurement on a number of
  // architectures. Until then, return 0, which is very inaccurate as a
  // timer but resolves the undefined symbol at link time.
  return 0;
}

EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
  return __builtin_amdgcn_read_exec();
}

static void pteam_mem_barrier(uint32_t num_threads, uint32_t *barrier_state) {
  __atomic_thread_fence(__ATOMIC_ACQUIRE);

  uint32_t num_waves = (num_threads + WARPSIZE - 1) / WARPSIZE;

  // Partial barrier implementation for amdgcn.
  // Uses two 16 bit unsigned counters. One for the number of waves to have
  // reached the barrier, and one to count how many times the barrier has been
  // passed. These are packed in a single atomically accessed 32 bit integer.
  // Low bits for the number of waves, assumed zero before this call.
  // High bits to count the number of times the barrier has been passed.

  // precondition: num_waves != 0;
  // invariant: num_waves * WARPSIZE == num_threads;
  // precondition: num_waves < 0xffffu;

  // Increment the low 16 bits once, using the lowest active thread.
  uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
  bool isLowest = GetLaneId() == lowestActiveThread;

  if (isLowest) {
    uint32_t load = __atomic_fetch_add(barrier_state, 1,
                                       __ATOMIC_RELAXED); // commutative

    // Record the number of times the barrier has been passed
    uint32_t generation = load & 0xffff0000u;

    if ((load & 0x0000ffffu) == (num_waves - 1)) {
      // Reached num_waves in low bits so this is the last wave.
      // Set low bits to zero and increment high bits
      load += 0x00010000u; // wrap is safe
      load &= 0xffff0000u; // because bits zeroed second

      // Reset the wave counter and release the waiting waves
      __atomic_store_n(barrier_state, load, __ATOMIC_RELAXED);
    } else {
      // more waves still to go, spin until generation counter changes
      do {
        __builtin_amdgcn_s_sleep(0);
        load = __atomic_load_n(barrier_state, __ATOMIC_RELAXED);
      } while ((load & 0xffff0000u) == generation);
    }
  }
  __atomic_thread_fence(__ATOMIC_RELEASE);
}
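// Worked example, assuming num_waves == 2 and an initial state of 0: the
// first wave's fetch_add observes 0 and spins on the generation bits; the
// second wave observes low bits == num_waves - 1, publishes 0x00010000
// (generation incremented, wave count cleared), and the spinning wave sees
// the generation change and proceeds.
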
uint32_t __kmpc_L0_Barrier [[clang::loader_uninitialized]];
#pragma omp allocate(__kmpc_L0_Barrier) allocator(omp_pteam_mem_alloc)

EXTERN void __kmpc_impl_target_init() {
  // Don't have global ctors, and shared memory is not zero init
  __atomic_store_n(&__kmpc_L0_Barrier, 0u, __ATOMIC_RELEASE);
}

EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
  pteam_mem_barrier(num_threads, &__kmpc_L0_Barrier);
}

namespace {
uint32_t get_grid_dim(uint32_t n, uint16_t d) {
  uint32_t q = n / d;
  return q + (n > q * d);
}
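// This is a ceiling division: e.g. n == 1000 work-items with d == 256 per
// group gives q == 3, and since 3 * 256 < 1000 the result rounds up to 4.
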
uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
                           uint16_t group_size) {
  uint32_t r = grid_size - group_id * group_size;
  return (r < group_size) ? r : group_size;
}
} // namespace
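// E.g. with grid_size == 1000 and group_size == 256, groups 0..2 report
// 256 threads while the trailing group (id 3) reports the 232 that remain.
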
EXTERN int __kmpc_get_hardware_num_blocks() {
  return get_grid_dim(__builtin_amdgcn_grid_size_x(),
                      __builtin_amdgcn_workgroup_size_x());
}

EXTERN int __kmpc_get_hardware_num_threads_in_block() {
  return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
                           __builtin_amdgcn_grid_size_x(),
                           __builtin_amdgcn_workgroup_size_x());
}

EXTERN unsigned __kmpc_get_warp_size() {
  return WARPSIZE;
}

EXTERN unsigned GetWarpId() {
  return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE;
}
EXTERN unsigned GetLaneId() {
  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}
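// mbcnt_lo counts the set mask bits below this lane among the low 32 lanes
// and mbcnt_hi adds those among the high 32, so chaining them with a full
// ~0u mask yields the lane's index within the 64-wide wavefront.
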
EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
  return __kmpc_get_hardware_num_threads_in_block();
}

// Atomics
uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
  return __builtin_amdgcn_atomic_inc32(Address, Val, __ATOMIC_SEQ_CST, "");
}
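// Unlike a plain add, the inc builtin wraps: it stores 0 once the current
// value reaches Val, matching the semantics of CUDA's atomicInc.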
uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
  return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
}

uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
  uint32_t R;
  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
  return R;
}

uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) {
  (void)__atomic_compare_exchange(Address, &Compare, &Val, false,
                                  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
  return Compare;
}
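// On failure __atomic_compare_exchange writes the observed value back into
// Compare, so returning Compare yields the prior value of *Address whether
// or not the exchange happened.
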
unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
                                          unsigned long long Val) {
  unsigned long long R;
  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
  return R;
}
unsigned long long __kmpc_atomic_add(unsigned long long *Address,
                                     unsigned long long Val) {
  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}

// Stub implementations. Weak to allow overriding by local versions while
// comparing different potential implementations.
__attribute__((weak)) EXTERN void *__kmpc_impl_malloc(size_t) {
  return nullptr;
}
__attribute__((weak)) EXTERN void __kmpc_impl_free(void *) {}

EXTERN
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t) {
  return -1;
}

EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
  lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
  hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
}

EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
  return (((uint64_t)hi) << 32) | (uint64_t)lo;
}
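// Round trip example: __kmpc_impl_pack(0xDEADBEEFu, 0x12345678u) yields
// 0x12345678DEADBEEF, and __kmpc_impl_unpack of that value restores the
// original lo and hi words.
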
EXTERN void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); }

EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
  // AMDGCN doesn't need to sync threads in a warp
}

EXTERN void __kmpc_impl_threadfence() {
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}

EXTERN void __kmpc_impl_threadfence_block() {
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}

EXTERN void __kmpc_impl_threadfence_system() {
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
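// The second argument selects the synchronization scope: "workgroup" fences
// within the block, "agent" across the device, and the empty string requests
// system scope.
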
// Calls to the AMDGCN layer (assuming 1D layout)
EXTERN int __kmpc_get_hardware_thread_id_in_block() {
  return __builtin_amdgcn_workitem_id_x();
}
EXTERN int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); }

#pragma omp end declare target