//===---- omptargeti.h - OpenMP GPU initialization --------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the declarations of all library macros, types,
// and functions.
//
//===----------------------------------------------------------------------===//

////////////////////////////////////////////////////////////////////////////////
// Task Descriptor
////////////////////////////////////////////////////////////////////////////////

INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
  // sched is stored as 0..3 in the flags; omp_sched_t starts from 1..4, so add
  // 1 here
  uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
  return (omp_sched_t)rc;
}

INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
  // sched starts from 1..4; encode it as 0..3, so subtract 1 here
  uint8_t val = ((uint8_t)sched) - 1;
  // clear current sched
  items.flags &= ~TaskDescr_SchedMask;
  // set new sched
  items.flags |= val;
}
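
// Illustrative round trip of the encoding above (a sketch, not part of the
// runtime; it assumes the standard OpenMP values where omp_sched_static == 1
// and a descriptor 'td' that is otherwise initialized):
//
//   td.SetRuntimeSched(omp_sched_static); // stores 1 - 1 == 0 in the
//                                         // TaskDescr_SchedMask bits
//   omp_sched_t s = td.GetRuntimeSched(); // reads 0 back, returns
//                                         // (omp_sched_t)(0 + 1), i.e.
//                                         // omp_sched_static again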

INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
  // default sched is static,
  // dyn is off (unused now anyway, but may need to sample from host?)
  items.flags = 0;
  items.threadId = 0;         // is master
  items.runtimeChunkSize = 1; // preferred static chunking with chunk 1
}

// This is called when all threads are started together in SPMD mode.
// OMP directives include target parallel, target distribute parallel for, etc.
INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  // default sched is static,
  // dyn is off (unused now anyway, but may need to sample from host?)
  items.flags =
      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
  items.threadId =
      __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
                                                // called for 1st level)
  items.runtimeChunkSize = 1; // preferred static chunking with chunk 1
  prev = parentTaskDescr;
}

INLINE void omptarget_nvptx_TaskDescr::CopyData(
    omptarget_nvptx_TaskDescr *sourceTaskDescr) {
  items = sourceTaskDescr->items;
}

INLINE void
omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
  CopyData(sourceTaskDescr);
  prev = sourceTaskDescr->prev;
}

INLINE void omptarget_nvptx_TaskDescr::CopyParent(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  CopyData(parentTaskDescr);
  prev = parentTaskDescr;
}

INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  CopyParent(parentTaskDescr);
  items.flags = items.flags & ~TaskDescr_IsParConstr;
  ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
}

INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
    omptarget_nvptx_TaskDescr *masterTaskDescr) {
  CopyParent(masterTaskDescr);
  // overwrite specific items;
  items.flags |=
      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
}

INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
    omptarget_nvptx_TaskDescr *workTaskDescr) {
  Copy(workTaskDescr);
  // overwrite specific items;
  //
  // The threadID should be __kmpc_get_hardware_thread_id_in_block() %
  // GetMasterThreadID(). This is so that the serial master (first lane in the
  // master warp) gets a threadId of 0. However, we know that this function is
  // always called in a parallel region where only workers are active. The
  // serial master thread never enters this region. When a parallel region is
  // executed serially, the threadId is set to 0 elsewhere and the
  // kmpc_serialized_* functions are called, which never activate this region.
  items.threadId =
      __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
                                                // called for 1st level)
}
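
// Worked example of the reasoning above (assumed numbers, not taken from this
// file): in a 128-thread block where the serial master sits in the last warp,
// GetMasterThreadID() would be 96; workers have hardware ids 0..95, so
// id % 96 == id and the modulo can be dropped, while the master (id 96) never
// reaches this copy.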

INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
    omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
  CopyParent(parentTaskDescr);
  items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
  items.threadId = tid;
}

INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
  loopData.loopUpperBound =
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId);
  loopData.nextLowerBound =
      omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId);
  loopData.schedule =
      omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId);
  loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId);
  loopData.stride =
      omptarget_nvptx_threadPrivateContext->Stride(items.threadId);
}

INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
  omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk;
  omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) =
      loopData.loopUpperBound;
  omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) =
      loopData.nextLowerBound;
  omptarget_nvptx_threadPrivateContext->Stride(items.threadId) =
      loopData.stride;
  omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) =
      loopData.schedule;
}
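
// Sketch of how SaveLoopData/RestoreLoopData are meant to pair up
// (hypothetical caller, not part of this file): the per-thread dispatch slots
// in omptarget_nvptx_threadPrivateContext are shared across nesting levels, so
// a descriptor stashes them before an inner construct reuses the slots and
// puts them back afterwards.
//
//   taskDescr->SaveLoopData();    // stash bounds, stride, chunk, schedule
//   // ... inner construct overwrites the per-thread loop slots ...
//   taskDescr->RestoreLoopData(); // outer loop resumes with its own state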

////////////////////////////////////////////////////////////////////////////////
// Thread Private Context
////////////////////////////////////////////////////////////////////////////////

INLINE omptarget_nvptx_TaskDescr *
omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
  ASSERT0(
      LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
      "Getting top level, tid is larger than allocated data structure size");
  return topTaskDescr[tid];
}

INLINE void
omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
  // levelOneTaskDescr is init when starting the parallel region
  // top task descr is NULL (team master version will be fixed separately)
  topTaskDescr[tid] = NULL;
  // the following don't need to be init here; they are init when using dynamic
  // scheduling: current_Event, events_Number, chunk, num_Iterations, schedule
}

////////////////////////////////////////////////////////////////////////////////
// Team Descriptor
////////////////////////////////////////////////////////////////////////////////

INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
  levelZeroTaskDescr.InitLevelZeroTaskDescr();
}

////////////////////////////////////////////////////////////////////////////////
// Get private data structure for thread
////////////////////////////////////////////////////////////////////////////////

// Utility routines for CUDA threads
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
  return omptarget_nvptx_threadPrivateContext->TeamContext();
}

INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
  omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
  return currTeamDescr.WorkDescr();
}

INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
  return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
}

INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
  return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
}

////////////////////////////////////////////////////////////////////////////////
// Memory management runtime functions.
////////////////////////////////////////////////////////////////////////////////

INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
          "SlotIdx is too big or uninitialized.");
  ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
          "MemIdx is too big or uninitialized.");
  MemDataTy &MD = MemData[usedSlotIdx];
  __kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u);
}

INLINE const void *
omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf, size_t size) {
  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
          "SlotIdx is too big or uninitialized.");
  const unsigned sm = usedSlotIdx;
  MemDataTy &MD = MemData[sm];
  unsigned i = hash(GetBlockIdInKernel());
  // Linearly probe the keys for this SM until an unclaimed slot (key == 0) is
  // found; the CAS marks it as claimed.
  while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) {
    i = hash(i + 1);
  }
  usedMemIdx = i;
  return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
}
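
// Usage sketch (hypothetical caller; the manager instance and buffer layout
// are assumptions, not defined in this file): Acquire claims one of the
// OMP_STATE_COUNT per-SM slots by probing MD.keys with an atomic CAS and
// returns the matching sub-buffer of 'buf'; Release clears the claimed key so
// another block can reuse that slot.
//
//   const void *state = simpleMemoryManager.Acquire(stateBuffer, stateSize);
//   // ... use the returned region ...
//   simpleMemoryManager.Release();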