//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "Shared/Environment.h"

#include "Allocator.h"
#include "Configuration.h"
#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "LibC.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

#pragma omp begin declare target device_type(nohost)
/// Memory implementation
///
///{

/// External symbol to access dynamic shared memory.
[[gnu::aligned(
    allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)

/// The kernel environment passed to the init method by the compiler.
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);

/// The kernel launch environment passed as argument to the kernel by the
/// runtime.
static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);
/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{
extern "C" {
#ifdef __AMDGPU__

[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }

#else

[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
[[gnu::weak, gnu::leaf]] void free(void *Ptr);

#endif
}
///}
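
// Illustrative sketch (an assumption, not code from this file): a
// device-specific implementation would typically live in a dedicated declare
// variant block so it is only considered for matching devices, roughly:
//
//   #pragma omp begin declare variant match(device = {arch(amdgcn)})
//   [[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
//   [[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
//   #pragma omp end declare variant
//
// The arch selector above is an example; any device without an implementation
// triggers a link-time error instead of silently misbehaving.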
/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
/// In fact, it is a separate stack *per warp*. That means, each warp must push
/// and pop symmetrically or this breaks, badly. The implementation will (aim
/// to) detect non-lock-step warps and fall back to malloc/free. The same will
/// happen if a warp runs out of memory. The master warp in generic mode is
/// special and is given more memory than the rest.
///
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and
  /// pointed to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint64_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
    return utils::alignDown((state::SharedScratchpadSize / NumLanesInBlock),
                            allocator::ALIGNMENT);
  }

  /// Return the top address of the warp data stack, that is the first address
  /// this warp will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all warps.
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
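
/// Illustrative sketch (not part of the runtime): allocations drawn from the
/// smart stack must be released in LIFO order with matching sizes, e.g. via
/// the memory::allocShared/freeShared wrappers defined below. The byte counts
/// and reason strings are example values only.
/// \code
///   void *A = memory::allocShared(16, "example A");
///   void *B = memory::allocShared(32, "example B");
///   memory::freeShared(B, 32, "example B"); // Pop in reverse order.
///   memory::freeShared(A, 16, "example A");
/// \endcode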
void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
  /// be passed in as an argument and the stack rewritten to support it.
  uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  if (config::isDebugMode(DeviceDebugKind::CommonIssues))
    PRINT("Shared memory stack full, fallback to dynamic allocation of global "
          "memory will negatively impact performance.\n");
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");

  return GlobalMemory;
}
void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
  uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);
  if (utils::isSharedMemPtr(Ptr)) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
    PRINT("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}
bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
  ASSERT(LevelVar == Other.LevelVar, nullptr);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
  ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
}
void state::TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = 0;
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.Padding0Val = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  HasThreadState = false;
  ParallelRegionFnVar = nullptr;
}

bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (HasThreadState == Other.HasThreadState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
  ASSERT(HasThreadState == Other.HasThreadState, nullptr);
}

state::TeamStateTy SHARED(ompx::state::TeamState);
state::ThreadStateTy **SHARED(ompx::state::ThreadStates);
namespace {

int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace
void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
                 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    ThreadStates = nullptr;
    KernelEnvironmentPtr = &KernelEnvironment;
    KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
  }
}
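
// Illustrative sketch (an assumption, not code from this file): the kernel
// initialization path in Kernel.cpp is expected to call state::init once per
// kernel launch, before any user code runs, roughly as in
//
//   state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
//
// so that the team state, thread states, and the shared smart stack are in a
// well-defined state for the OpenMP API routines defined below.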
KernelEnvironmentTy &state::getKernelEnvironment() {
  return *KernelEnvironmentPtr;
}

KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
  return *KernelLaunchEnvironmentPtr;
}
void state::enterDataEnvironment(IdentTy *Ident) {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");
  if (!config::mayUseThreadStates())
    return;

  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
      memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
  uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
  if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
    uint32_t Bytes =
        sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
    void *ThreadStatesPtr =
        memory::allocGlobal(Bytes, "Thread state array allocation");
    memset(ThreadStatesPtr, 0, Bytes);
    if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
                     reinterpret_cast<uintptr_t>(ThreadStatesPtr),
                     atomic::seq_cst, atomic::seq_cst))
      memory::freeGlobal(ThreadStatesPtr,
                         "Thread state array allocated multiple times");
    ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
           "Expected valid thread states bit!");
  }
  NewThreadState->init(ThreadStates[TId]);
  TeamState.HasThreadState = true;
  ThreadStates[TId] = NewThreadState;
}
void state::exitDataEnvironment() {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}
void state::resetStateForThread(uint32_t TId) {
  if (!config::mayUseThreadStates())
    return;
  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
  ThreadStates[TId] = PreviousThreadState;
}
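
// Illustrative sketch (not part of the runtime): a thread-private data
// environment is pushed and popped symmetrically around ICV modifications,
// conceptually:
//
//   state::enterDataEnvironment(Ident);  // Push a fresh ThreadStateTy.
//   icv::NThreads = 128;                 // Only this thread sees the change.
//   state::exitDataEnvironment();        // Pop back to the previous state.
//
// The Ident argument and the ICV value above are example placeholders.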
void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}
void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
}
int state::getEffectivePTeamSize() {
  int PTeamSize = state::ParallelTeamSize;
  return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
}
extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) {
  int NT = icv::NThreads;
  return NT > 0 ? NT : mapping::getMaxTeamThreads();
}
int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0, nullptr);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }
void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}
int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
}

int omp_get_num_threads(void) {
  return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
}
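
// Illustrative sketch (not part of the runtime): expected results of the
// query routines above inside a device target region, assuming the parallel
// region actually runs with more than one thread.
//
//   #pragma omp target
//   {
//     // Sequential part: omp_get_level() == 0, omp_get_num_threads() == 1.
//     #pragma omp parallel
//     {
//       // Level 1 is the only level that can be active on the device:
//       //   omp_get_level() == 1
//       //   omp_get_thread_num() in [0, omp_get_num_threads())
//       //   omp_get_team_size(1) == omp_get_num_threads()
//     }
//   }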
int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }
omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_device_num(void) { return config::getDeviceNum(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }

int omp_get_team_num() { return mapping::getBlockIdInKernel(); }

int omp_get_initial_device(void) { return -1; }

int omp_is_initial_device(void) { return 0; }
}
extern "C" {
[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}
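
// Illustrative sketch (an assumption about the typical lowering, not code from
// this file): when a local variable escapes into a parallel region, the
// compiler replaces its private stack slot with a symmetric pair of runtime
// calls, conceptually:
//
//   int *Var = (int *)__kmpc_alloc_shared(sizeof(int));
//   // ... parallel region reads and writes *Var ...
//   __kmpc_free_shared(Var, sizeof(int));
//
// The symmetric alloc/free emission is exactly what the shared memory smart
// stack above relies on.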
void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_target_dynamic_shared_alloc() {
  return __kmpc_get_dynamic_shared();
}

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
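
// Illustrative sketch (not part of the runtime): user code can obtain the
// dynamic shared memory buffer inside a target region, provided the host side
// reserved it (e.g. via the LIBOMPTARGET_SHARED_MEMORY_SIZE environment
// variable, an assumption about the launch configuration):
//
//   #pragma omp target
//   {
//     int *Scratch = (int *)llvm_omp_target_dynamic_shared_alloc();
//     // ... Scratch is shared by all threads of the team ...
//   }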
/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace) \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr) \
    allocator(omp_pteam_mem_alloc)
void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
void __kmpc_end_sharing_variables() {
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
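
// Illustrative sketch (an assumption about the typical use, not code from this
// file): in generic mode the main thread publishes the outlined region's
// arguments before waking the workers, and the workers read them back:
//
//   // Main thread, before the workers start the outlined function:
//   void **Args;
//   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2);
//   Args[0] = &A;
//   Args[1] = &B;
//
//   // Each worker, inside the outlined function:
//   void **WorkerArgs;
//   __kmpc_get_shared_variables(&WorkerArgs);
//
//   // Main thread, after the region completed:
//   __kmpc_end_sharing_variables();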

#pragma omp end declare target