openmp/libomptarget/DeviceRTL/src/State.cpp

   1 //===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 //===----------------------------------------------------------------------===//
  10
  11 #include "State.h"
  12 #include "Allocator.h"
  13 #include "Configuration.h"
  14 #include "Debug.h"
  15 #include "Environment.h"
  16 #include "Interface.h"
  17 #include "LibC.h"
  18 #include "Mapping.h"
  19 #include "Synchronization.h"
  20 #include "Types.h"
  21 #include "Utils.h"
  22
  23 using namespace ompx;
  24
  25 #pragma omp begin declare target device_type(nohost)
  26
  27 /// Memory implementation
  28 ///
  29 ///{
  30
  31 /// External symbol to access dynamic shared memory.
  32 [[gnu::aligned(
  33     allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
  34 #pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
  35
  36 /// The kernel environment passed to the init method by the compiler.
  37 static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
  38
  39 /// The kernel launch environment passed as argument to the kernel by the
  40 /// runtime.
  41 static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);
  42
  43 ///}
  44
  45 namespace {
  46
  47 /// Fallback implementations are missing to trigger a link time error.
  48 /// Implementations for new devices, including the host, should go into a
  49 /// dedicated begin/end declare variant.
  50 ///
  51 ///{
  52 extern "C" {
  53 #ifdef __AMDGPU__
  54
  55 [[gnu::weak]] void *malloc(uint64_t Size) { return allocator::alloc(Size); }
  56 [[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
  57
  58 #else
  59
  60 [[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size);
  61 [[gnu::weak, gnu::leaf]] void free(void *Ptr);
  62
  63 #endif
  64 }
  65 ///}
  66
  67 /// A "smart" stack in shared memory.
  68 ///
  69 /// The stack exposes a malloc/free interface but works like a stack internally.
  70 /// In fact, it is a separate stack *per warp*. That means, each warp must push
  71 /// and pop symmetrically or this breaks, badly. The implementation will (aim
  72 /// to) detect non-lock-step warps and fallback to malloc/free. The same will
  73 /// happen if a warp runs out of memory. The master warp in generic memory is
  74 /// special and is given more memory than the rest.
  75 ///
  76 struct SharedMemorySmartStackTy {
  77   /// Initialize the stack. Must be called by all threads.
  78   void init(bool IsSPMD);
  79
  80   /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  81   /// can call this function.
  82   void *push(uint64_t Bytes);
  83
  84   /// Deallocate the last allocation made by the encountering thread and pointed
  85   /// to by \p Ptr from the stack. Each thread can call this function.
  86   void pop(void *Ptr, uint32_t Bytes);
  87
  88 private:
  89   /// Compute the size of the storage space reserved for a thread.
  90   uint32_t computeThreadStorageTotal() {
  91     uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
  92     return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
  93                              allocator::ALIGNMENT);
  94   }
  95
  96   /// Return the top address of the warp data stack, that is the first address
  97   /// this warp will allocate memory at next.
  98   void *getThreadDataTop(uint32_t TId) {
  99     return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
 100   }
 101
 102   /// The actual storage, shared among all warps.
 103   [[gnu::aligned(
 104       allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
 105   [[gnu::aligned(
 106       allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
 107 };
 108
 109 static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
 110               "Shared scratchpad of this size not supported yet.");
 111
 112 /// The allocation of a single shared memory scratchpad.
 113 static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
 114
 115 void SharedMemorySmartStackTy::init(bool IsSPMD) {
 116   Usage[mapping::getThreadIdInBlock()] = 0;
 117 }
 118
 119 void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
 120   // First align the number of requested bytes.
 121   /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
 122   /// be passed in as an argument and the stack rewritten to support it.
 123   uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
 124
 125   uint32_t StorageTotal = computeThreadStorageTotal();
 126
 127   // The main thread in generic mode gets the space of its entire warp as the
 128   // other threads do not participate in any computation at all.
 129   if (mapping::isMainThreadInGenericMode())
 130     StorageTotal *= mapping::getWarpSize();
 131
 132   int TId = mapping::getThreadIdInBlock();
 133   if (Usage[TId] + AlignedBytes <= StorageTotal) {
 134     void *Ptr = getThreadDataTop(TId);
 135     Usage[TId] += AlignedBytes;
 136     return Ptr;
 137   }
 138
 139   if (config::isDebugMode(DeviceDebugKind::CommonIssues))
 140     PRINT("Shared memory stack full, fallback to dynamic allocation of global "
 141           "memory will negatively impact performance.\n");
 142   void *GlobalMemory = memory::allocGlobal(
 143       AlignedBytes, "Slow path shared memory allocation, insufficient "
 144                     "shared memory stack memory!");
 145   ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");
 146
 147   return GlobalMemory;
 148 }
 149
 150 void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
 151   uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
 152   if (utils::isSharedMemPtr(Ptr)) {
 153     int TId = mapping::getThreadIdInBlock();
 154     Usage[TId] -= AlignedBytes;
 155     return;
 156   }
 157   memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
 158 }
 159
 160 } // namespace
 161
 162 void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
 163
 164 void *memory::allocShared(uint64_t Bytes, const char *Reason) {
 165   return SharedMemorySmartStack.push(Bytes);
 166 }
 167
 168 void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
 169   SharedMemorySmartStack.pop(Ptr, Bytes);
 170 }
 171
 172 void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
 173   void *Ptr = malloc(Bytes);
 174   if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
 175     PRINT("nullptr returned by malloc!\n");
 176   return Ptr;
 177 }
 178
 179 void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
 180
 181 ///}
 182
 183 bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
 184   return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
 185          (ActiveLevelVar == Other.ActiveLevelVar) &
 186          (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
 187          (RunSchedVar == Other.RunSchedVar) &
 188          (RunSchedChunkVar == Other.RunSchedChunkVar);
 189 }
 190
 191 void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
 192   ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
 193   ASSERT(LevelVar == Other.LevelVar, nullptr);
 194   ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
 195   ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
 196   ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
 197   ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
 198 }
 199
 200 void state::TeamStateTy::init(bool IsSPMD) {
 201   ICVState.NThreadsVar = 0;
 202   ICVState.LevelVar = 0;
 203   ICVState.ActiveLevelVar = 0;
 204   ICVState.Padding0Val = 0;
 205   ICVState.MaxActiveLevelsVar = 1;
 206   ICVState.RunSchedVar = omp_sched_static;
 207   ICVState.RunSchedChunkVar = 1;
 208   ParallelTeamSize = 1;
 209   HasThreadState = false;
 210   ParallelRegionFnVar = nullptr;
 211 }
 212
 213 bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
 214   return (ICVState == Other.ICVState) &
 215          (HasThreadState == Other.HasThreadState) &
 216          (ParallelTeamSize == Other.ParallelTeamSize);
 217 }
 218
 219 void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
 220   ICVState.assertEqual(Other.ICVState);
 221   ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
 222   ASSERT(HasThreadState == Other.HasThreadState, nullptr);
 223 }
 224
 225 state::TeamStateTy SHARED(ompx::state::TeamState);
 226 state::ThreadStateTy **SHARED(ompx::state::ThreadStates);
 227
 228 namespace {
 229
 230 int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
 231                              int OutOfBoundsVal = -1) {
 232   if (Level == 0)
 233     return DefaultVal;
 234   int LevelVar = omp_get_level();
 235   if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
 236     return OutOfBoundsVal;
 237   int ActiveLevel = icv::ActiveLevel;
 238   if (OMP_UNLIKELY(Level != ActiveLevel))
 239     return DefaultVal;
 240   return Val;
 241 }
 242
 243 } // namespace
 244
 245 void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
 246                  KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
 247   SharedMemorySmartStack.init(IsSPMD);
 248   if (mapping::isInitialThreadInLevel0(IsSPMD)) {
 249     TeamState.init(IsSPMD);
 250     ThreadStates = nullptr;
 251     KernelEnvironmentPtr = &KernelEnvironment;
 252     KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
 253   }
 254 }
 255
 256 KernelEnvironmentTy &state::getKernelEnvironment() {
 257   return *KernelEnvironmentPtr;
 258 }
 259
 260 KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
 261   return *KernelLaunchEnvironmentPtr;
 262 }
 263
 264 void state::enterDataEnvironment(IdentTy *Ident) {
 265   ASSERT(config::mayUseThreadStates(),
 266          "Thread state modified while explicitly disabled!");
 267   if (!config::mayUseThreadStates())
 268     return;
 269
 270   unsigned TId = mapping::getThreadIdInBlock();
 271   ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
 272       memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
 273   uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
 274   if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
 275     uint32_t Bytes =
 276         sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
 277     void *ThreadStatesPtr =
 278         memory::allocGlobal(Bytes, "Thread state array allocation");
 279     memset(ThreadStatesPtr, 0, Bytes);
 280     if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
 281                      reinterpret_cast<uintptr_t>(ThreadStatesPtr),
 282                      atomic::seq_cst, atomic::seq_cst))
 283       memory::freeGlobal(ThreadStatesPtr,
 284                          "Thread state array allocated multiple times");
 285     ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
 286            "Expected valid thread states bit!");
 287   }
 288   NewThreadState->init(ThreadStates[TId]);
 289   TeamState.HasThreadState = true;
 290   ThreadStates[TId] = NewThreadState;
 291 }
 292
 293 void state::exitDataEnvironment() {
 294   ASSERT(config::mayUseThreadStates(),
 295          "Thread state modified while explicitly disabled!");
 296
 297   unsigned TId = mapping::getThreadIdInBlock();
 298   resetStateForThread(TId);
 299 }
 300
 301 void state::resetStateForThread(uint32_t TId) {
 302   if (!config::mayUseThreadStates())
 303     return;
 304   if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
 305     return;
 306
 307   ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
 308   memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
 309   ThreadStates[TId] = PreviousThreadState;
 310 }
 311
 312 void state::runAndCheckState(void(Func(void))) {
 313   TeamStateTy OldTeamState = TeamState;
 314   OldTeamState.assertEqual(TeamState);
 315
 316   Func();
 317
 318   OldTeamState.assertEqual(TeamState);
 319 }
 320
 321 void state::assumeInitialState(bool IsSPMD) {
 322   TeamStateTy InitialTeamState;
 323   InitialTeamState.init(IsSPMD);
 324   InitialTeamState.assertEqual(TeamState);
 325   ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
 326 }
 327
 328 int state::getEffectivePTeamSize() {
 329   int PTeamSize = state::ParallelTeamSize;
 330   return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
 331 }
 332
 333 extern "C" {
 334 void omp_set_dynamic(int V) {}
 335
 336 int omp_get_dynamic(void) { return 0; }
 337
 338 void omp_set_num_threads(int V) { icv::NThreads = V; }
 339
 340 int omp_get_max_threads(void) {
 341   int NT = icv::NThreads;
 342   return NT > 0 ? NT : mapping::getMaxTeamThreads();
 343 }
 344
 345 int omp_get_level(void) {
 346   int LevelVar = icv::Level;
 347   ASSERT(LevelVar >= 0, nullptr);
 348   return LevelVar;
 349 }
 350
 351 int omp_get_active_level(void) { return !!icv::ActiveLevel; }
 352
 353 int omp_in_parallel(void) { return !!icv::ActiveLevel; }
 354
 355 void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
 356   *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
 357   *ChunkSize = state::RunSchedChunk;
 358 }
 359
 360 void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
 361   icv::RunSched = (int)ScheduleKind;
 362   state::RunSchedChunk = ChunkSize;
 363 }
 364
 365 int omp_get_ancestor_thread_num(int Level) {
 366   return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
 367 }
 368
 369 int omp_get_thread_num(void) {
 370   return omp_get_ancestor_thread_num(omp_get_level());
 371 }
 372
 373 int omp_get_team_size(int Level) {
 374   return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
 375 }
 376
 377 int omp_get_num_threads(void) {
 378   return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
 379 }
 380
 381 int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
 382
 383 int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
 384
 385 void omp_set_nested(int) {}
 386
 387 int omp_get_nested(void) { return false; }
 388
 389 void omp_set_max_active_levels(int Levels) {
 390   icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
 391 }
 392
 393 int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }
 394
 395 omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }
 396
 397 int omp_get_num_places(void) { return 0; }
 398
 399 int omp_get_place_num_procs(int) { return omp_get_num_procs(); }
 400
 401 void omp_get_place_proc_ids(int, int *) {
 402   // TODO
 403 }
 404
 405 int omp_get_place_num(void) { return 0; }
 406
 407 int omp_get_partition_num_places(void) { return 0; }
 408
 409 void omp_get_partition_place_nums(int *) {
 410   // TODO
 411 }
 412
 413 int omp_get_cancellation(void) { return 0; }
 414
 415 void omp_set_default_device(int) {}
 416
 417 int omp_get_default_device(void) { return -1; }
 418
 419 int omp_get_num_devices(void) { return config::getNumDevices(); }
 420
 421 int omp_get_device_num(void) { return config::getDeviceNum(); }
 422
 423 int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }
 424
 425 int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
 426
 427 int omp_get_initial_device(void) { return -1; }
 428 }
 429
 430 extern "C" {
 431 [[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
 432   return memory::allocShared(Bytes, "Frontend alloc shared");
 433 }
 434
 435 [[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
 436   memory::freeShared(Ptr, Bytes, "Frontend free shared");
 437 }
 438
 439 void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }
 440
 441 void *llvm_omp_target_dynamic_shared_alloc() {
 442   return __kmpc_get_dynamic_shared();
 443 }
 444
 445 void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
 446
 447 /// Allocate storage in shared memory to communicate arguments from the main
 448 /// thread to the workers in generic mode. If we exceed
 449 /// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
 450 constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
 451
 452 [[clang::loader_uninitialized]] static void
 453     *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
 454 #pragma omp allocate(SharedMemVariableSharingSpace)                            \
 455     allocator(omp_pteam_mem_alloc)
 456 [[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
 457 #pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
 458     allocator(omp_pteam_mem_alloc)
 459
 460 void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
 461   if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
 462     SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
 463   } else {
 464     SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
 465         nArgs * sizeof(void *), "new extended args");
 466     ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
 467            "Nullptr returned by malloc!");
 468   }
 469   *GlobalArgs = SharedMemVariableSharingSpacePtr;
 470 }
 471
 472 void __kmpc_end_sharing_variables() {
 473   if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
 474     memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
 475 }
 476
 477 void __kmpc_get_shared_variables(void ***GlobalArgs) {
 478   *GlobalArgs = SharedMemVariableSharingSpacePtr;
 479 }
 480 }
 481 #pragma omp end declare target