openmp/libomptarget/DeviceRTL/src/State.cpp

//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "State.h"
#include "Allocator.h"
#include "Configuration.h"
#include "Debug.h"
#include "Environment.h"
#include "Interface.h"
#include "LibC.h"
#include "Mapping.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace ompx;

#pragma omp begin declare target device_type(nohost)

/// Memory implementation
///
///{

/// External symbol to access dynamic shared memory.
[[gnu::aligned(
    allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)

/// The kernel environment passed to the init method by the compiler.
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);

/// The kernel launch environment passed as argument to the kernel by the
/// runtime.
static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);

///}

namespace {

/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{
extern "C" {
#ifdef __AMDGPU__

[[gnu::weak]] void *malloc(uint64_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }

#else

[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size);
[[gnu::weak, gnu::leaf]] void free(void *Ptr);

#endif
}
///}

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
/// In fact, it is a separate stack *per thread*. That means, each thread must
/// push and pop symmetrically or this breaks, badly. The implementation will
/// (aim to) detect non-lock-step warps and fallback to malloc/free. The same
/// will happen if a thread runs out of memory. The main thread in generic mode
/// is special and is given more memory than the rest.
///
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and pointed
  /// to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint32_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
    return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
                             allocator::ALIGNMENT);
  }

  /// Return the top address of the thread's data stack, that is the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all warps.
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
  /// be passed in as an argument and the stack rewritten to support it.
  uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  if (config::isDebugMode(DeviceDebugKind::CommonIssues))
    PRINT("Shared memory stack full, fallback to dynamic allocation of global "
          "memory will negatively impact performance.\n");
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");

  return GlobalMemory;
}
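
/// Illustrative arithmetic for the per-thread budget above (the numbers are
/// assumptions for the example, not configured defaults): with a 2048-byte
/// scratchpad, 128 threads per block, and 16-byte alignment, each thread gets
/// align_down(2048 / 128, 16) == 16 bytes, while the generic-mode main thread
/// gets 16 * warp-size bytes instead. A 4-byte push still consumes 16 bytes of
/// the budget because requests are rounded up to the alignment.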

void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
  uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
  if (utils::isSharedMemPtr(Ptr)) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
    PRINT("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
  ASSERT(LevelVar == Other.LevelVar, nullptr);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
  ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
}

void state::TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = 0;
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.Padding0Val = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  HasThreadState = false;
  ParallelRegionFnVar = nullptr;
}

bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (HasThreadState == Other.HasThreadState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
  ASSERT(HasThreadState == Other.HasThreadState, nullptr);
}

state::TeamStateTy SHARED(ompx::state::TeamState);
state::ThreadStateTy **SHARED(ompx::state::ThreadStates);

namespace {

int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace
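
/// Illustrative behavior of returnValIfLevelIsActive as used by the level
/// query API further below (a sketch, assuming a single active parallel level
/// L == omp_get_level()):
/// \code
///   omp_get_ancestor_thread_num(0);      // 0, the initial thread
///   omp_get_ancestor_thread_num(L);      // thread id in the block at level L
///   omp_get_ancestor_thread_num(L + 1);  // -1, out of bounds
///   omp_get_team_size(0);                // 1
///   omp_get_team_size(L);                // effective parallel team size
/// \endcode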

void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
                 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    ThreadStates = nullptr;
    KernelEnvironmentPtr = &KernelEnvironment;
    KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
  }
}

KernelEnvironmentTy &state::getKernelEnvironment() {
  return *KernelEnvironmentPtr;
}

KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
  return *KernelLaunchEnvironmentPtr;
}

void state::enterDataEnvironment(IdentTy *Ident) {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");
  if (!config::mayUseThreadStates())
    return;

  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
      memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
  uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
  if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
    uint32_t Bytes =
        sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
    void *ThreadStatesPtr =
        memory::allocGlobal(Bytes, "Thread state array allocation");
    memset(ThreadStatesPtr, 0, Bytes);
    if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
                     reinterpret_cast<uintptr_t>(ThreadStatesPtr),
                     atomic::seq_cst, atomic::seq_cst))
      memory::freeGlobal(ThreadStatesPtr,
                         "Thread state array allocated multiple times");
    ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
           "Expected valid thread states bit!");
  }
  NewThreadState->init(ThreadStates[TId]);
  TeamState.HasThreadState = true;
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (!config::mayUseThreadStates())
    return;
  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
  ThreadStates[TId] = PreviousThreadState;
}
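
/// Illustrative pairing of enterDataEnvironment/exitDataEnvironment above (a
/// sketch; these calls are emitted by the runtime and compiler rather than
/// user code, and the IdentTy argument is passed as nullptr only for brevity):
/// \code
///   state::enterDataEnvironment(/*Ident=*/nullptr); // push a ThreadStateTy
///                                                   // linked to the previous one
///   // ... thread-local state changes ...
///   state::exitDataEnvironment();                   // free it and restore the
///                                                   // previous thread state
/// \endcode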

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
}

int state::getEffectivePTeamSize() {
  int PTeamSize = state::ParallelTeamSize;
  return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
}

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) {
  int NT = icv::NThreads;
  return NT > 0 ? NT : mapping::getMaxTeamThreads();
}

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0, nullptr);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
}

int omp_get_num_threads(void) {
  return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
}

int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_device_num(void) { return config::getDeviceNum(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }

int omp_get_team_num() { return mapping::getBlockIdInKernel(); }

int omp_get_initial_device(void) { return -1; }
}

extern "C" {
[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_target_dynamic_shared_alloc() {
  return __kmpc_get_dynamic_shared();
}

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
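
/// Illustrative use of the dynamic shared buffer from user code (a sketch; it
/// assumes dynamic shared memory was reserved for the launch, e.g. via the
/// LIBOMPTARGET_SHARED_MEMORY_SIZE environment variable):
/// \code
///   #pragma omp target
///   {
///     int *Scratch = (int *)llvm_omp_target_dynamic_shared_alloc();
///     // All threads of the team share this buffer for the kernel's lifetime.
///   }
/// \endcode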

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace) \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr) \
    allocator(omp_pteam_mem_alloc)
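
/// Illustrative sketch of the sharing protocol implemented by the three entry
/// points below; the calls and the shared locals X and Y are hypothetical, as
/// the compiler normally generates this code for generic-mode parallel
/// regions:
/// \code
///   // Main thread, before waking the workers:
///   void **Args;
///   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2);
///   Args[0] = &X;
///   Args[1] = &Y;
///   // Each worker thread:
///   void **GlobalArgs;
///   __kmpc_get_shared_variables(&GlobalArgs);
///   // ... read through GlobalArgs[0] and GlobalArgs[1] ...
///   // Main thread, once the workers are done:
///   __kmpc_end_sharing_variables();
/// \endcode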

void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}

#pragma omp end declare target