[NFC][MLIR][Linalg] Refactor linalg.matmul tablegen ODS and related C++ code. (#116377)
[llvm-project.git] / offload / DeviceRTL / src / State.cpp
blob855c74fa58e0a5d860b4ddc6ba2e0aa55b0b7d61
1 //===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //===----------------------------------------------------------------------===//
11 #include "Shared/Environment.h"
13 #include "Allocator.h"
14 #include "Configuration.h"
15 #include "Debug.h"
16 #include "DeviceTypes.h"
17 #include "DeviceUtils.h"
18 #include "Interface.h"
19 #include "LibC.h"
20 #include "Mapping.h"
21 #include "State.h"
22 #include "Synchronization.h"
24 using namespace ompx;
26 #pragma omp begin declare target device_type(nohost)
28 /// Memory implementation
29 ///
30 ///{
32 /// External symbol to access dynamic shared memory.
// Declared `extern` with an unknown bound: neither the storage nor its size is
// defined in this file. Only the alignment (allocator::ALIGNMENT) and the
// OpenMP allocator (omp_pteam_mem_alloc) are fixed here. The buffer is handed
// out unchanged by memory::getDynamicBuffer().
33 [[gnu::aligned(
34 allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
35 #pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
37 /// The kernel environment passed to the init method by the compiler.
// Set by state::init() and returned (dereferenced) by
// state::getKernelEnvironment().
38 static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
40 /// The kernel launch environment passed as argument to the kernel by the
41 /// runtime.
// Set by state::init() and returned (dereferenced) by
// state::getKernelLaunchEnvironment().
42 static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);
44 ///}
46 namespace {
48 /// Fallback implementations are missing to trigger a link time error.
49 /// Implementations for new devices, including the host, should go into a
50 /// dedicated begin/end declare variant.
51 ///
52 ///{
// C-linkage malloc/free as used by memory::allocGlobal and
// memory::freeGlobal in this file.
53 extern "C" {
54 #ifdef __AMDGPU__
// AMDGPU: weak definitions that forward to allocator::alloc/allocator::free.
56 [[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
57 [[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
59 #else
// All other targets: weak declarations only; no definition is provided in
// this file.
61 [[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
62 [[gnu::weak, gnu::leaf]] void free(void *Ptr);
64 #endif
66 ///}
68 /// A "smart" stack in shared memory.
69 ///
70 /// The stack exposes a malloc/free interface but works like a stack internally.
71 /// In fact, it is a separate stack *per warp*. That means, each warp must push
72 /// and pop symmetrically or this breaks, badly. The implementation will (aim
73 /// to) detect non-lock-step warps and fallback to malloc/free. The same will
74 /// happen if a warp runs out of memory. The master warp in generic memory is
75 /// special and is given more memory than the rest.
76 ///
77 struct SharedMemorySmartStackTy {
78 /// Initialize the stack. Must be called by all threads.
79 void init(bool IsSPMD);
81 /// Allocate \p Bytes on the stack for the encountering thread. Each thread
82 /// can call this function.
83 void *push(uint64_t Bytes);
85 /// Deallocate the last allocation made by the encountering thread and pointed
86 /// to by \p Ptr from the stack. Each thread can call this function.
87 void pop(void *Ptr, uint64_t Bytes);
89 private:
90 /// Compute the size of the storage space reserved for a thread.
91 uint32_t computeThreadStorageTotal() {
92 uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
93 return utils::alignDown((state::SharedScratchpadSize / NumLanesInBlock),
94 allocator::ALIGNMENT);
97 /// Return the top address of the warp data stack, that is the first address
98 /// this warp will allocate memory at next.
99 void *getThreadDataTop(uint32_t TId) {
100 return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
103 /// The actual storage, shared among all warps.
104 [[gnu::aligned(
105 allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
106 [[gnu::aligned(
107 allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
// Compile-time guard tying the scratchpad size to the maximum team size.
// NOTE(review): the per-thread usage counters in SharedMemorySmartStackTy are
// `unsigned char`, which cannot represent 256 -- confirm whether the bound
// should be `< 256`. The check also uses MaxThreadsPerTeam, while the slice
// size at run time is derived from the actual number of threads in the block.
110 static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
111 "Shared scratchpad of this size not supported yet.");
113 /// The allocation of a single shared memory scratchpad.
114 static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
116 void SharedMemorySmartStackTy::init(bool IsSPMD) {
117 Usage[mapping::getThreadIdInBlock()] = 0;
120 void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
121 // First align the number of requested bytes.
122 /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
123 /// be passed in as an argument and the stack rewritten to support it.
124 uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);
126 uint32_t StorageTotal = computeThreadStorageTotal();
128 // The main thread in generic mode gets the space of its entire warp as the
129 // other threads do not participate in any computation at all.
130 if (mapping::isMainThreadInGenericMode())
131 StorageTotal *= mapping::getWarpSize();
133 int TId = mapping::getThreadIdInBlock();
134 if (Usage[TId] + AlignedBytes <= StorageTotal) {
135 void *Ptr = getThreadDataTop(TId);
136 Usage[TId] += AlignedBytes;
137 return Ptr;
140 if (config::isDebugMode(DeviceDebugKind::CommonIssues))
141 PRINT("Shared memory stack full, fallback to dynamic allocation of global "
142 "memory will negatively impact performance.\n");
143 void *GlobalMemory = memory::allocGlobal(
144 AlignedBytes, "Slow path shared memory allocation, insufficient "
145 "shared memory stack memory!");
146 ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");
148 return GlobalMemory;
151 void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
152 uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);
153 if (utils::isSharedMemPtr(Ptr)) {
154 int TId = mapping::getThreadIdInBlock();
155 Usage[TId] -= AlignedBytes;
156 return;
158 memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
161 } // namespace
163 void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
165 void *memory::allocShared(uint64_t Bytes, const char *Reason) {
166 return SharedMemorySmartStack.push(Bytes);
169 void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
170 SharedMemorySmartStack.pop(Ptr, Bytes);
173 void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
174 void *Ptr = malloc(Bytes);
175 if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
176 PRINT("nullptr returned by malloc!\n");
177 return Ptr;
180 void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
182 ///}
184 bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
185 return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
186 (ActiveLevelVar == Other.ActiveLevelVar) &
187 (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
188 (RunSchedVar == Other.RunSchedVar) &
189 (RunSchedChunkVar == Other.RunSchedChunkVar);
// Assert, one field at a time, that this ICV state matches \p Other. The same
// six fields are compared as in operator==; Padding0Val is not checked.
192 void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
193 ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
194 ASSERT(LevelVar == Other.LevelVar, nullptr);
195 ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
196 ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
197 ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
198 ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
201 void state::TeamStateTy::init(bool IsSPMD) {
202 ICVState.NThreadsVar = 0;
203 ICVState.LevelVar = 0;
204 ICVState.ActiveLevelVar = 0;
205 ICVState.Padding0Val = 0;
206 ICVState.MaxActiveLevelsVar = 1;
207 ICVState.RunSchedVar = omp_sched_static;
208 ICVState.RunSchedChunkVar = 1;
209 ParallelTeamSize = 1;
210 HasThreadState = false;
211 ParallelRegionFnVar = nullptr;
214 bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
215 return (ICVState == Other.ICVState) &
216 (HasThreadState == Other.HasThreadState) &
217 (ParallelTeamSize == Other.ParallelTeamSize);
// Assert that this team state matches \p Other: first the ICVs (via
// ICVStateTy::assertEqual), then the parallel team size and the thread-state
// flag. ParallelRegionFnVar is not checked.
220 void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
221 ICVState.assertEqual(Other.ICVState);
222 ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
223 ASSERT(HasThreadState == Other.HasThreadState, nullptr);
// The team-wide state; (re)initialized in state::init().
226 state::TeamStateTy SHARED(ompx::state::TeamState);
// Array of per-thread state pointers; reset to nullptr in state::init() and
// allocated on first use in state::enterDataEnvironment().
227 state::ThreadStateTy **SHARED(ompx::state::ThreadStates);
229 namespace {
231 int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
232 int OutOfBoundsVal = -1) {
233 if (Level == 0)
234 return DefaultVal;
235 int LevelVar = omp_get_level();
236 if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
237 return OutOfBoundsVal;
238 int ActiveLevel = icv::ActiveLevel;
239 if (OMP_UNLIKELY(Level != ActiveLevel))
240 return DefaultVal;
241 return Val;
244 } // namespace
246 void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
247 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
248 SharedMemorySmartStack.init(IsSPMD);
249 if (mapping::isInitialThreadInLevel0(IsSPMD)) {
250 TeamState.init(IsSPMD);
251 ThreadStates = nullptr;
252 KernelEnvironmentPtr = &KernelEnvironment;
253 KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
257 KernelEnvironmentTy &state::getKernelEnvironment() {
258 return *KernelEnvironmentPtr;
261 KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
262 return *KernelLaunchEnvironmentPtr;
265 void state::enterDataEnvironment(IdentTy *Ident) {
266 ASSERT(config::mayUseThreadStates(),
267 "Thread state modified while explicitly disabled!");
268 if (!config::mayUseThreadStates())
269 return;
271 unsigned TId = mapping::getThreadIdInBlock();
272 ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
273 memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
274 uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
275 if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
276 uint32_t Bytes =
277 sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
278 void *ThreadStatesPtr =
279 memory::allocGlobal(Bytes, "Thread state array allocation");
280 memset(ThreadStatesPtr, 0, Bytes);
281 if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
282 reinterpret_cast<uintptr_t>(ThreadStatesPtr),
283 atomic::seq_cst, atomic::seq_cst))
284 memory::freeGlobal(ThreadStatesPtr,
285 "Thread state array allocated multiple times");
286 ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
287 "Expected valid thread states bit!");
289 NewThreadState->init(ThreadStates[TId]);
290 TeamState.HasThreadState = true;
291 ThreadStates[TId] = NewThreadState;
294 void state::exitDataEnvironment() {
295 ASSERT(config::mayUseThreadStates(),
296 "Thread state modified while explicitly disabled!");
298 unsigned TId = mapping::getThreadIdInBlock();
299 resetStateForThread(TId);
302 void state::resetStateForThread(uint32_t TId) {
303 if (!config::mayUseThreadStates())
304 return;
305 if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
306 return;
308 ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
309 memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
310 ThreadStates[TId] = PreviousThreadState;
313 void state::runAndCheckState(void(Func(void))) {
314 TeamStateTy OldTeamState = TeamState;
315 OldTeamState.assertEqual(TeamState);
317 Func();
319 OldTeamState.assertEqual(TeamState);
322 void state::assumeInitialState(bool IsSPMD) {
323 TeamStateTy InitialTeamState;
324 InitialTeamState.init(IsSPMD);
325 InitialTeamState.assertEqual(TeamState);
326 ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
329 int state::getEffectivePTeamSize() {
330 int PTeamSize = state::ParallelTeamSize;
331 return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
334 extern "C" {
/// The requested value is ignored.
void omp_set_dynamic(int V) {
  // Intentionally empty.
}

/// Always reports 0.
int omp_get_dynamic(void) {
  constexpr int Disabled = 0;
  return Disabled;
}
339 void omp_set_num_threads(int V) { icv::NThreads = V; }
341 int omp_get_max_threads(void) {
342 int NT = icv::NThreads;
343 return NT > 0 ? NT : mapping::getMaxTeamThreads();
346 int omp_get_level(void) {
347 int LevelVar = icv::Level;
348 ASSERT(LevelVar >= 0, nullptr);
349 return LevelVar;
352 int omp_get_active_level(void) { return !!icv::ActiveLevel; }
354 int omp_in_parallel(void) { return !!icv::ActiveLevel; }
356 void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
357 *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
358 *ChunkSize = state::RunSchedChunk;
361 void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
362 icv::RunSched = (int)ScheduleKind;
363 state::RunSchedChunk = ChunkSize;
366 int omp_get_ancestor_thread_num(int Level) {
367 return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
370 int omp_get_thread_num(void) {
371 return omp_get_ancestor_thread_num(omp_get_level());
374 int omp_get_team_size(int Level) {
375 return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
378 int omp_get_num_threads(void) {
379 return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
382 int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
384 int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
/// The requested value is ignored.
void omp_set_nested(int) {
  // Intentionally empty.
}

/// Always reports 0 (false).
int omp_get_nested(void) {
  constexpr bool Nested = false;
  return Nested;
}
390 void omp_set_max_active_levels(int Levels) {
391 icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
394 int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }
396 omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }
/// Always reports 0 places.
int omp_get_num_places(void) {
  constexpr int NumPlaces = 0;
  return NumPlaces;
}
400 int omp_get_place_num_procs(int) { return omp_get_num_procs(); }
/// Not implemented: the output array is left untouched.
void omp_get_place_proc_ids(int, int *) {
  // TODO
}

/// Always reports place number 0.
int omp_get_place_num(void) {
  constexpr int PlaceNum = 0;
  return PlaceNum;
}

/// Always reports 0 places in the partition.
int omp_get_partition_num_places(void) {
  constexpr int NumPlaces = 0;
  return NumPlaces;
}

/// Not implemented: the output array is left untouched.
void omp_get_partition_place_nums(int *) {
  // TODO
}
/// Always reports 0.
int omp_get_cancellation(void) {
  constexpr int Cancellation = 0;
  return Cancellation;
}
/// The requested device number is ignored.
void omp_set_default_device(int) {
  // Intentionally empty.
}

/// Always reports -1.
int omp_get_default_device(void) {
  constexpr int DefaultDevice = -1;
  return DefaultDevice;
}
420 int omp_get_num_devices(void) { return config::getNumDevices(); }
422 int omp_get_device_num(void) { return config::getDeviceNum(); }
424 int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }
426 int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
/// Always reports -1.
int omp_get_initial_device(void) {
  constexpr int InitialDevice = -1;
  return InitialDevice;
}

/// Always reports 0.
int omp_is_initial_device(void) {
  constexpr int IsInitial = 0;
  return IsInitial;
}
433 extern "C" {
434 [[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
435 return memory::allocShared(Bytes, "Frontend alloc shared");
438 [[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
439 memory::freeShared(Ptr, Bytes, "Frontend free shared");
442 void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }
444 void *llvm_omp_target_dynamic_shared_alloc() {
445 return __kmpc_get_dynamic_shared();
448 void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
450 /// Allocate storage in shared memory to communicate arguments from the main
451 /// thread to the workers in generic mode. If we exceed
452 /// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
453 constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
// Fixed-size slot array used while the argument count stays within
// NUM_SHARED_VARIABLES_IN_SHARED_MEM.
455 [[clang::loader_uninitialized]] static void
456 *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
457 #pragma omp allocate(SharedMemVariableSharingSpace) \
458 allocator(omp_pteam_mem_alloc)
// Currently active argument storage: either the array above or a global
// allocation made in __kmpc_begin_sharing_variables and released in
// __kmpc_end_sharing_variables.
459 [[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
460 #pragma omp allocate(SharedMemVariableSharingSpacePtr) \
461 allocator(omp_pteam_mem_alloc)
463 void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
464 if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
465 SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
466 } else {
467 SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
468 nArgs * sizeof(void *), "new extended args");
469 ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
470 "Nullptr returned by malloc!");
472 *GlobalArgs = SharedMemVariableSharingSpacePtr;
475 void __kmpc_end_sharing_variables() {
476 if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
477 memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
480 void __kmpc_get_shared_variables(void ***GlobalArgs) {
481 *GlobalArgs = SharedMemVariableSharingSpacePtr;
484 #pragma omp end declare target