//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "Shared/Environment.h"

#include "Allocator.h"
#include "Configuration.h"
#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "LibC.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

#pragma omp begin declare target device_type(nohost)
/// Memory implementation
///
///{

/// External symbol to access dynamic shared memory.
[[gnu::aligned(
    allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)

/// The kernel environment passed to the init method by the compiler.
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);

/// The kernel launch environment passed as argument to the kernel by the
/// runtime.
static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);
/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{
extern "C" {
#ifdef __AMDGPU__

[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }

#else

[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
[[gnu::weak, gnu::leaf]] void free(void *Ptr);

#endif
}
///}
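
// Illustrative sketch (an assumption, not code from this file): a
// device-specific implementation would typically live in a dedicated declare
// variant block so it is only considered for matching devices, roughly:
//
//   #pragma omp begin declare variant match(device = {arch(amdgcn)})
//   [[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
//   [[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
//   #pragma omp end declare variant
//
// The arch selector above is an example; any device without an implementation
// triggers a link-time error instead of silently misbehaving.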
/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
/// In fact, it is a separate stack *per warp*. That means, each warp must push
/// and pop symmetrically or this breaks, badly. The implementation will (aim
/// to) detect non-lock-step warps and fall back to malloc/free. The same will
/// happen if a warp runs out of memory. The master warp in generic mode is
/// special and is given more memory than the rest.
///
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and
  /// pointed to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint64_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
    return utils::alignDown((state::SharedScratchpadSize / NumLanesInBlock),
                            allocator::ALIGNMENT);
  }

  /// Return the top address of the warp data stack, that is the first address
  /// this warp will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all warps.
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
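
/// Illustrative sketch (not part of the runtime): allocations drawn from the
/// smart stack must be released in LIFO order with matching sizes, e.g. via
/// the memory::allocShared/freeShared wrappers defined below. The byte counts
/// and reason strings are example values only.
/// \code
///   void *A = memory::allocShared(16, "example A");
///   void *B = memory::allocShared(32, "example B");
///   memory::freeShared(B, 32, "example B"); // Pop in reverse order.
///   memory::freeShared(A, 16, "example A");
/// \endcode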
void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
  /// be passed in as an argument and the stack rewritten to support it.
  uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  if (config::isDebugMode(DeviceDebugKind::CommonIssues))
    PRINT("Shared memory stack full, fallback to dynamic allocation of global "
          "memory will negatively impact performance.\n");
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");

  return GlobalMemory;
}
void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
  uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);
  if (utils::isSharedMemPtr(Ptr)) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
    PRINT("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}
bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
  ASSERT(LevelVar == Other.LevelVar, nullptr);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
  ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
}
void state::TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = 0;
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.Padding0Val = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  HasThreadState = false;
  ParallelRegionFnVar = nullptr;
}

bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (HasThreadState == Other.HasThreadState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
  ASSERT(HasThreadState == Other.HasThreadState, nullptr);
}

state::TeamStateTy SHARED(ompx::state::TeamState);
state::ThreadStateTy **SHARED(ompx::state::ThreadStates);
namespace {

int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace
void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
                 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    ThreadStates = nullptr;
    KernelEnvironmentPtr = &KernelEnvironment;
    KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
  }
}
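
// Illustrative sketch (an assumption, not code from this file): the kernel
// initialization path in Kernel.cpp is expected to call state::init once per
// kernel launch, before any user code runs, roughly as in
//
//   state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
//
// so that the team state, thread states, and the shared smart stack are in a
// well-defined state for the OpenMP API routines defined below.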
KernelEnvironmentTy &state::getKernelEnvironment() {
  return *KernelEnvironmentPtr;
}

KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
  return *KernelLaunchEnvironmentPtr;
}
void state::enterDataEnvironment(IdentTy *Ident) {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");
  if (!config::mayUseThreadStates())
    return;

  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
      memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
  uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
  if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
    uint32_t Bytes =
        sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
    void *ThreadStatesPtr =
        memory::allocGlobal(Bytes, "Thread state array allocation");
    memset(ThreadStatesPtr, 0, Bytes);
    if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
                     reinterpret_cast<uintptr_t>(ThreadStatesPtr),
                     atomic::seq_cst, atomic::seq_cst))
      memory::freeGlobal(ThreadStatesPtr,
                         "Thread state array allocated multiple times");
    ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
           "Expected valid thread states bit!");
  }
  NewThreadState->init(ThreadStates[TId]);
  TeamState.HasThreadState = true;
  ThreadStates[TId] = NewThreadState;
}
void state::exitDataEnvironment() {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}
void state::resetStateForThread(uint32_t TId) {
  if (!config::mayUseThreadStates())
    return;
  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
  ThreadStates[TId] = PreviousThreadState;
}
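
// Illustrative sketch (not part of the runtime): a thread-private data
// environment is pushed and popped symmetrically around ICV modifications,
// conceptually:
//
//   state::enterDataEnvironment(Ident);  // Push a fresh ThreadStateTy.
//   icv::NThreads = 128;                 // Only this thread sees the change.
//   state::exitDataEnvironment();        // Pop back to the previous state.
//
// The Ident argument and the ICV value above are example placeholders.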
void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}
void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
}
int state::getEffectivePTeamSize() {
  int PTeamSize = state::ParallelTeamSize;
  return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
}
extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) {
  int NT = icv::NThreads;
  return NT > 0 ? NT : mapping::getMaxTeamThreads();
}
int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0, nullptr);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }
void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}
int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
}

int omp_get_num_threads(void) {
  return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
}
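
// Illustrative sketch (not part of the runtime): expected results of the
// query routines above inside a device target region, assuming the parallel
// region actually runs with more than one thread.
//
//   #pragma omp target
//   {
//     // Sequential part: omp_get_level() == 0, omp_get_num_threads() == 1.
//     #pragma omp parallel
//     {
//       // Level 1 is the only level that can be active on the device:
//       //   omp_get_level() == 1
//       //   omp_get_thread_num() in [0, omp_get_num_threads())
//       //   omp_get_team_size(1) == omp_get_num_threads()
//     }
//   }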
int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }
omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_device_num(void) { return config::getDeviceNum(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }

int omp_get_team_num() { return mapping::getBlockIdInKernel(); }

int omp_get_initial_device(void) { return -1; }

int omp_is_initial_device(void) { return 0; }
}
extern "C" {
[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}
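
// Illustrative sketch (an assumption about the typical lowering, not code from
// this file): when a local variable escapes into a parallel region, the
// compiler replaces its private stack slot with a symmetric pair of runtime
// calls, conceptually:
//
//   int *Var = (int *)__kmpc_alloc_shared(sizeof(int));
//   // ... parallel region reads and writes *Var ...
//   __kmpc_free_shared(Var, sizeof(int));
//
// The symmetric alloc/free emission is exactly what the shared memory smart
// stack above relies on.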
void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_target_dynamic_shared_alloc() {
  return __kmpc_get_dynamic_shared();
}

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
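
// Illustrative sketch (not part of the runtime): user code can obtain the
// dynamic shared memory buffer inside a target region, provided the host side
// reserved it (e.g. via the LIBOMPTARGET_SHARED_MEMORY_SIZE environment
// variable, an assumption about the launch configuration):
//
//   #pragma omp target
//   {
//     int *Scratch = (int *)llvm_omp_target_dynamic_shared_alloc();
//     // ... Scratch is shared by all threads of the team ...
//   }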
/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace) \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr) \
    allocator(omp_pteam_mem_alloc)
void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
void __kmpc_end_sharing_variables() {
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
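
// Illustrative sketch (an assumption about the typical use, not code from this
// file): in generic mode the main thread publishes the outlined region's
// arguments before waking the workers, and the workers read them back:
//
//   // Main thread, before the workers start the outlined function:
//   void **Args;
//   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2);
//   Args[0] = &A;
//   Args[1] = &B;
//
//   // Each worker, inside the outlined function:
//   void **WorkerArgs;
//   __kmpc_get_shared_variables(&WorkerArgs);
//
//   // Main thread, after the region completed:
//   __kmpc_end_sharing_variables();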

#pragma omp end declare target