//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//     __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//    The reason we don't exec end_parallel for the threads not included
//    in the parallel loop is that for each barrier in the parallel
//    region, these non-included threads will cycle through the
//    syncthread A. Thus they must preserve their current threadId, which
//    is larger than the number of threads in the team.
//
//    To make a long story short...
//
//===----------------------------------------------------------------------===//
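
//
// Illustration only (a hedged sketch, not part of this file): the worker side
// of the pattern above is the compiler-/runtime-emitted state machine. Using
// the entry points defined below, a worker loop could look roughly like:
//
//   ParallelRegionFnTy WorkFn = nullptr;
//   do {
//     synchronize::threads(atomic::seq_cst);   // syncthreads A
//     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
//     if (!WorkFn)                              // termination signal
//       return;
//     if (IsActive) {
//       ((void (*)(uint32_t, void *))WorkFn)(0, nullptr);
//       __kmpc_kernel_end_parallel();
//     }
//     synchronize::threads(atomic::seq_cst);   // signal the master back
//   } while (true);
//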
#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

#pragma omp begin declare target device_type(nohost)

namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getMaxTeamThreads();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

  // SPMD mode allows any number of threads, for generic mode we round down to
  // a multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (mapping::isSPMDMode())
    return NumThreads;

  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}
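
// Worked example for the generic-mode rounding above, assuming a warp size of
// 32: a num_threads(47) request becomes 47 & ~31 == 32, while num_threads(20)
// (less than one warp) becomes 1.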

// Invoke an outlined parallel function unwrapping arguments (up to 32).
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
                                              int32_t bound_tid, void *fn,
                                              void **args, int64_t nargs) {
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

} // namespace

extern "C" {

[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
                                                   int32_t num_threads,
                                                   void *fn, void **args,
                                                   const int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();
  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t PTeamSize =
      NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
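  // For illustration (values assumed): with a 1024-thread team limit, an
  // absent num_threads clause or num_threads(1024) yields PTeamSize == 0, the
  // "whole team" encoding, while num_threads(256) yields PTeamSize == 256.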
  // Avoid the race between the read of the `icv::Level` above and the write
  // below by synchronizing all threads here.
  synchronize::threadsAligned(atomic::seq_cst);
  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, TId == 0, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                               /*ForceTeamState=*/true);

    // Synchronize all threads after the main thread (TId == 0) set up the
    // team state properly.
    synchronize::threadsAligned(atomic::acq_rel);

    state::ParallelTeamSize.assert_eq(PTeamSize, ident,
                                      /*ForceTeamState=*/true);
    icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
    icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);

    // Ensure we synchronize before we run user code to avoid invalidating the
    // assumptions above.
    synchronize::threadsAligned(atomic::relaxed);

    if (!PTeamSize || TId < PTeamSize)
      invokeMicrotask(TId, 0, fn, args, nargs);

    // Synchronize all threads at the end of a parallel region.
    synchronize::threadsAligned(atomic::seq_cst);
  }

  // Synchronize all threads to make sure every thread exits the scope above;
  // otherwise the following assertions and the assumption in
  // __kmpc_target_deinit may not hold.
  synchronize::threadsAligned(atomic::acq_rel);

  state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
  icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
  icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);

  // Ensure we synchronize to create an aligned region around the assumptions.
  synchronize::threadsAligned(atomic::relaxed);
}

[[clang::always_inline]] void
__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                   int32_t num_threads, int proc_bind, void *fn,
                   void *wrapper_fn, void **args, int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();

  // Assert the parallelism level is zero if disabled by the user.
  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
         "nested parallelism while disabled");

  // Handle the serialized case first, same for SPMD/non-SPMD:
  // 1) if-clause(0)
  // 2) parallel in task or other thread state inducing construct
  // 3) nested parallel regions
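  // For example, case 1) corresponds to user code such as
  //   #pragma omp parallel if(0)
  // where OpenMP requires the region to execute with a team of one thread.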
  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
                   (config::mayUseNestedParallelism() && icv::Level))) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  // From this point forward we know that there is no thread state used.
  ASSERT(state::HasThreadState == false, nullptr);

  if (mapping::isSPMDMode()) {
    // This was moved to its own routine so it could be called directly
    // in certain situations to avoid resource consumption of unused
    // logic in parallel_51.
    __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);
    return;
  }

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially an
  // active-level set, but they do not have individual ThreadStates yet. If
  // they ever modify the ICVs beyond this point a ThreadState will be
  // allocated.
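  // Illustrative example (not from this file): a thread calling
  // omp_set_num_threads(4) inside the region would modify its nthreads-var
  // ICV and thereby trigger the allocation of a ThreadState for it.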

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
    }
  }

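  // Design note: the unrolled switch above trades code size for speed on the
  // common small argument counts; each case falls through a fixed sequence of
  // copies instead of running a data-dependent loop, while the default: path
  // keeps larger argument counts correct.
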
  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                               /*ForceTeamState=*/true);

    // Master signals work to activate workers.
    synchronize::threads(atomic::seq_cst);
    // Master waits for workers to signal.
    synchronize::threads(atomic::seq_cst);
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}

[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
  return ThreadIsActive;
}
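
// Usage note (hedged): state::getEffectivePTeamSize() is assumed to resolve
// the "0 means whole team" encoding of ParallelTeamSize, so a worker whose
// TId is beyond the requested team size returns false here, skips the work
// function, and only cycles through the state machine barriers.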

[[clang::noinline]] void __kmpc_kernel_end_parallel() {
  // In case we have modified an ICV for this thread before a ThreadState was
  // created, we drop it now to not contaminate the next parallel region.
  ASSERT(!mapping::isSPMDMode(), nullptr);
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode(), nullptr);
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }

int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }

void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}

} // extern "C"

#pragma omp end declare target