//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//      if (master) {
//        sequential code, decide which par loop to do, or if finished
//        __kmpc_kernel_prepare_parallel() // exec by master only
//      }
//      syncthreads // A
//      __kmpc_kernel_parallel() // exec by all
//      if (this thread is included in the parallel) {
//        switch () for all parallel loops
//        __kmpc_kernel_end_parallel() // exec only by threads in parallel
//      }
//      syncthreads // B
//    }
//
//    The reason we don't exec end_parallel for the threads not included
//    in the parallel loop is that for each barrier in the parallel
//    region, these non-included threads will cycle through the
//    syncthread A. Thus they must preserve their current threadId, which
//    is larger than the number of threads in the team.
//
//    To make a long story short...
//
//===----------------------------------------------------------------------===//
36 #include "Interface.h"
39 #include "Synchronization.h"
45 #pragma omp begin declare target device_type(nohost)
namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getMaxTeamThreads();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

  // SPMD mode allows any number of threads; for generic mode we round down to
  // a multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (mapping::isSPMDMode())
    return NumThreads;

  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}
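
// For illustration (numbers assumed, not from this file): with a warp size of
// 32 in generic mode, a requested team size of 100 rounds down to 96, and a
// requested size of 20, being smaller than one warp, collapses to 1.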

// Invoke an outlined parallel function unwrapping arguments (up to 32).
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
                                              int32_t bound_tid, void *fn,
                                              void **args, int64_t nargs) {
  switch (nargs) {
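  // Each generated case casts `fn` to the matching microtask signature and
  // forwards the unpacked arguments. A two-argument case looks roughly like
  // this (a sketch; the real bodies live in generated_microtask_cases.gen):
  //   case 2:
  //     ((void (*)(int32_t *, int32_t *, void *, void *))fn)(
  //         &global_tid, &bound_tid, args[0], args[1]);
  //     break;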
#include "generated_microtask_cases.gen"
  default:
    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

} // namespace

extern "C" {
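
// A `parallel` construct inside a target region is lowered by the compiler to
// a call of roughly this shape (a sketch; the exact arguments come from
// clang's OpenMP codegen, and `outlined_fn`/`outlined_wrapper_fn` are
// hypothetical names for the outlined bodies):
//   __kmpc_parallel_51(ident, gtid, /*if_expr=*/1, /*num_threads=*/-1,
//                      proc_bind_default, outlined_fn, outlined_wrapper_fn,
//                      args, nargs);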
[[clang::always_inline]] void
__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                   int32_t num_threads, int proc_bind, void *fn,
                   void *wrapper_fn, void **args, int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();

  // Assert the parallelism level is zero if disabled by the user.
  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
         "nested parallelism while disabled");

  // Handle the serialized case first, same for SPMD/non-SPMD:
  // 1) if-clause(0)
  // 2) parallel in task or other thread state inducing construct
  // 3) nested parallel regions
  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
                   (config::mayUseNestedParallelism() && icv::Level))) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  // From this point forward we know that there is no thread state used.
  ASSERT(state::HasThreadState == false, nullptr);

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
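  // A PTeamSize of 0 encodes "every thread in the team", so the common
  // full-team case can skip the `TId < PTeamSize` bound check below.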
  if (mapping::isSPMDMode()) {
    // Avoid the race between the read of the `icv::Level` above and the write
    // below by synchronizing all threads here.
    synchronize::threadsAligned(atomic::seq_cst);
    {
      // Note that the order here is important. `icv::Level` has to be updated
      // last or the other updates will cause a thread specific state to be
      // created.
      state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                            1u, TId == 0, ident,
                                            /* ForceTeamState */ true);
      state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0,
                                       ident, /* ForceTeamState */ true);
      state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                                 /* ForceTeamState */ true);

      // Synchronize all threads after the main thread (TId == 0) set up the
      // team state properly.
      synchronize::threadsAligned(atomic::acq_rel);

      state::ParallelTeamSize.assert_eq(PTeamSize, ident,
                                        /* ForceTeamState */ true);
      icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true);
      icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);

      // Ensure we synchronize before we run user code to avoid invalidating
      // the assumptions above.
      synchronize::threadsAligned(atomic::relaxed);

      if (!PTeamSize || TId < PTeamSize)
        invokeMicrotask(TId, 0, fn, args, nargs);

      // Synchronize all threads at the end of a parallel region.
      synchronize::threadsAligned(atomic::seq_cst);
    }

    // Synchronize all threads to make sure every thread exits the scope above;
    // otherwise the following assertions and the assumption in
    // __kmpc_target_deinit may not hold.
    synchronize::threadsAligned(atomic::acq_rel);

    state::ParallelTeamSize.assert_eq(1u, ident, /* ForceTeamState */ true);
    icv::ActiveLevel.assert_eq(0u, ident, /* ForceTeamState */ true);
    icv::Level.assert_eq(0u, ident, /* ForceTeamState */ true);

    // Ensure we synchronize to create an aligned region around the
    // assumptions.
    synchronize::threadsAligned(atomic::relaxed);

    return;
  }

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially an
  // active-level set, but they do not have individual ThreadStates yet. If
  // they ever modify the ICVs beyond this point, a ThreadState will be
  // allocated.

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
      break;
    }
  }
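
  // Design note: the descending cases fall through, so for up to 16 arguments
  // the copy is fully unrolled and the generic loop in `default:` is only
  // needed for larger counts. The GlobalArgs buffer obtained from
  // __kmpc_begin_sharing_variables is visible to the whole team, which is how
  // the worker threads receive the arguments.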

  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, true, ident,
                                          /* ForceTeamState */ true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident,
                                          /* ForceTeamState */ true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /* ForceTeamState */ true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                               /* ForceTeamState */ true);

    // Master signals work to activate workers.
    synchronize::threads(atomic::seq_cst);
    // Master waits for workers to signal.
    synchronize::threads(atomic::seq_cst);
  }
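
  // The two barriers above pair with the two barriers in the generic-mode
  // worker loop; see the usage sketch after __kmpc_kernel_parallel below.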

  if (nargs)
    __kmpc_end_sharing_variables();
}

[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
  return ThreadIsActive;
}
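
// Usage sketch (assumed shape, based on the signal/wait protocol above; the
// actual generic-mode worker loop lives in the kernel initialization path):
//   uint32_t TId = mapping::getThreadIdInBlock();
//   do {
//     ParallelRegionFnTy WorkFn = nullptr;
//     synchronize::threads(atomic::seq_cst);  // wait for the master's signal
//     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
//     if (!WorkFn)
//       return;                               // termination signal
//     if (IsActive) {
//       ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId);
//       __kmpc_kernel_end_parallel();
//     }
//     synchronize::threads(atomic::seq_cst);  // signal completion back
//   } while (true);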

[[clang::noinline]] void __kmpc_kernel_end_parallel() {
  // In case we modified an ICV for this thread, a ThreadState was created.
  // Drop it now so it does not contaminate the next parallel region.
  ASSERT(!mapping::isSPMDMode(), nullptr);
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode(), nullptr);
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }

int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }

void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
}

#pragma omp end declare target