//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation in the GPU. Here is the pattern:
//
//    while (not finished) {
//
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//      __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//    }
//
// The reason we don't exec end_parallel for the threads not included
// in the parallel loop is that for each barrier in the parallel
// region, these non-included threads will cycle through the
// syncthread A. Thus they must preserve their current threadId, which
// is larger than the number of threads in the team.
//
// To make a long story short...
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace ompx;

#pragma omp begin declare target device_type(nohost)

namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getMaxTeamThreads();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

  // SPMD mode allows any number of threads; for generic mode we round down to
  // a multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (mapping::isSPMDMode())
    return NumThreads;

  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}
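
// For illustration (hypothetical values, assuming a warp size of 32): in
// generic mode a `num_threads(50)` clause is rounded down to 50 & ~31 == 32
// threads, while `num_threads(20)` yields a single thread since 20 is below
// the warp size. In SPMD mode both values are used as given.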

// Invoke an outlined parallel function unwrapping arguments (up to 32).
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
                                              int32_t bound_tid, void *fn,
                                              void **args, int64_t nargs) {
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}
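
// A sketch (illustrative, not the verbatim generated file) of the shape of a
// case included from generated_microtask_cases.gen, assuming the generator
// emits one cast-and-call per argument count:
//
//   case 2:
//     ((void (*)(int32_t *, int32_t *, void *, void *))fn)(
//         &global_tid, &bound_tid, args[0], args[1]);
//     break;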

} // namespace

extern "C" {

[[clang::always_inline]] void
__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                   int32_t num_threads, int proc_bind, void *fn,
                   void *wrapper_fn, void **args, int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();

  // Assert the parallelism level is zero if disabled by the user.
  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
         "nested parallelism while disabled");

  // Handle the serialized case first, same for SPMD/non-SPMD:
  // 1) if-clause(0)
  // 2) parallel in task or other thread state inducing construct
  // 3) nested parallel regions
  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
                   (config::mayUseNestedParallelism() && icv::Level))) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }
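
  // For illustration (hypothetical user code): both of the following take the
  // serialized path above, assuming nested parallelism stays disabled:
  //
  //   #pragma omp parallel if(0)   // 1) if-clause evaluates to false
  //   { ... }
  //
  //   #pragma omp parallel         // outer region runs in parallel ...
  //   #pragma omp parallel         // 3) ... this nested one is serialized
  //   { ... }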

  // From this point forward we know that there is no thread state used.
  ASSERT(state::HasThreadState == false, nullptr);

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
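
  // A PTeamSize of 0 encodes "the full team"; the `!PTeamSize || TId <
  // PTeamSize` guard below then lets every thread of the team execute the
  // region.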
  if (mapping::isSPMDMode()) {
    // Avoid the race between the read of the `icv::Level` above and the write
    // below by synchronizing all threads here.
    synchronize::threadsAligned(atomic::seq_cst);
    {
      // Note that the order here is important. `icv::Level` has to be updated
      // last or the other updates will cause a thread specific state to be
      // created.
      state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                            1u, TId == 0, ident,
                                            /* ForceTeamState */ true);
      state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0,
                                       ident, /* ForceTeamState */ true);
      state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                                 /* ForceTeamState */ true);

      // Synchronize all threads after the main thread (TId == 0) set up the
      // team state properly.
      synchronize::threadsAligned(atomic::acq_rel);

      state::ParallelTeamSize.assert_eq(PTeamSize, ident,
                                        /* ForceTeamState */ true);
      icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true);
      icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);

      // Ensure we synchronize before we run user code to avoid invalidating
      // the assumptions above.
      synchronize::threadsAligned(atomic::relaxed);

      if (!PTeamSize || TId < PTeamSize)
        invokeMicrotask(TId, 0, fn, args, nargs);

      // Synchronize all threads at the end of a parallel region.
      synchronize::threadsAligned(atomic::seq_cst);
    }

    // Synchronize all threads to make sure every thread exits the scope above;
    // otherwise the following assertions and the assumption in
    // __kmpc_target_deinit may not hold.
    synchronize::threadsAligned(atomic::acq_rel);

    state::ParallelTeamSize.assert_eq(1u, ident, /* ForceTeamState */ true);
    icv::ActiveLevel.assert_eq(0u, ident, /* ForceTeamState */ true);
    icv::Level.assert_eq(0u, ident, /* ForceTeamState */ true);

    // Ensure we synchronize to create an aligned region around the
    // assumptions.
    synchronize::threadsAligned(atomic::relaxed);

    return;
  }

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially an
  // active-level set, but they do not have individual ThreadStates yet. If
  // they ever modify the ICVs beyond this point, a ThreadState will be
  // allocated.

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
      [[fallthrough]];
    case 0:
      break;
    }
  }
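
  // Design note: the descending fallthrough chain above copies exactly the
  // first `nargs` pointers without a loop for the common small counts; only
  // more than 16 arguments fall back to the generic loop in the default case.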

  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, true, ident,
                                          /* ForceTeamState */ true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident,
                                          /* ForceTeamState */ true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /* ForceTeamState */ true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                               /* ForceTeamState */ true);

    // Master signals work to activate workers.
    synchronize::threads(atomic::seq_cst);
    // Master waits for workers to signal.
    synchronize::threads(atomic::seq_cst);
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}
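
// For illustration (hypothetical, heavily simplified compiler output): a user
// construct such as
//
//   #pragma omp parallel num_threads(128)
//   { body(); }
//
// is lowered by Clang to a call of roughly this shape:
//
//   void *args[1] = {&captured};
//   __kmpc_parallel_51(ident, gtid, /* if_expr */ 1, /* num_threads */ 128,
//                      proc_bind_default, outlined_fn, wrapper_fn, args,
//                      /* nargs */ 1);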

[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
  return ThreadIsActive;
}
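
// A sketch (under stated assumptions, not the actual runtime source) of the
// generic-mode worker state machine that drives the two entry points above
// and below:
//
//   for (;;) {
//     ParallelRegionFnTy WorkFn = nullptr;
//     synchronize::threads(atomic::seq_cst);  // wait for the master's signal
//     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
//     if (!WorkFn)                            // nullptr: kernel is done
//       return;
//     if (IsActive) {                         // thread is part of the team
//       ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId);
//       __kmpc_kernel_end_parallel();
//     }
//     synchronize::threads(atomic::seq_cst);  // signal the master back
//   }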

[[clang::noinline]] void __kmpc_kernel_end_parallel() {
  // In case an ICV was modified for this thread, a ThreadState was created;
  // drop it now so it does not contaminate the next parallel region.
  ASSERT(!mapping::isSPMDMode(), nullptr);
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode(), nullptr);
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }

int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }

void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
}

#pragma omp end declare target