//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//      if (master) {
//        sequential code, decide which parallel loop to do, or if finished
//        __kmpc_kernel_prepare_parallel() // exec by master only
//      }
//      syncthreads // A
//      __kmpc_kernel_parallel() // exec by all
//      if (this thread is included in the parallel) {
//        switch () for all parallel loops
//        __kmpc_kernel_end_parallel() // exec only by threads in parallel
//      }
//    }
//
//    The reason we don't execute end_parallel for the threads not included
//    in the parallel loop is that, for each barrier in the parallel
//    region, these non-included threads will cycle through the
//    syncthreads A. Thus they must preserve their current threadId, which
//    is larger than the number of threads in the team.
//
//    To make a long story short...
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

#pragma omp declare target

namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getBlockSize();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

  // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
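  // E.g., with a 32-lane warp, a request for 47 threads yields 32 below, and a
  // request smaller than the warp size falls back to a single thread.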
  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}

// Invoke an outlined parallel function unwrapping arguments (up to 32).
void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
                     void **args, int64_t nargs) {
  DebugEntryRAII Entry(__FILE__, __LINE__, "<OpenMP Outlined Function>");
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

} // namespace

extern "C" {

void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                        int32_t num_threads, int proc_bind, void *fn,
                        void *wrapper_fn, void **args, int64_t nargs) {
  FunctionTracingRAII();

  uint32_t TId = mapping::getThreadIdInBlock();
  // Handle the serialized case first, same for SPMD/non-SPMD.
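  // A parallel region with an if-clause evaluating to false, or one nested
  // inside an already active level, runs serialized on the encountering
  // thread with an implicit team of one.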
  if (OMP_UNLIKELY(!if_expr || icv::Level)) {
    state::enterDataEnvironment();
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    state::exitDataEnvironment();
    return;
  }

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  if (mapping::isSPMDMode()) {
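    // In SPMD mode every thread of the block already executes this call site,
    // so the team state is set up cooperatively here instead of going through
    // the generic-mode worker state machine further below.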
    // Avoid the race between the read of the `icv::Level` above and the write
    // below by synchronizing all threads here.
    synchronize::threadsAligned();
    {
      // Note that the order here is important. `icv::Level` has to be updated
      // last or the other updates will cause a thread specific state to be
      // created.
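      // Each ValueRAII swaps in the new value (here only from thread 0) and
      // restores the previous one when it goes out of scope.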
      state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                            1u, TId == 0);
      state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0);
      state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0);

      // Synchronize all threads after the main thread (TId == 0) set up the
      // team state properly.
      synchronize::threadsAligned();

      ASSERT(state::ParallelTeamSize == NumThreads);
      ASSERT(icv::ActiveLevel == 1u);
      ASSERT(icv::Level == 1u);

      if (TId < NumThreads)
        invokeMicrotask(TId, 0, fn, args, nargs);

      // Synchronize all threads at the end of a parallel region.
      synchronize::threadsAligned();
    }

    // Synchronize all threads to make sure every thread exits the scope above;
    // otherwise the following assertions and the assumption in
    // __kmpc_target_deinit may not hold.
    synchronize::threadsAligned();

    ASSERT(state::ParallelTeamSize == 1u);
    ASSERT(icv::ActiveLevel == 0u);
    ASSERT(icv::Level == 0u);
    return;
  }

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially active-level
  // set, but they do not have individual ThreadStates yet. If they ever modify
  // the ICVs beyond this point, a ThreadState will be allocated.
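  // For example, a call to omp_set_num_threads() inside the region would
  // trigger that lazy ThreadState allocation.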

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }
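
  // An active region in generic mode: share the outlined function's arguments
  // with the worker threads through storage visible to the whole team before
  // waking them up.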
  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    for (int I = 0; I < nargs; I++)
      GlobalArgs[I] = args[I];
  }
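
  // The scope below publishes the team size and the wrapper function for the
  // workers; the first barrier releases them into __kmpc_kernel_parallel, the
  // second waits until they are done.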
  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                          1u, true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true);

    // Master signals work to activate workers.
    synchronize::threads();
    // Master waits for workers to signal.
    synchronize::threads();
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}
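
// Called by the workers from the generic-mode state machine. Returns the
// current work function through *WorkFn and reports whether the calling
// thread takes part in the parallel region.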
__attribute__((noinline)) bool
__kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  FunctionTracingRAII();
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::ParallelTeamSize;
  return ThreadIsActive;
}
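
// Executed in generic (non-SPMD) mode only, by the threads that took part in
// the parallel region, once they are done with the work function.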
__attribute__((noinline)) void __kmpc_kernel_end_parallel() {
  FunctionTracingRAII();
  // In case we modified an ICV for this thread before a ThreadState was
  // created, drop it now so it does not contaminate the next parallel region.
  ASSERT(!mapping::isSPMDMode());
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode());
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) {
  FunctionTracingRAII();
  return omp_get_level();
}

int32_t __kmpc_global_thread_num(IdentTy *) {
  FunctionTracingRAII();
  return omp_get_thread_num();
}
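
// The num_teams, thread_limit, and proc_bind clauses are handled before the
// kernel is launched or have no effect on the device, so the entry points
// below are intentionally empty.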
void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {
  FunctionTracingRAII();
}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {
  FunctionTracingRAII();
}

} // extern "C"

#pragma omp end declare target