//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;
#pragma omp begin declare target device_type(nohost)

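// Reduce the values in reduce_data across one fully active warp. The
// compiler-generated shflFct combines each lane's value with the value held
// by the lane `mask` positions away; halving the offset every iteration
// folds all lanes of the warp into lane 0.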
void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ mask, /*AlgoVersion=*/0);
  }
}
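
// Reduce across a partially active warp in which only the first `size`
// contiguous lanes hold values (e.g. the trailing warp of an SPMD region).
// `tid` is the calling thread's position among those lanes; rounding with
// (curr_size + 1) / 2 keeps the pairing valid when the live count is odd.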
void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
                               uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}
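
// Pre-Volta fallback used when the active lanes of a warp are not contiguous
// (threads of an L2 parallel region). Each round pairs a live lane with the
// next live lane above it and halves the logical lane id; the lane that ends
// with logical id 0 owns the result and returns 1.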
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                          ShuffleReductFnTy shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
  do {
    Liveness = mapping::activemask();
    remote_id = utils::ffs(Liveness & lanemask_gt);
    size = utils::popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif

static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                            ShuffleReductFnTy shflFct,
                                            InterWarpCopyFnTy cpyFct) {
  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
  if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false))
    BlockThreadId = 0;
  uint32_t NumThreads = omp_get_num_threads();
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */
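  //
  // Note: shflFct and cpyFct are provided by compiler-generated code for the
  // concrete reduction clause. As an illustration only, for a single `int`
  // sum the shuffle callback conceptually does
  //   tmp = <shuffle *(int *)reduce_data down by Offset lanes>;
  //   *(int *)reduce_data += tmp;
  // and the inter-warp copy callback stages each warp master's partial value
  // in shared memory so warp 0 can finish the reduction.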
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded =
      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
  uint32_t WarpId = mapping::getWarpIdInBlock();

  // Volta execution model:
  // For the Generic execution mode a parallel region either has 1 thread and
  // beyond that, always a multiple of 32. For the SPMD execution mode we may
  // have any number of threads.
  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
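
  // For example, with NumThreads == 96 all warps are full (96 % 32 == 0) and
  // take the regular path; with NumThreads == 80 in SPMD mode, warps 0 and 1
  // reduce regularly while warp 2 reduces its remaining 16 lanes through the
  // irregular path.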

  // When we have more than [mapping::getWarpSize()] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  if (Liveness == lanes::All) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/utils::popc(Liveness),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
  else { // Dispersed lanes. Only threads in L2
         // parallel region may enter here; return
         // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);
  }

  // When we have more than [mapping::getWarpSize()] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case of
  // an L2 parallel region.
  return BlockThreadId == 0;
#endif // __CUDA_ARCH__ >= 700
}
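
// Round s down to a multiple of the warp size, but never below 1 so callers
// can keep using the result as a loop stride or thread count.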
uint32_t roundToWarpsize(uint32_t s) {
  if (s < mapping::getWarpSize())
    return 1;
  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}

uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
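
// External KMPC entry points, called from compiler-generated code when a
// reduction clause is lowered for the device.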
extern "C" {
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
                                               uint64_t reduce_data_size,
                                               void *reduce_data,
                                               ShuffleReductFnTy shflFct,
                                               InterWarpCopyFnTy cpyFct) {
  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
}

int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

  uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block progress for teams greater than the current upper
  // limit. We always only allow a number of teams less or equal
  // to the number of slots in the buffer.
  bool IsMaster = (ThreadId == 0);
  while (IsMaster) {
    Bound = atomic::load(&IterCnt, atomic::aquire);
    if (TeamId < Bound + num_of_records)
      break;
  }
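
  // At this point TeamId < Bound + num_of_records, i.e. at most
  // num_of_records teams are in flight, so this team can safely use slot
  // TeamId % num_of_records of the global buffer.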

  if (IsMaster) {
    int ModBockId = TeamId % num_of_records;
    if (TeamId < num_of_records) {
      lgcpyFct(GlobalBuffer, ModBockId, reduce_data);
    } else {
      lgredFct(GlobalBuffer, ModBockId, reduce_data);
    }

    // Propagate the memory writes above to the world.
    fence::kernel(atomic::release);

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // num_of_records chunk.
    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
                                 atomic::MemScopeTy::device);
  }
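
  // atomic::inc returns the previous counter value, so ChunkTeamCount tells
  // the master how many teams of the current chunk had already checked in.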

  // Synchronize in SPMD mode as in generic mode all but 1 threads are in the
  // state machine.
  if (mapping::isSPMDMode())
    synchronize::threadsAligned(atomic::acq_rel);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it in local memory:
  // local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  //     num_of_records
  //
  // local_data_reduce layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //      1. do reduction within each warp.
  //      2. do reduction across warps.
  //      3. write the final result to the main reduction variable
  //         by returning 1 in the thread holding the reduction result.
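  //
  // For instance, with num_of_records == 1024, the buffer holds 1024 slots
  // for `a`, then 1024 for `b`, then 1024 for `c`; team T writes into slot
  // T % 1024 of each array.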

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    // Ensure we see the global memory writes by other teams
    fence::kernel(atomic::aquire);

    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(GlobalBuffer, i, reduce_data);
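
    // Each participating thread now holds the partial reduction of slots
    // ThreadId, ThreadId + NumThreads, ... of the global buffer.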

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When we have more than [mapping::getWarpSize()] number of threads
      // a block reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > mapping::getWarpSize()) {
        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
                               mapping::getWarpSize();
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / mapping::getWarpSize();
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow SIZE number of teams to proceed writing their
    // intermediate results to the global buffer.
    atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst);
  }

  return 0;
}

void *__kmpc_reduction_get_fixed_buffer() {
  return state::getKernelLaunchEnvironment().ReductionBuffer;
}
}

#pragma omp end declare target