Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / openmp / libomptarget / DeviceRTL / src / Mapping.cpp
blob822b8dc2dd5e67129ea57ae696ac052c117f6d54
1 //===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //
10 //===----------------------------------------------------------------------===//
12 #include "Mapping.h"
13 #include "Interface.h"
14 #include "State.h"
15 #include "Types.h"
16 #include "Utils.h"
18 #pragma omp begin declare target device_type(nohost)
20 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
22 using namespace ompx;
24 namespace ompx {
25 namespace impl {
27 // Forward declarations defined to be defined for AMDGCN and NVPTX.
28 const llvm::omp::GV &getGridValue();
29 LaneMaskTy activemask();
30 LaneMaskTy lanemaskLT();
31 LaneMaskTy lanemaskGT();
32 uint32_t getThreadIdInWarp();
33 uint32_t getThreadIdInBlock(int32_t Dim);
34 uint32_t getNumberOfThreadsInBlock(int32_t Dim);
35 uint32_t getNumberOfThreadsInKernel();
36 uint32_t getBlockIdInKernel(int32_t Dim);
37 uint32_t getNumberOfBlocksInKernel(int32_t Dim);
38 uint32_t getWarpIdInBlock();
39 uint32_t getNumberOfWarpsInBlock();
41 /// AMDGCN Implementation
42 ///
43 ///{
44 #pragma omp begin declare variant match(device = {arch(amdgcn)})
46 const llvm::omp::GV &getGridValue() {
47 return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
50 uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
51 switch (Dim) {
52 case 0:
53 return __builtin_amdgcn_workgroup_size_x();
54 case 1:
55 return __builtin_amdgcn_workgroup_size_y();
56 case 2:
57 return __builtin_amdgcn_workgroup_size_z();
59 UNREACHABLE("Dim outside range!");
62 LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
64 LaneMaskTy lanemaskLT() {
65 uint32_t Lane = mapping::getThreadIdInWarp();
66 int64_t Ballot = mapping::activemask();
67 uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
68 return Mask & Ballot;
71 LaneMaskTy lanemaskGT() {
72 uint32_t Lane = mapping::getThreadIdInWarp();
73 if (Lane == (mapping::getWarpSize() - 1))
74 return 0;
75 int64_t Ballot = mapping::activemask();
76 uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
77 return Mask & Ballot;
80 uint32_t getThreadIdInWarp() {
81 return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
84 uint32_t getThreadIdInBlock(int32_t Dim) {
85 switch (Dim) {
86 case 0:
87 return __builtin_amdgcn_workitem_id_x();
88 case 1:
89 return __builtin_amdgcn_workitem_id_y();
90 case 2:
91 return __builtin_amdgcn_workitem_id_z();
93 UNREACHABLE("Dim outside range!");
96 uint32_t getNumberOfThreadsInKernel() {
97 return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
98 __builtin_amdgcn_grid_size_z();
101 uint32_t getBlockIdInKernel(int32_t Dim) {
102 switch (Dim) {
103 case 0:
104 return __builtin_amdgcn_workgroup_id_x();
105 case 1:
106 return __builtin_amdgcn_workgroup_id_y();
107 case 2:
108 return __builtin_amdgcn_workgroup_id_z();
110 UNREACHABLE("Dim outside range!");
113 uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
114 switch (Dim) {
115 case 0:
116 return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
117 case 1:
118 return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
119 case 2:
120 return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
122 UNREACHABLE("Dim outside range!");
125 uint32_t getWarpIdInBlock() {
126 return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
129 uint32_t getNumberOfWarpsInBlock() {
130 return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
133 #pragma omp end declare variant
134 ///}
136 /// NVPTX Implementation
138 ///{
139 #pragma omp begin declare variant match( \
140 device = {arch(nvptx, nvptx64)}, \
141 implementation = {extension(match_any)})
143 uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
144 switch (Dim) {
145 case 0:
146 return __nvvm_read_ptx_sreg_ntid_x();
147 case 1:
148 return __nvvm_read_ptx_sreg_ntid_y();
149 case 2:
150 return __nvvm_read_ptx_sreg_ntid_z();
152 UNREACHABLE("Dim outside range!");
155 const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
157 LaneMaskTy activemask() {
158 unsigned int Mask;
159 asm("activemask.b32 %0;" : "=r"(Mask));
160 return Mask;
163 LaneMaskTy lanemaskLT() {
164 __kmpc_impl_lanemask_t Res;
165 asm("mov.u32 %0, %%lanemask_lt;" : "=r"(Res));
166 return Res;
169 LaneMaskTy lanemaskGT() {
170 __kmpc_impl_lanemask_t Res;
171 asm("mov.u32 %0, %%lanemask_gt;" : "=r"(Res));
172 return Res;
175 uint32_t getThreadIdInBlock(int32_t Dim) {
176 switch (Dim) {
177 case 0:
178 return __nvvm_read_ptx_sreg_tid_x();
179 case 1:
180 return __nvvm_read_ptx_sreg_tid_y();
181 case 2:
182 return __nvvm_read_ptx_sreg_tid_z();
184 UNREACHABLE("Dim outside range!");
187 uint32_t getThreadIdInWarp() {
188 return impl::getThreadIdInBlock(mapping::DIM_X) &
189 (mapping::getWarpSize() - 1);
192 uint32_t getBlockIdInKernel(int32_t Dim) {
193 switch (Dim) {
194 case 0:
195 return __nvvm_read_ptx_sreg_ctaid_x();
196 case 1:
197 return __nvvm_read_ptx_sreg_ctaid_y();
198 case 2:
199 return __nvvm_read_ptx_sreg_ctaid_z();
201 UNREACHABLE("Dim outside range!");
204 uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
205 switch (Dim) {
206 case 0:
207 return __nvvm_read_ptx_sreg_nctaid_x();
208 case 1:
209 return __nvvm_read_ptx_sreg_nctaid_y();
210 case 2:
211 return __nvvm_read_ptx_sreg_nctaid_z();
213 UNREACHABLE("Dim outside range!");
216 uint32_t getNumberOfThreadsInKernel() {
217 return impl::getNumberOfThreadsInBlock(0) *
218 impl::getNumberOfBlocksInKernel(0) *
219 impl::getNumberOfThreadsInBlock(1) *
220 impl::getNumberOfBlocksInKernel(1) *
221 impl::getNumberOfThreadsInBlock(2) *
222 impl::getNumberOfBlocksInKernel(2);
225 uint32_t getWarpIdInBlock() {
226 return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
229 uint32_t getNumberOfWarpsInBlock() {
230 return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
231 mapping::getWarpSize();
234 #pragma omp end declare variant
235 ///}
237 uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
239 } // namespace impl
240 } // namespace ompx
242 /// We have to be deliberate about the distinction of `mapping::` and `impl::`
243 /// below to avoid repeating assumptions or including irrelevant ones.
244 ///{
246 static bool isInLastWarp() {
247 uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
248 ~(mapping::getWarpSize() - 1);
249 return mapping::getThreadIdInBlock() == MainTId;
252 bool mapping::isMainThreadInGenericMode(bool IsSPMD) {
253 if (IsSPMD || icv::Level)
254 return false;
256 // Check if this is the last warp in the block.
257 return isInLastWarp();
260 bool mapping::isMainThreadInGenericMode() {
261 return mapping::isMainThreadInGenericMode(mapping::isSPMDMode());
264 bool mapping::isInitialThreadInLevel0(bool IsSPMD) {
265 if (IsSPMD)
266 return mapping::getThreadIdInBlock() == 0;
267 return isInLastWarp();
270 bool mapping::isLeaderInWarp() {
271 __kmpc_impl_lanemask_t Active = mapping::activemask();
272 __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
273 return utils::popc(Active & LaneMaskLT) == 0;
276 LaneMaskTy mapping::activemask() { return impl::activemask(); }
278 LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
280 LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
282 uint32_t mapping::getThreadIdInWarp() {
283 uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
284 ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
285 return ThreadIdInWarp;
288 uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
289 uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
290 return ThreadIdInBlock;
293 uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
295 uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
296 uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
297 // If we are in SPMD mode, remove one warp.
298 return BlockSize - (!IsSPMD * impl::getWarpSize());
300 uint32_t mapping::getMaxTeamThreads() {
301 return mapping::getMaxTeamThreads(mapping::isSPMDMode());
304 uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
305 return impl::getNumberOfThreadsInBlock(Dim);
308 uint32_t mapping::getNumberOfThreadsInKernel() {
309 return impl::getNumberOfThreadsInKernel();
312 uint32_t mapping::getWarpIdInBlock() {
313 uint32_t WarpID = impl::getWarpIdInBlock();
314 ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
315 return WarpID;
318 uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
319 uint32_t BlockId = impl::getBlockIdInKernel(Dim);
320 ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
321 return BlockId;
324 uint32_t mapping::getNumberOfWarpsInBlock() {
325 uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
326 ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
327 return NumberOfWarpsInBlocks;
330 uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
331 uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
332 ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
333 return NumberOfBlocks;
336 uint32_t mapping::getNumberOfProcessorElements() {
337 return static_cast<uint32_t>(config::getHardwareParallelism());
340 ///}
342 /// Execution mode
344 ///{
346 // TODO: This is a workaround for initialization coming from kernels outside of
347 // the TU. We will need to solve this more correctly in the future.
348 [[gnu::weak]] int SHARED(IsSPMDMode);
350 void mapping::init(bool IsSPMD) {
351 if (mapping::isInitialThreadInLevel0(IsSPMD))
352 IsSPMDMode = IsSPMD;
355 bool mapping::isSPMDMode() { return IsSPMDMode; }
357 bool mapping::isGenericMode() { return !isSPMDMode(); }
358 ///}
360 extern "C" {
361 [[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() {
362 return mapping::getThreadIdInBlock();
365 [[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
366 return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
369 [[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
370 return impl::getWarpSize();
374 #define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME) \
375 extern "C" int ompx_##NAME(int Dim) { return mapping::MAPPER_NAME(Dim); }
377 _TGT_KERNEL_LANGUAGE(thread_id, getThreadIdInBlock)
378 _TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
379 _TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
380 _TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
382 #pragma omp end declare target