//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;

namespace {

// Field offsets in hsa_kernel_dispatch_packet_t.
enum DispatchPackedOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};

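// workgroup_size_{x,y,z} are u16 fields of the dispatch packet and
// grid_size_{x,y,z} are u32 fields, hence the 2- and 4-byte load widths
// matched below.
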
// Field offsets to implicit kernel argument pointer.
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};

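// In the v5 implicit argument layout the block counts are u32 values while
// the group sizes and remainders are u16 values, matching the 4- and 2-byte
// loads checked below.
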
class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override {
    return "AMDGPU Kernel Attributes";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  StringRef Name = Intrinsic::getName(IntrinsicId);
  return M.getFunction(Name);
}

} // end anonymous namespace

static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getParent()->getParent();

  auto MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
      F->getFnAttribute("uniform-work-group-size").getValueAsBool();

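  // "uniform-work-group-size" is set by the frontend; OpenCL 1.2 requires the
  // global size to be a multiple of the workgroup size, so for OpenCL kernels
  // this attribute is commonly true.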
  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
    return false;

  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
  Value *Remainders[3] = {nullptr, nullptr, nullptr};
  Value *GridSizes[3] = {nullptr, nullptr, nullptr};

  const DataLayout &DL = F->getDataLayout();

  // We expect to see several GEP users, cast to the appropriate type and
  // loaded.
  for (User *U : CI->users()) {
    if (!U->hasOneUse())
      continue;

    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;

      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;

      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
    }

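    // With opaque pointers the load typically uses the GEP (or the base
    // pointer) directly; the bitcast chasing above mainly matters for older
    // typed-pointer IR.
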
    if (!Load || !Load->isSimple())
      continue;

    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    // TODO: Handle merged loads.
    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4)
          BlockCounts[0] = Load;
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4)
          BlockCounts[1] = Load;
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4)
          BlockCounts[2] = Load;
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      default:
        break;
      }
    } else { // Base is DispatchPtr.
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case GRID_SIZE_X:
        if (LoadSize == 4)
          GridSizes[0] = Load;
        break;
      case GRID_SIZE_Y:
        if (LoadSize == 4)
          GridSizes[1] = Load;
        break;
      case GRID_SIZE_Z:
        if (LoadSize == 4)
          GridSizes[2] = Load;
        break;
      default:
        break;
      }
    }
  }

  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    // Under v5, __ockl_get_local_size returns the value computed by the
    // expression:
    //
    //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
    //
    // For functions with the attribute uniform-work-group-size=true, we can
    // evaluate workgroup_id < hidden_block_count as true, and thus
    // hidden_group_size is returned for __ockl_get_local_size.
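    //
    // As an illustrative sketch (value names invented here; the exact IR from
    // the device library may differ):
    //
    //   %cmp = icmp ult i32 %workgroup.id.x, %block.count.x
    //   %sel = select i1 %cmp, i16 %group.size.x, i16 %remainder.x
    //
    // Replacing %cmp with true lets later simplification fold %sel to the
    // group size load, and the remainder loads are zeroed out below.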
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *ICmp : BlockCount->users()) {
        ICmpInst::Predicate Pred;
        if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
          if (Pred != ICmpInst::ICMP_ULT)
            continue;

          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
          MadeChange = true;
        }
      }
    }

    // All remainders should be 0 with uniform work group size.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;

      Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Pre-V5.
    // Pattern match the code used to handle partial workgroup dispatches in
    // the library implementation of get_local_size, so the entire function can
    // be constant folded with a known group size.
    //
    //   uint r = grid_size - group_id * group_size;
    //   get_local_size = (r < group_size) ? r : group_size;
    //
    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
    // the grid_size is required to be a multiple of group_size. In this case:
    //
    //   grid_size - (group_id * group_size) < group_size
    //   grid_size < group_size + (group_id * group_size)
    //   (grid_size / group_size) < 1 + group_id
    //
    // grid_size / group_size is at least 1, so we can conclude the select
    // condition is false (except for group_id == 0, where the select result is
    // the same).
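    //
    // As an illustrative sketch in IR (names invented here; the matcher below
    // accepts both the umin intrinsic and the equivalent icmp+select form):
    //
    //   %zext = zext i16 %group.size.x to i32
    //   %mul  = mul i32 %group.id.x, %zext
    //   %sub  = sub i32 %grid.size.x, %mul
    //   %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %zext)
    //
    // %umin is then replaced with %zext, or directly with the constant from
    // reqd_work_group_size when that metadata is present.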
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin,
                    m_UMin(m_Sub(m_Specific(GridSize),
                                 m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
                           m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize =
                  mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
                  KnownSize, UMin->getType(), false, DL));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }

  // If reqd_work_group_size is set, we can replace the work group size with it.
  if (!HasReqdWorkGroupSize)
    return MadeChange;

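  // reqd_work_group_size is function metadata carrying three i32 operands,
  // e.g. (illustrative values): !reqd_work_group_size !{i32 64, i32 1, i32 1}.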
  for (int I = 0; I < 3; I++) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
    MadeChange = true;
  }

  return MadeChange;
}

// TODO: Move makeLIDRangeMetadata usage into here. Seems to not get
// TargetPassConfig for subtarget.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                      "AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                    "AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}

PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return PreservedAnalyses::all();

  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        processUse(CI, IsV5OrAbove);
    }
  }

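  // Conservatively report everything preserved: only scalar values were
  // replaced with constants and the CFG is untouched (the legacy pass likewise
  // declares setPreservesAll()).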
  return PreservedAnalyses::all();
}