1 //===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 /// \file This pass attempts to make use of reqd_work_group_size metadata
11 /// to eliminate loads from the dispatch packet and to constant fold OpenCL
12 /// get_local_size-like functions.
14 //===----------------------------------------------------------------------===//
17 #include "AMDGPUTargetMachine.h"
18 #include "llvm/Analysis/ValueTracking.h"
19 #include "llvm/CodeGen/Passes.h"
20 #include "llvm/CodeGen/TargetPassConfig.h"
21 #include "llvm/IR/Constants.h"
22 #include "llvm/IR/Function.h"
23 #include "llvm/IR/Instructions.h"
24 #include "llvm/IR/PatternMatch.h"
25 #include "llvm/Pass.h"
27 #define DEBUG_TYPE "amdgpu-lower-kernel-attributes"
33 // Field offsets in hsa_kernel_dispatch_packet_t.
// NOTE(review): the enumerator list is not visible in this excerpt. The
// WORKGROUP_SIZE_X/Y/Z case labels used in processUse presumably come from
// this enum -- confirm. "Packed" in the name looks like a typo for "Packet".
34 enum DispatchPackedOffsets
{
// Module pass that, per the file header, uses reqd_work_group_size metadata
// to eliminate loads from the dispatch packet and to fold get_local_size-like
// code.
// NOTE(review): the access specifier, the declaration of ID and the closing
// braces of this class are not visible in this excerpt.
44 class AMDGPULowerKernelAttributes
: public ModulePass
{
// Module pointer read by processUse and runOnModule; starts out null.
45 Module
*Mod
= nullptr;
// Constructs the pass, forwarding ID to the ModulePass base.
50 AMDGPULowerKernelAttributes() : ModulePass(ID
) {}
// Handles a single call instruction; runOnModule builds these from the users
// of the amdgcn_dispatch_ptr intrinsic.
52 bool processUse(CallInst
*CI
);
54 bool doInitialization(Module
&M
) override
;
55 bool runOnModule(Module
&M
) override
;
// Human-readable pass name.
57 StringRef
getPassName() const override
{
58 return "AMDGPU Kernel Attributes";
// NOTE(review): the body of getAnalysisUsage is not visible in this excerpt.
61 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
66 } // end anonymous namespace
// Per-module initialization hook.
// NOTE(review): the body is not visible in this excerpt. Since processUse and
// runOnModule dereference Mod, this presumably stores &M in Mod -- confirm.
68 bool AMDGPULowerKernelAttributes::doInitialization(Module
&M
) {
// Processes one call instruction CI (runOnModule passes calls to the
// amdgcn_dispatch_ptr intrinsic). It looks for loads of the work-group size
// fields reached through CI and, depending on the enclosing function's
// reqd_work_group_size metadata and "uniform-work-group-size" attribute,
// rewrites the select pattern built on them and/or replaces the loads with
// constants.
// NOTE(review): several statements of this function (early returns/continues,
// the switch header, the grid-size cases and the final return) are not
// visible in this excerpt; comments below describe only what is shown.
73 bool AMDGPULowerKernelAttributes::processUse(CallInst
*CI
) {
// Function that contains the call.
74 Function
*F
= CI
->getParent()->getParent();
// The metadata is only considered usable when it has exactly three operands
// (one per dimension, indexed 0..2 below).
76 auto MD
= F
->getMetadata("reqd_work_group_size");
77 const bool HasReqdWorkGroupSize
= MD
&& MD
->getNumOperands() == 3;
// The attribute counts only when its string value is exactly "true".
79 const bool HasUniformWorkGroupSize
=
80 F
->getFnAttribute("uniform-work-group-size").getValueAsString() == "true";
// Neither source of information is available.
// NOTE(review): the statement guarded by this condition is not visible here.
82 if (!HasReqdWorkGroupSize
&& !HasUniformWorkGroupSize
)
// Loads found for each dispatch-packet field; null until a match is seen.
85 Value
*WorkGroupSizeX
= nullptr;
86 Value
*WorkGroupSizeY
= nullptr;
87 Value
*WorkGroupSizeZ
= nullptr;
// NOTE(review): the assignments to the GridSize* values are not visible in
// this excerpt.
89 Value
*GridSizeX
= nullptr;
90 Value
*GridSizeY
= nullptr;
91 Value
*GridSizeZ
= nullptr;
93 const DataLayout
&DL
= Mod
->getDataLayout();
95 // We expect to see several GEP users, each cast to the appropriate type and
// then loaded (user -> bitcast -> load, as matched below).
97 for (User
*U
: CI
->users()) {
// The user must be CI plus a constant byte offset.
// NOTE(review): the declaration of Offset and the statement guarded by this
// condition are not visible here.
102 if (GetPointerBaseWithConstantOffset(U
, Offset
, DL
) != CI
)
// Only the first user of U is inspected; it must be a bitcast with a single
// use.
105 auto *BCI
= dyn_cast
<BitCastInst
>(*U
->user_begin());
106 if (!BCI
|| !BCI
->hasOneUse())
// That single use must be a simple load.
109 auto *Load
= dyn_cast
<LoadInst
>(*BCI
->user_begin());
110 if (!Load
|| !Load
->isSimple())
// Store size of the loaded type.
// NOTE(review): the uses of LoadSize are not visible in this excerpt.
113 unsigned LoadSize
= DL
.getTypeStoreSize(Load
->getType());
115 // TODO: Handle merged loads.
// NOTE(review): the switch header is not visible; presumably it switches on
// Offset -- confirm. Each visible case records the load for one dimension.
117 case WORKGROUP_SIZE_X
:
119 WorkGroupSizeX
= Load
;
121 case WORKGROUP_SIZE_Y
:
123 WorkGroupSizeY
= Load
;
125 case WORKGROUP_SIZE_Z
:
127 WorkGroupSizeZ
= Load
;
146 // Pattern match the code used to handle partial workgroup dispatches in the
147 // library implementation of get_local_size, so the entire function can be
148 // constant folded with a known group size.
150 // uint r = grid_size - group_id * group_size;
151 // get_local_size = (r < group_size) ? r : group_size;
153 // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
154 // the grid_size is required to be a multiple of group_size. In this case:
156 // grid_size - (group_id * group_size) < group_size
158 // grid_size < group_size + (group_id * group_size)
160 // (grid_size / group_size) < 1 + group_id
162 // grid_size / group_size is at least 1, so we can conclude the select
163 // condition is false (except for group_id == 0, where the select result is
166 bool MadeChange
= false;
// Per-dimension views of the values collected above, indexed x=0, y=1, z=2.
167 Value
*WorkGroupSizes
[3] = { WorkGroupSizeX
, WorkGroupSizeY
, WorkGroupSizeZ
};
168 Value
*GridSizes
[3] = { GridSizeX
, GridSizeY
, GridSizeZ
};
// The select rewrite is attempted only when uniform-work-group-size is set;
// otherwise this loop does not execute at all.
170 for (int I
= 0; HasUniformWorkGroupSize
&& I
< 3; ++I
) {
171 Value
*GroupSize
= WorkGroupSizes
[I
];
172 Value
*GridSize
= GridSizes
[I
];
// Both loads are needed for this dimension.
// NOTE(review): the statement guarded by this condition is not visible here.
173 if (!GroupSize
|| !GridSize
)
// Walk the zero-extended uses of the group-size load.
// NOTE(review): no null check on ZextGroupSize is visible in this excerpt
// before it is dereferenced below -- confirm against the full source.
176 for (User
*U
: GroupSize
->users()) {
177 auto *ZextGroupSize
= dyn_cast
<ZExtInst
>(U
);
181 for (User
*ZextUser
: ZextGroupSize
->users()) {
182 auto *SI
= dyn_cast
<SelectInst
>(ZextUser
);
186 using namespace llvm::PatternMatch
;
// Matcher for the workgroup-id intrinsic of the current dimension.
187 auto GroupIDIntrin
= I
== 0 ?
188 m_Intrinsic
<Intrinsic::amdgcn_workgroup_id_x
>() :
189 (I
== 1 ? m_Intrinsic
<Intrinsic::amdgcn_workgroup_id_y
>() :
190 m_Intrinsic
<Intrinsic::amdgcn_workgroup_id_z
>());
// Matches: grid_size - group_id * zext(group_size).
192 auto SubExpr
= m_Sub(m_Specific(GridSize
),
193 m_Mul(GroupIDIntrin
, m_Specific(ZextGroupSize
)));
195 ICmpInst::Predicate Pred
;
// Select whose condition compares SubExpr against zext(group_size) with an
// unsigned less-than predicate and whose last operand is zext(group_size).
// NOTE(review): the start of this condition and the select's middle operand
// are not visible in this excerpt.
197 m_Select(m_ICmp(Pred
, SubExpr
, m_Specific(ZextGroupSize
)),
199 m_Specific(ZextGroupSize
))) &&
200 Pred
== ICmpInst::ICMP_ULT
) {
// With usable metadata the select is replaced by a constant built from
// metadata operand I; the other visible replacement uses the zero-extended
// load instead.
// NOTE(review): the remaining getIntegerCast arguments and the else keyword
// are not visible here.
201 if (HasReqdWorkGroupSize
) {
202 ConstantInt
*KnownSize
203 = mdconst::extract
<ConstantInt
>(MD
->getOperand(I
));
204 SI
->replaceAllUsesWith(ConstantExpr::getIntegerCast(KnownSize
,
208 SI
->replaceAllUsesWith(ZextGroupSize
);
// The code below reads MD operands, so it needs usable metadata.
// NOTE(review): the statement guarded by this condition is not visible here.
217 if (!HasReqdWorkGroupSize
)
220 // Eliminate any other loads we can from the dispatch packet.
221 for (int I
= 0; I
< 3; ++I
) {
222 Value
*GroupSize
= WorkGroupSizes
[I
];
// Replace every use of the group-size load with the metadata constant cast
// to the load's type.
// NOTE(review): any null check on GroupSize and the last getIntegerCast
// argument are not visible in this excerpt.
226 ConstantInt
*KnownSize
= mdconst::extract
<ConstantInt
>(MD
->getOperand(I
));
227 GroupSize
->replaceAllUsesWith(
228 ConstantExpr::getIntegerCast(KnownSize
,
229 GroupSize
->getType(),
237 // TODO: Move makeLIDRangeMetadata usage into here. We do not seem to get a
238 // TargetPassConfig for the subtarget.
// Pass entry point: finds the amdgcn_dispatch_ptr intrinsic in the module and
// visits each of its users once.
// NOTE(review): the loop body after the insert check and the return
// statements are not visible in this excerpt.
239 bool AMDGPULowerKernelAttributes::runOnModule(Module
&M
) {
240 StringRef DispatchPtrName
241 = Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr
);
// The lookup goes through the cached Mod member rather than the M parameter.
243 Function
*DispatchPtr
= Mod
->getFunction(DispatchPtrName
);
244 if (!DispatchPtr
) // Dispatch ptr not used.
247 bool MadeChange
= false;
// Tracks calls already seen so that each one is handled only once.
249 SmallPtrSet
<Instruction
*, 4> HandledUses
;
250 for (auto *U
: DispatchPtr
->users()) {
// cast<> (unlike dyn_cast<>) assumes every user of the intrinsic is a
// CallInst.
251 CallInst
*CI
= cast
<CallInst
>(U
);
// insert().second is true only for a newly inserted element.
// NOTE(review): the guarded body is not visible; presumably it calls
// processUse(CI) and accumulates into MadeChange -- confirm.
252 if (HandledUses
.insert(CI
).second
) {
// Pass registration under DEBUG_TYPE.
// NOTE(review): the description string "AMDGPU IR optimizations" differs from
// the name returned by getPassName() ("AMDGPU Kernel Attributes"); check
// whether that is intentional. The trailing arguments of INITIALIZE_PASS_END
// are not visible in this excerpt.
261 INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes
, DEBUG_TYPE
,
262 "AMDGPU IR optimizations", false, false)
263 INITIALIZE_PASS_END(AMDGPULowerKernelAttributes
, DEBUG_TYPE
, "AMDGPU IR optimizations",
// Definition of the pass ID that the constructor passes to ModulePass.
266 char AMDGPULowerKernelAttributes::ID
= 0;
// Factory: returns a newly allocated AMDGPULowerKernelAttributes pass as a
// ModulePass pointer.
268 ModulePass
*llvm::createAMDGPULowerKernelAttributesPass() {
269 return new AMDGPULowerKernelAttributes();