//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are naturally aligned
// only but not dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  UniformityInfo *UA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
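    // At least two known-zero low bits means the address is a multiple of 4.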
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
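  // Subtargets with native scalar sub-dword loads do not need this widening.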
  if (ST.hasScalarSubwordLoads())
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  bool Changed = false;
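  // Use an early-increment range: visitLoadInst may erase the instruction
  // being visited.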
  for (auto &BB : F)
    for (Instruction &I : llvm::make_early_inc_range(BB))
      Changed |= visit(I);

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address space.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // That load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return UA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if that load is already aligned on DWORD at least as it's handled in
  // SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;
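
  // Adjust is the byte offset of the load within its enclosing DWORD.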
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment could be promoted with a
    // better one.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);
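
  // Form the address of the DWORD-aligned word containing the original load,
  // i.e. Base + (Offset - Adjust), using a byte-sized GEP.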
  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
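  // The widened load reads bytes outside the original access, so any !range
  // metadata copied over from the original load is no longer valid.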
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);
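
  // On little-endian AMDGPU, the original value sits at byte offset Adjust
  // within the loaded DWORD: shift it down by Adjust * 8 bits, truncate to
  // the original bit width, and bitcast back to the original type.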
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
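  // The original load is now dead; erase it together with any address
  // computations that only it used.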
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}