//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are only naturally
// aligned but not dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));
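
// Illustrative example of the kind of load this option targets (IR names are
// made up): a uniform, naturally aligned sub-dword load from the constant
// address space such as
//   %v = load i16, i16 addrspace(4)* %p, align 2
// which visitLoadInst below widens into a DWORD-aligned i32 load followed by
// a shift/truncate.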

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned, i.e. its low two
  // bits are known to be zero.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    // Advance the iterator before visiting, as a visit may erase the current
    // instruction.
    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
      Instruction *I = &*BI++;
      Changed |= visit(*I);
    }

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment can simply be promoted to a
    // better one.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  // The widened load reads extra bytes, so the original range metadata no
  // longer applies.
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  // Shift the loaded DWORD right by the sub-dword byte offset and truncate
  // back to the original width.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}