//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;
// Scalar load widening needs running after load-store-vectorizer as that pass
// doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are naturally aligned
// only but not dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));
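// For illustration (schematic IR only; the actual pointer computation is
// built in visitLoadInst below), a naturally aligned but not dword-aligned
// uniform load such as
//
//   %v = load i16, i16 addrspace(4)* %p, align 2
//
// is widened into a dword-aligned load of the enclosing dword followed by a
// shift and truncate, roughly:
//
//   %w = load i32, i32 addrspace(4)* %p.dword, align 4
//   %s = lshr i32 %w, <byte offset within the dword * 8>
//   %v = trunc i32 %s to i16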
namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
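    // Advance the iterator before visiting: a successful visitLoadInst may
    // erase the current load instruction.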
    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
      Instruction *I = &*BI++;
      Changed |= visit(*I);
    }

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address space.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // That load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;
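  // Adjust is the byte position of the loaded value within its containing
  // dword (Offset modulo 4). Since Offset - Adjust is then a multiple of 4
  // and Base is known DWORD aligned, the widened i32 access built below is
  // itself DWORD aligned.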
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment can simply be promoted to a
    // better one.
    LI.setAlignment(Align(4));
    return true;
  }
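  // Non-zero adjust: load the whole enclosing dword and extract the original
  // value from it. E.g. for an i16 load at Offset = 6 (Adjust = 2), an i32 is
  // loaded at byte offset 4, shifted right by Adjust * 8 = 16 bits, truncated
  // back to 16 bits, and finally bitcast to the original type if needed.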
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}