//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;
// Scalar load widening needs running after load-store-vectorizer as that pass
// doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are naturally aligned
// only but not dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));
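// For illustration (schematic IR only; the actual pointer computation is
// built in visitLoadInst below), a naturally aligned but not dword-aligned
// uniform load such as
//
//   %v = load i16, i16 addrspace(4)* %p, align 2
//
// is widened into a dword-aligned load of the enclosing dword followed by a
// shift and truncate, roughly:
//
//   %w = load i32, i32 addrspace(4)* %p.dword, align 4
//   %s = lshr i32 %w, <byte offset within the dword * 8>
//   %v = trunc i32 %s to i16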
namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
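    // Advance the iterator before visiting: a successful visitLoadInst may
    // erase the current load instruction.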
    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
      Instruction *I = &*BI++;
      Changed |= visit(*I);
    }

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address space.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // That load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;
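  // Adjust is the byte position of the loaded value within its containing
  // dword (Offset modulo 4). Since Offset - Adjust is then a multiple of 4
  // and Base is known DWORD aligned, the widened i32 access built below is
  // itself DWORD aligned.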
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment can simply be promoted to a
    // better one.
    LI.setAlignment(Align(4));
    return true;
  }
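  // Non-zero adjust: load the whole enclosing dword and extract the original
  // value from it. E.g. for an i16 load at Offset = 6 (Adjust = 2), an i32 is
  // loaded at byte offset 4, shifted right by Adjust * 8 = 16 bits, truncated
  // back to 16 bits, and finally bitcast to the original type if needed.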
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}