[AMDGPU] Do not widen scalar loads on GFX12 (#78724)
llvm-project.git / llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;
// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass does not handle overlapping cases. In addition, this pass extends the
// widening to handle scalar sub-dword loads that are only naturally aligned
// rather than DWORD aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));
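// For debugging or triage, the widening can be disabled through the hidden
// flag declared above, e.g. (an illustrative invocation, not a recorded test
// line):
//   llc -mtriple=amdgcn -amdgpu-late-codegenprepare-widen-constant-loads=0 ...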
namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  UniformityInfo *UA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}
  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;
  bool visitInstruction(Instruction &) { return false; }
  // Check if the specified value is at least DWORD aligned, i.e. its low two
  // address bits are known to be zero.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace
bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}
bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  // Subtargets with native scalar sub-word loads, such as GFX12, do not need
  // this widening.
  if (ST.hasScalarSubwordLoads())
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  bool Changed = false;
  for (auto &BB : F)
    for (Instruction &I : llvm::make_early_inc_range(BB))
      Changed |= visit(I);

  return Changed;
}
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return UA->isUniform(&LI);
}
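// Note on the checks above: only uniform sub-DWORD loads from the constant
// address space are widened, since those are the loads selected as scalar
// (SMEM) loads, and scalar memory on subtargets without sub-word support
// reads at DWORD granularity anyway.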
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip loads that are already aligned to at least a DWORD, as those are
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  // Adjust is the load's byte offset within its enclosing DWORD.
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment can simply be promoted.
    LI.setAlignment(Align(4));
    return true;
  }
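  // With a non-zero adjust, load the whole DWORD containing the original
  // value and extract the sub-word by shifting. As an illustrative sketch
  // (not a recorded test case), a uniform
  //   %v = load i16, ptr addrspace(4) %p, align 2   ; %p == %base + 2
  // with a DWORD-aligned %base becomes
  //   %w = load i32, ptr addrspace(4) %base, align 4
  //   %s = lshr i32 %w, 16
  //   %v = trunc i32 %s to i16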
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  // Point at the DWORD-aligned address containing the original value.
  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  // The range metadata described the narrow value, not the whole DWORD; drop
  // it on the widened load.
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  // Shift the value into the low bits, then truncate and bitcast back to the
  // original type.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}
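// A minimal sketch of how this pass reaches the codegen pipeline (the actual
// hook lives in AMDGPUTargetMachine.cpp and may differ in detail):
//   addPass(createAMDGPULateCodeGenPreparePass());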