lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

   1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This pass tries to fuse DS instructions with close by immediate offsets.
  10 // This will fuse operations such as
  11 //  ds_read_b32 v0, v2 offset:16
  12 //  ds_read_b32 v1, v2 offset:32
  13 // ==>
  14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
  15 //
  16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
  17 //  s_buffer_load_dword s4, s[0:3], 4
  18 //  s_buffer_load_dword s5, s[0:3], 8
  19 // ==>
  20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
  21 //
  22 // This pass also tries to promote constant offset to the immediate by
  23 // adjusting the base. It tries to use a base from the nearby instructions that
  24 // allows it to have a 13bit constant offset and then promotes the 13bit offset
  25 // to the immediate.
  26 // E.g.
  27 //  s_movk_i32 s0, 0x1800
  28 //  v_add_co_u32_e32 v0, vcc, s0, v2
  29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
  30 //
  31 //  s_movk_i32 s0, 0x1000
  32 //  v_add_co_u32_e32 v5, vcc, s0, v2
  33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
  34 //  global_load_dwordx2 v[5:6], v[5:6], off
  35 //  global_load_dwordx2 v[0:1], v[0:1], off
  36 // =>
  37 //  s_movk_i32 s0, 0x1000
  38 //  v_add_co_u32_e32 v5, vcc, s0, v2
  39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
  40 //  global_load_dwordx2 v[5:6], v[5:6], off
  41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
  42 //
  43 // Future improvements:
  44 //
  45 // - This currently relies on the scheduler to place loads and stores next to
  46 //   each other, and then only merges adjacent pairs of instructions. It would
  47 //   be good to be more flexible with interleaved instructions, and possibly run
  48 //   before scheduling. It currently missing stores of constants because loading
  49 //   the constant into the data register is placed between the stores, although
  50 //   this is arguably a scheduling problem.
  51 //
  52 // - Live interval recomputing seems inefficient. This currently only matches
  53 //   one pair, and recomputes live intervals and moves on to the next pair. It
  54 //   would be better to compute a list of all merges that need to occur.
  55 //
  56 // - With a list of instructions to process, we can also merge more. If a
  57 //   cluster of loads have offsets that are too large to fit in the 8-bit
  58 //   offsets, but are close enough to fit in the 8 bits, we can add to the base
  59 //   pointer and use the new reduced offsets.
  60 //
  61 //===----------------------------------------------------------------------===//
  62
  63 #include "AMDGPU.h"
  64 #include "AMDGPUSubtarget.h"
  65 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  66 #include "SIInstrInfo.h"
  67 #include "SIRegisterInfo.h"
  68 #include "Utils/AMDGPUBaseInfo.h"
  69 #include "llvm/ADT/ArrayRef.h"
  70 #include "llvm/ADT/SmallVector.h"
  71 #include "llvm/ADT/StringRef.h"
  72 #include "llvm/Analysis/AliasAnalysis.h"
  73 #include "llvm/CodeGen/MachineBasicBlock.h"
  74 #include "llvm/CodeGen/MachineFunction.h"
  75 #include "llvm/CodeGen/MachineFunctionPass.h"
  76 #include "llvm/CodeGen/MachineInstr.h"
  77 #include "llvm/CodeGen/MachineInstrBuilder.h"
  78 #include "llvm/CodeGen/MachineOperand.h"
  79 #include "llvm/CodeGen/MachineRegisterInfo.h"
  80 #include "llvm/IR/DebugLoc.h"
  81 #include "llvm/Pass.h"
  82 #include "llvm/Support/Debug.h"
  83 #include "llvm/Support/MathExtras.h"
  84 #include "llvm/Support/raw_ostream.h"
  85 #include <algorithm>
  86 #include <cassert>
  87 #include <cstdlib>
  88 #include <iterator>
  89 #include <utility>
  90
  91 using namespace llvm;
  92
  93 #define DEBUG_TYPE "si-load-store-opt"
  94
  95 namespace {
  96 enum InstClassEnum {
  97   UNKNOWN,
  98   DS_READ,
  99   DS_WRITE,
 100   S_BUFFER_LOAD_IMM,
 101   BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
 102   BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
 103   BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
 104   BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
 105   BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
 106   BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
 107   BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
 108   BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
 109 };
 110
 111 enum RegisterEnum {
 112   SBASE = 0x1,
 113   SRSRC = 0x2,
 114   SOFFSET = 0x4,
 115   VADDR = 0x8,
 116   ADDR = 0x10,
 117 };
 118
 119 class SILoadStoreOptimizer : public MachineFunctionPass {
 120   struct CombineInfo {
 121     MachineBasicBlock::iterator I;
 122     MachineBasicBlock::iterator Paired;
 123     unsigned EltSize;
 124     unsigned Offset0;
 125     unsigned Offset1;
 126     unsigned Width0;
 127     unsigned Width1;
 128     unsigned BaseOff;
 129     InstClassEnum InstClass;
 130     bool GLC0;
 131     bool GLC1;
 132     bool SLC0;
 133     bool SLC1;
 134     bool UseST64;
 135     SmallVector<MachineInstr *, 8> InstsToMove;
 136   };
 137
 138   struct BaseRegisters {
 139     unsigned LoReg = 0;
 140     unsigned HiReg = 0;
 141
 142     unsigned LoSubReg = 0;
 143     unsigned HiSubReg = 0;
 144   };
 145
 146   struct MemAddress {
 147     BaseRegisters Base;
 148     int64_t Offset = 0;
 149   };
 150
 151   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
 152
 153 private:
 154   const GCNSubtarget *STM = nullptr;
 155   const SIInstrInfo *TII = nullptr;
 156   const SIRegisterInfo *TRI = nullptr;
 157   MachineRegisterInfo *MRI = nullptr;
 158   AliasAnalysis *AA = nullptr;
 159   bool OptimizeAgain;
 160
 161   static bool offsetsCanBeCombined(CombineInfo &CI);
 162   static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
 163   static unsigned getNewOpcode(const CombineInfo &CI);
 164   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
 165   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
 166   unsigned getOpcodeWidth(const MachineInstr &MI);
 167   InstClassEnum getInstClass(unsigned Opc);
 168   unsigned getRegs(unsigned Opc);
 169
 170   bool findMatchingInst(CombineInfo &CI);
 171
 172   unsigned read2Opcode(unsigned EltSize) const;
 173   unsigned read2ST64Opcode(unsigned EltSize) const;
 174   MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
 175
 176   unsigned write2Opcode(unsigned EltSize) const;
 177   unsigned write2ST64Opcode(unsigned EltSize) const;
 178   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
 179   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
 180   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
 181   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
 182
 183   void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
 184                            int32_t NewOffset);
 185   unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
 186   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
 187   Optional<int32_t> extractConstOffset(const MachineOperand &Op);
 188   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
 189   /// Promotes constant offset to the immediate by adjusting the base. It
 190   /// tries to use a base from the nearby instructions that allows it to have
 191   /// a 13bit constant offset which gets promoted to the immediate.
 192   bool promoteConstantOffsetToImm(MachineInstr &CI,
 193                                   MemInfoMap &Visited,
 194                                   SmallPtrSet<MachineInstr *, 4> &Promoted);
 195
 196 public:
 197   static char ID;
 198
 199   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
 200     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
 201   }
 202
 203   bool optimizeBlock(MachineBasicBlock &MBB);
 204
 205   bool runOnMachineFunction(MachineFunction &MF) override;
 206
 207   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
 208
 209   void getAnalysisUsage(AnalysisUsage &AU) const override {
 210     AU.setPreservesCFG();
 211     AU.addRequired<AAResultsWrapperPass>();
 212
 213     MachineFunctionPass::getAnalysisUsage(AU);
 214   }
 215 };
 216
 217 } // end anonymous namespace.
 218
 219 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
 220                       "SI Load Store Optimizer", false, false)
 221 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 222 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
 223                     false, false)
 224
 225 char SILoadStoreOptimizer::ID = 0;
 226
 227 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
 228
 229 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
 230   return new SILoadStoreOptimizer();
 231 }
 232
 233 static void moveInstsAfter(MachineBasicBlock::iterator I,
 234                            ArrayRef<MachineInstr *> InstsToMove) {
 235   MachineBasicBlock *MBB = I->getParent();
 236   ++I;
 237   for (MachineInstr *MI : InstsToMove) {
 238     MI->removeFromParent();
 239     MBB->insert(I, MI);
 240   }
 241 }
 242
 243 static void addDefsUsesToList(const MachineInstr &MI,
 244                               DenseSet<unsigned> &RegDefs,
 245                               DenseSet<unsigned> &PhysRegUses) {
 246   for (const MachineOperand &Op : MI.operands()) {
 247     if (Op.isReg()) {
 248       if (Op.isDef())
 249         RegDefs.insert(Op.getReg());
 250       else if (Op.readsReg() &&
 251                TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
 252         PhysRegUses.insert(Op.getReg());
 253     }
 254   }
 255 }
 256
 257 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
 258                                       MachineBasicBlock::iterator B,
 259                                       const SIInstrInfo *TII,
 260                                       AliasAnalysis *AA) {
 261   // RAW or WAR - cannot reorder
 262   // WAW - cannot reorder
 263   // RAR - safe to reorder
 264   return !(A->mayStore() || B->mayStore()) ||
 265          TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
 266 }
 267
 268 // Add MI and its defs to the lists if MI reads one of the defs that are
 269 // already in the list. Returns true in that case.
 270 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
 271                                   DenseSet<unsigned> &PhysRegUses,
 272                                   SmallVectorImpl<MachineInstr *> &Insts) {
 273   for (MachineOperand &Use : MI.operands()) {
 274     // If one of the defs is read, then there is a use of Def between I and the
 275     // instruction that I will potentially be merged with. We will need to move
 276     // this instruction after the merged instructions.
 277     //
 278     // Similarly, if there is a def which is read by an instruction that is to
 279     // be moved for merging, then we need to move the def-instruction as well.
 280     // This can only happen for physical registers such as M0; virtual
 281     // registers are in SSA form.
 282     if (Use.isReg() &&
 283         ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
 284          (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
 285           PhysRegUses.count(Use.getReg())))) {
 286       Insts.push_back(&MI);
 287       addDefsUsesToList(MI, RegDefs, PhysRegUses);
 288       return true;
 289     }
 290   }
 291
 292   return false;
 293 }
 294
 295 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
 296                                     ArrayRef<MachineInstr *> InstsToMove,
 297                                     const SIInstrInfo *TII, AliasAnalysis *AA) {
 298   assert(MemOp.mayLoadOrStore());
 299
 300   for (MachineInstr *InstToMove : InstsToMove) {
 301     if (!InstToMove->mayLoadOrStore())
 302       continue;
 303     if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
 304       return false;
 305   }
 306   return true;
 307 }
 308
 309 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
 310   // XXX - Would the same offset be OK? Is there any reason this would happen or
 311   // be useful?
 312   if (CI.Offset0 == CI.Offset1)
 313     return false;
 314
 315   // This won't be valid if the offset isn't aligned.
 316   if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
 317     return false;
 318
 319   unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
 320   unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
 321   CI.UseST64 = false;
 322   CI.BaseOff = 0;
 323
 324   // Handle SMEM and VMEM instructions.
 325   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
 326     return (EltOffset0 + CI.Width0 == EltOffset1 ||
 327             EltOffset1 + CI.Width1 == EltOffset0) &&
 328            CI.GLC0 == CI.GLC1 &&
 329            (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
 330   }
 331
 332   // If the offset in elements doesn't fit in 8-bits, we might be able to use
 333   // the stride 64 versions.
 334   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
 335       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
 336     CI.Offset0 = EltOffset0 / 64;
 337     CI.Offset1 = EltOffset1 / 64;
 338     CI.UseST64 = true;
 339     return true;
 340   }
 341
 342   // Check if the new offsets fit in the reduced 8-bit range.
 343   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
 344     CI.Offset0 = EltOffset0;
 345     CI.Offset1 = EltOffset1;
 346     return true;
 347   }
 348
 349   // Try to shift base address to decrease offsets.
 350   unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
 351   CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
 352
 353   if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
 354     CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
 355     CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
 356     CI.UseST64 = true;
 357     return true;
 358   }
 359
 360   if (isUInt<8>(OffsetDiff)) {
 361     CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
 362     CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
 363     return true;
 364   }
 365
 366   return false;
 367 }
 368
 369 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
 370                                      const CombineInfo &CI) {
 371   const unsigned Width = (CI.Width0 + CI.Width1);
 372   switch (CI.InstClass) {
 373   default:
 374     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
 375   case S_BUFFER_LOAD_IMM:
 376     switch (Width) {
 377     default:
 378       return false;
 379     case 2:
 380     case 4:
 381       return true;
 382     }
 383   }
 384 }
 385
 386 unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
 387   const unsigned Opc = MI.getOpcode();
 388
 389   if (TII->isMUBUF(MI)) {
 390     return AMDGPU::getMUBUFDwords(Opc);
 391   }
 392
 393   switch (Opc) {
 394   default:
 395     return 0;
 396   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
 397     return 1;
 398   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
 399     return 2;
 400   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
 401     return 4;
 402   }
 403 }
 404
 405 InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
 406   if (TII->isMUBUF(Opc)) {
 407     const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
 408
 409     // If we couldn't identify the opcode, bail out.
 410     if (baseOpcode == -1) {
 411       return UNKNOWN;
 412     }
 413
 414     switch (baseOpcode) {
 415     default:
 416       return UNKNOWN;
 417     case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
 418       return BUFFER_LOAD_OFFEN;
 419     case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
 420       return BUFFER_LOAD_OFFSET;
 421     case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
 422       return BUFFER_STORE_OFFEN;
 423     case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
 424       return BUFFER_STORE_OFFSET;
 425     case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
 426       return BUFFER_LOAD_OFFEN_exact;
 427     case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
 428       return BUFFER_LOAD_OFFSET_exact;
 429     case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
 430       return BUFFER_STORE_OFFEN_exact;
 431     case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
 432       return BUFFER_STORE_OFFSET_exact;
 433     }
 434   }
 435
 436   switch (Opc) {
 437   default:
 438     return UNKNOWN;
 439   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
 440   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
 441   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
 442     return S_BUFFER_LOAD_IMM;
 443   case AMDGPU::DS_READ_B32:
 444   case AMDGPU::DS_READ_B64:
 445   case AMDGPU::DS_READ_B32_gfx9:
 446   case AMDGPU::DS_READ_B64_gfx9:
 447     return DS_READ;
 448   case AMDGPU::DS_WRITE_B32:
 449   case AMDGPU::DS_WRITE_B64:
 450   case AMDGPU::DS_WRITE_B32_gfx9:
 451   case AMDGPU::DS_WRITE_B64_gfx9:
 452     return DS_WRITE;
 453   }
 454 }
 455
 456 unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
 457   if (TII->isMUBUF(Opc)) {
 458     unsigned result = 0;
 459
 460     if (AMDGPU::getMUBUFHasVAddr(Opc)) {
 461       result |= VADDR;
 462     }
 463
 464     if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
 465       result |= SRSRC;
 466     }
 467
 468     if (AMDGPU::getMUBUFHasSoffset(Opc)) {
 469       result |= SOFFSET;
 470     }
 471
 472     return result;
 473   }
 474
 475   switch (Opc) {
 476   default:
 477     return 0;
 478   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
 479   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
 480   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
 481     return SBASE;
 482   case AMDGPU::DS_READ_B32:
 483   case AMDGPU::DS_READ_B64:
 484   case AMDGPU::DS_READ_B32_gfx9:
 485   case AMDGPU::DS_READ_B64_gfx9:
 486   case AMDGPU::DS_WRITE_B32:
 487   case AMDGPU::DS_WRITE_B64:
 488   case AMDGPU::DS_WRITE_B32_gfx9:
 489   case AMDGPU::DS_WRITE_B64_gfx9:
 490     return ADDR;
 491   }
 492 }
 493
 494 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
 495   MachineBasicBlock *MBB = CI.I->getParent();
 496   MachineBasicBlock::iterator E = MBB->end();
 497   MachineBasicBlock::iterator MBBI = CI.I;
 498
 499   const unsigned Opc = CI.I->getOpcode();
 500   const InstClassEnum InstClass = getInstClass(Opc);
 501
 502   if (InstClass == UNKNOWN) {
 503     return false;
 504   }
 505
 506   const unsigned Regs = getRegs(Opc);
 507
 508   unsigned AddrOpName[5] = {0};
 509   int AddrIdx[5];
 510   const MachineOperand *AddrReg[5];
 511   unsigned NumAddresses = 0;
 512
 513   if (Regs & ADDR) {
 514     AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
 515   }
 516
 517   if (Regs & SBASE) {
 518     AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
 519   }
 520
 521   if (Regs & SRSRC) {
 522     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
 523   }
 524
 525   if (Regs & SOFFSET) {
 526     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
 527   }
 528
 529   if (Regs & VADDR) {
 530     AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
 531   }
 532
 533   for (unsigned i = 0; i < NumAddresses; i++) {
 534     AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
 535     AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
 536
 537     // We only ever merge operations with the same base address register, so
 538     // don't bother scanning forward if there are no other uses.
 539     if (AddrReg[i]->isReg() &&
 540         (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
 541          MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
 542       return false;
 543   }
 544
 545   ++MBBI;
 546
 547   DenseSet<unsigned> RegDefsToMove;
 548   DenseSet<unsigned> PhysRegUsesToMove;
 549   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
 550
 551   for (; MBBI != E; ++MBBI) {
 552     const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
 553
 554     if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
 555         (IsDS && (MBBI->getOpcode() != Opc))) {
 556       // This is not a matching DS instruction, but we can keep looking as
 557       // long as one of these conditions are met:
 558       // 1. It is safe to move I down past MBBI.
 559       // 2. It is safe to move MBBI down past the instruction that I will
 560       //    be merged into.
 561
 562       if (MBBI->hasUnmodeledSideEffects()) {
 563         // We can't re-order this instruction with respect to other memory
 564         // operations, so we fail both conditions mentioned above.
 565         return false;
 566       }
 567
 568       if (MBBI->mayLoadOrStore() &&
 569           (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
 570            !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
 571         // We fail condition #1, but we may still be able to satisfy condition
 572         // #2.  Add this instruction to the move list and then we will check
 573         // if condition #2 holds once we have selected the matching instruction.
 574         CI.InstsToMove.push_back(&*MBBI);
 575         addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
 576         continue;
 577       }
 578
 579       // When we match I with another DS instruction we will be moving I down
 580       // to the location of the matched instruction any uses of I will need to
 581       // be moved down as well.
 582       addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
 583                             CI.InstsToMove);
 584       continue;
 585     }
 586
 587     // Don't merge volatiles.
 588     if (MBBI->hasOrderedMemoryRef())
 589       return false;
 590
 591     // Handle a case like
 592     //   DS_WRITE_B32 addr, v, idx0
 593     //   w = DS_READ_B32 addr, idx0
 594     //   DS_WRITE_B32 addr, f(w), idx1
 595     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
 596     // merging of the two writes.
 597     if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
 598                               CI.InstsToMove))
 599       continue;
 600
 601     bool Match = true;
 602     for (unsigned i = 0; i < NumAddresses; i++) {
 603       const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
 604
 605       if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
 606         if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
 607             AddrReg[i]->getImm() != AddrRegNext.getImm()) {
 608           Match = false;
 609           break;
 610         }
 611         continue;
 612       }
 613
 614       // Check same base pointer. Be careful of subregisters, which can occur
 615       // with vectors of pointers.
 616       if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
 617           AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
 618         Match = false;
 619         break;
 620       }
 621     }
 622
 623     if (Match) {
 624       int OffsetIdx =
 625           AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
 626       CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
 627       CI.Width0 = getOpcodeWidth(*CI.I);
 628       CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
 629       CI.Width1 = getOpcodeWidth(*MBBI);
 630       CI.Paired = MBBI;
 631
 632       if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
 633         CI.Offset0 &= 0xffff;
 634         CI.Offset1 &= 0xffff;
 635       } else {
 636         CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
 637         CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
 638         if (CI.InstClass != S_BUFFER_LOAD_IMM) {
 639           CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
 640           CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
 641         }
 642       }
 643
 644       // Check both offsets fit in the reduced range.
 645       // We also need to go through the list of instructions that we plan to
 646       // move and make sure they are all safe to move down past the merged
 647       // instruction.
 648       if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
 649         if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
 650           return true;
 651     }
 652
 653     // We've found a load/store that we couldn't merge for some reason.
 654     // We could potentially keep looking, but we'd need to make sure that
 655     // it was safe to move I and also all the instruction in InstsToMove
 656     // down past this instruction.
 657     // check if we can move I across MBBI and if we can move all I's users
 658     if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
 659         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
 660       break;
 661   }
 662   return false;
 663 }
 664
 665 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
 666   if (STM->ldsRequiresM0Init())
 667     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
 668   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
 669 }
 670
 671 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
 672   if (STM->ldsRequiresM0Init())
 673     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
 674
 675   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
 676                         : AMDGPU::DS_READ2ST64_B64_gfx9;
 677 }
 678
 679 MachineBasicBlock::iterator
 680 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
 681   MachineBasicBlock *MBB = CI.I->getParent();
 682
 683   // Be careful, since the addresses could be subregisters themselves in weird
 684   // cases, like vectors of pointers.
 685   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
 686
 687   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
 688   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
 689
 690   unsigned NewOffset0 = CI.Offset0;
 691   unsigned NewOffset1 = CI.Offset1;
 692   unsigned Opc =
 693       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
 694
 695   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
 696   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
 697
 698   if (NewOffset0 > NewOffset1) {
 699     // Canonicalize the merged instruction so the smaller offset comes first.
 700     std::swap(NewOffset0, NewOffset1);
 701     std::swap(SubRegIdx0, SubRegIdx1);
 702   }
 703
 704   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
 705          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
 706
 707   const MCInstrDesc &Read2Desc = TII->get(Opc);
 708
 709   const TargetRegisterClass *SuperRC =
 710       (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
 711   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
 712
 713   DebugLoc DL = CI.I->getDebugLoc();
 714
 715   unsigned BaseReg = AddrReg->getReg();
 716   unsigned BaseSubReg = AddrReg->getSubReg();
 717   unsigned BaseRegFlags = 0;
 718   if (CI.BaseOff) {
 719     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
 720     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
 721         .addImm(CI.BaseOff);
 722
 723     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
 724     BaseRegFlags = RegState::Kill;
 725
 726     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
 727         .addReg(ImmReg)
 728         .addReg(AddrReg->getReg(), 0, BaseSubReg);
 729     BaseSubReg = 0;
 730   }
 731
 732   MachineInstrBuilder Read2 =
 733       BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
 734           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
 735           .addImm(NewOffset0)                        // offset0
 736           .addImm(NewOffset1)                        // offset1
 737           .addImm(0)                                 // gds
 738           .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 739
 740   (void)Read2;
 741
 742   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
 743
 744   // Copy to the old destination registers.
 745   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
 746       .add(*Dest0) // Copy to same destination including flags and sub reg.
 747       .addReg(DestReg, 0, SubRegIdx0);
 748   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
 749                             .add(*Dest1)
 750                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
 751
 752   moveInstsAfter(Copy1, CI.InstsToMove);
 753
 754   MachineBasicBlock::iterator Next = std::next(CI.I);
 755   CI.I->eraseFromParent();
 756   CI.Paired->eraseFromParent();
 757
 758   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
 759   return Next;
 760 }
 761
 762 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
 763   if (STM->ldsRequiresM0Init())
 764     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
 765   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
 766                         : AMDGPU::DS_WRITE2_B64_gfx9;
 767 }
 768
 769 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
 770   if (STM->ldsRequiresM0Init())
 771     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
 772                           : AMDGPU::DS_WRITE2ST64_B64;
 773
 774   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
 775                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
 776 }
 777
 778 MachineBasicBlock::iterator
 779 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
 780   MachineBasicBlock *MBB = CI.I->getParent();
 781
 782   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
 783   // sure we preserve the subregister index and any register flags set on them.
 784   const MachineOperand *AddrReg =
 785       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
 786   const MachineOperand *Data0 =
 787       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
 788   const MachineOperand *Data1 =
 789       TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
 790
 791   unsigned NewOffset0 = CI.Offset0;
 792   unsigned NewOffset1 = CI.Offset1;
 793   unsigned Opc =
 794       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
 795
 796   if (NewOffset0 > NewOffset1) {
 797     // Canonicalize the merged instruction so the smaller offset comes first.
 798     std::swap(NewOffset0, NewOffset1);
 799     std::swap(Data0, Data1);
 800   }
 801
 802   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
 803          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
 804
 805   const MCInstrDesc &Write2Desc = TII->get(Opc);
 806   DebugLoc DL = CI.I->getDebugLoc();
 807
 808   unsigned BaseReg = AddrReg->getReg();
 809   unsigned BaseSubReg = AddrReg->getSubReg();
 810   unsigned BaseRegFlags = 0;
 811   if (CI.BaseOff) {
 812     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
 813     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
 814         .addImm(CI.BaseOff);
 815
 816     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
 817     BaseRegFlags = RegState::Kill;
 818
 819     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
 820         .addReg(ImmReg)
 821         .addReg(AddrReg->getReg(), 0, BaseSubReg);
 822     BaseSubReg = 0;
 823   }
 824
 825   MachineInstrBuilder Write2 =
 826       BuildMI(*MBB, CI.Paired, DL, Write2Desc)
 827           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
 828           .add(*Data0)                               // data0
 829           .add(*Data1)                               // data1
 830           .addImm(NewOffset0)                        // offset0
 831           .addImm(NewOffset1)                        // offset1
 832           .addImm(0)                                 // gds
 833           .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 834
 835   moveInstsAfter(Write2, CI.InstsToMove);
 836
 837   MachineBasicBlock::iterator Next = std::next(CI.I);
 838   CI.I->eraseFromParent();
 839   CI.Paired->eraseFromParent();
 840
 841   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
 842   return Next;
 843 }
 844
 845 MachineBasicBlock::iterator
 846 SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
 847   MachineBasicBlock *MBB = CI.I->getParent();
 848   DebugLoc DL = CI.I->getDebugLoc();
 849   const unsigned Opcode = getNewOpcode(CI);
 850
 851   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
 852
 853   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
 854   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
 855
 856   BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
 857       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
 858       .addImm(MergedOffset) // offset
 859       .addImm(CI.GLC0)      // glc
 860       .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 861
 862   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
 863   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
 864   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
 865
 866   // Copy to the old destination registers.
 867   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
 868   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
 869   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
 870
 871   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
 872       .add(*Dest0) // Copy to same destination including flags and sub reg.
 873       .addReg(DestReg, 0, SubRegIdx0);
 874   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
 875                             .add(*Dest1)
 876                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
 877
 878   moveInstsAfter(Copy1, CI.InstsToMove);
 879
 880   MachineBasicBlock::iterator Next = std::next(CI.I);
 881   CI.I->eraseFromParent();
 882   CI.Paired->eraseFromParent();
 883   return Next;
 884 }
 885
 886 MachineBasicBlock::iterator
 887 SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
 888   MachineBasicBlock *MBB = CI.I->getParent();
 889   DebugLoc DL = CI.I->getDebugLoc();
 890
 891   const unsigned Opcode = getNewOpcode(CI);
 892
 893   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
 894
 895   // Copy to the new source register.
 896   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
 897   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
 898
 899   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
 900
 901   const unsigned Regs = getRegs(Opcode);
 902
 903   if (Regs & VADDR)
 904     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
 905
 906   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
 907       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
 908       .addImm(MergedOffset) // offset
 909       .addImm(CI.GLC0)      // glc
 910       .addImm(CI.SLC0)      // slc
 911       .addImm(0)            // tfe
 912       .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 913
 914   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
 915   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
 916   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
 917
 918   // Copy to the old destination registers.
 919   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
 920   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
 921   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
 922
 923   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
 924       .add(*Dest0) // Copy to same destination including flags and sub reg.
 925       .addReg(DestReg, 0, SubRegIdx0);
 926   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
 927                             .add(*Dest1)
 928                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
 929
 930   moveInstsAfter(Copy1, CI.InstsToMove);
 931
 932   MachineBasicBlock::iterator Next = std::next(CI.I);
 933   CI.I->eraseFromParent();
 934   CI.Paired->eraseFromParent();
 935   return Next;
 936 }
 937
 938 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
 939   const unsigned Width = CI.Width0 + CI.Width1;
 940
 941   switch (CI.InstClass) {
 942   default:
 943     return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
 944   case UNKNOWN:
 945     llvm_unreachable("Unknown instruction class");
 946   case S_BUFFER_LOAD_IMM:
 947     switch (Width) {
 948     default:
 949       return 0;
 950     case 2:
 951       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
 952     case 4:
 953       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
 954     }
 955   }
 956 }
 957
 958 std::pair<unsigned, unsigned>
 959 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
 960   if (CI.Offset0 > CI.Offset1) {
 961     switch (CI.Width0) {
 962     default:
 963       return std::make_pair(0, 0);
 964     case 1:
 965       switch (CI.Width1) {
 966       default:
 967         return std::make_pair(0, 0);
 968       case 1:
 969         return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
 970       case 2:
 971         return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
 972       case 3:
 973         return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
 974       }
 975     case 2:
 976       switch (CI.Width1) {
 977       default:
 978         return std::make_pair(0, 0);
 979       case 1:
 980         return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
 981       case 2:
 982         return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
 983       }
 984     case 3:
 985       switch (CI.Width1) {
 986       default:
 987         return std::make_pair(0, 0);
 988       case 1:
 989         return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
 990       }
 991     }
 992   } else {
 993     switch (CI.Width0) {
 994     default:
 995       return std::make_pair(0, 0);
 996     case 1:
 997       switch (CI.Width1) {
 998       default:
 999         return std::make_pair(0, 0);
1000       case 1:
1001         return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
1002       case 2:
1003         return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
1004       case 3:
1005         return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
1006       }
1007     case 2:
1008       switch (CI.Width1) {
1009       default:
1010         return std::make_pair(0, 0);
1011       case 1:
1012         return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
1013       case 2:
1014         return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
1015       }
1016     case 3:
1017       switch (CI.Width1) {
1018       default:
1019         return std::make_pair(0, 0);
1020       case 1:
1021         return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
1022       }
1023     }
1024   }
1025 }
1026
1027 const TargetRegisterClass *
1028 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
1029   if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1030     switch (CI.Width0 + CI.Width1) {
1031     default:
1032       return nullptr;
1033     case 2:
1034       return &AMDGPU::SReg_64_XEXECRegClass;
1035     case 4:
1036       return &AMDGPU::SReg_128RegClass;
1037     case 8:
1038       return &AMDGPU::SReg_256RegClass;
1039     case 16:
1040       return &AMDGPU::SReg_512RegClass;
1041     }
1042   } else {
1043     switch (CI.Width0 + CI.Width1) {
1044     default:
1045       return nullptr;
1046     case 2:
1047       return &AMDGPU::VReg_64RegClass;
1048     case 3:
1049       return &AMDGPU::VReg_96RegClass;
1050     case 4:
1051       return &AMDGPU::VReg_128RegClass;
1052     }
1053   }
1054 }
1055
1056 MachineBasicBlock::iterator
1057 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
1058   MachineBasicBlock *MBB = CI.I->getParent();
1059   DebugLoc DL = CI.I->getDebugLoc();
1060
1061   const unsigned Opcode = getNewOpcode(CI);
1062
1063   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1064   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1065   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1066
1067   // Copy to the new source register.
1068   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1069   unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
1070
1071   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1072   const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1073
1074   BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1075       .add(*Src0)
1076       .addImm(SubRegIdx0)
1077       .add(*Src1)
1078       .addImm(SubRegIdx1);
1079
1080   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1081                  .addReg(SrcReg, RegState::Kill);
1082
1083   const unsigned Regs = getRegs(Opcode);
1084
1085   if (Regs & VADDR)
1086     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1087
1088   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1089       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1090       .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
1091       .addImm(CI.GLC0)                          // glc
1092       .addImm(CI.SLC0)                          // slc
1093       .addImm(0)                                // tfe
1094       .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
1095
1096   moveInstsAfter(MIB, CI.InstsToMove);
1097
1098   MachineBasicBlock::iterator Next = std::next(CI.I);
1099   CI.I->eraseFromParent();
1100   CI.Paired->eraseFromParent();
1101   return Next;
1102 }
1103
1104 MachineOperand
1105 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
1106   APInt V(32, Val, true);
1107   if (TII->isInlineConstant(V))
1108     return MachineOperand::CreateImm(Val);
1109
1110   unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1111   MachineInstr *Mov =
1112   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1113           TII->get(AMDGPU::S_MOV_B32), Reg)
1114     .addImm(Val);
1115   (void)Mov;
1116   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1117   return MachineOperand::CreateReg(Reg, false);
1118 }
1119
1120 // Compute base address using Addr and return the final register.
1121 unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1122                                            const MemAddress &Addr) {
1123   MachineBasicBlock *MBB = MI.getParent();
1124   MachineBasicBlock::iterator MBBI = MI.getIterator();
1125   DebugLoc DL = MI.getDebugLoc();
1126
1127   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1128           Addr.Base.LoSubReg) &&
1129          "Expected 32-bit Base-Register-Low!!");
1130
1131   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1132           Addr.Base.HiSubReg) &&
1133          "Expected 32-bit Base-Register-Hi!!");
1134
1135   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1136   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1137   MachineOperand OffsetHi =
1138     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1139   unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
1140   unsigned DeadCarryReg =
1141     MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
1142
1143   unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1144   unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1145   MachineInstr *LoHalf =
1146     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
1147       .addReg(CarryReg, RegState::Define)
1148       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1149     .add(OffsetLo);
1150   (void)LoHalf;
1151   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1152
1153   MachineInstr *HiHalf =
1154   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1155     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1156     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1157     .add(OffsetHi)
1158     .addReg(CarryReg, RegState::Kill);
1159   (void)HiHalf;
1160   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1161
1162   unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
1163   MachineInstr *FullBase =
1164     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1165       .addReg(DestSub0)
1166       .addImm(AMDGPU::sub0)
1167       .addReg(DestSub1)
1168       .addImm(AMDGPU::sub1);
1169   (void)FullBase;
1170   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1171
1172   return FullDestReg;
1173 }
1174
1175 // Update base and offset with the NewBase and NewOffset in MI.
1176 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1177                                                unsigned NewBase,
1178                                                int32_t NewOffset) {
1179   TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
1180   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1181 }
1182
1183 Optional<int32_t>
1184 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
1185   if (Op.isImm())
1186     return Op.getImm();
1187
1188   if (!Op.isReg())
1189     return None;
1190
1191   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1192   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1193       !Def->getOperand(1).isImm())
1194     return None;
1195
1196   return Def->getOperand(1).getImm();
1197 }
1198
1199 // Analyze Base and extracts:
1200 //  - 32bit base registers, subregisters
1201 //  - 64bit constant offset
1202 // Expecting base computation as:
1203 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1204 //   %LO:vgpr_32, %c:sreg_64_xexec =
1205 //       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1206 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1207 //   %Base:vreg_64 =
1208 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1209 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1210                                                       MemAddress &Addr) {
1211   if (!Base.isReg())
1212     return;
1213
1214   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1215   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1216       || Def->getNumOperands() != 5)
1217     return;
1218
1219   MachineOperand BaseLo = Def->getOperand(1);
1220   MachineOperand BaseHi = Def->getOperand(3);
1221   if (!BaseLo.isReg() || !BaseHi.isReg())
1222     return;
1223
1224   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1225   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1226
1227   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1228       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1229     return;
1230
1231   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1232   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1233
1234   auto Offset0P = extractConstOffset(*Src0);
1235   if (Offset0P)
1236     BaseLo = *Src1;
1237   else {
1238     if (!(Offset0P = extractConstOffset(*Src1)))
1239       return;
1240     BaseLo = *Src0;
1241   }
1242
1243   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1244   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1245
1246   if (Src0->isImm())
1247     std::swap(Src0, Src1);
1248
1249   if (!Src1->isImm())
1250     return;
1251
1252   uint64_t Offset1 = Src1->getImm();
1253   BaseHi = *Src0;
1254
1255   Addr.Base.LoReg = BaseLo.getReg();
1256   Addr.Base.HiReg = BaseHi.getReg();
1257   Addr.Base.LoSubReg = BaseLo.getSubReg();
1258   Addr.Base.HiSubReg = BaseHi.getSubReg();
1259   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1260 }
1261
1262 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1263     MachineInstr &MI,
1264     MemInfoMap &Visited,
1265     SmallPtrSet<MachineInstr *, 4> &AnchorList) {
1266
1267   // TODO: Support flat and scratch.
1268   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
1269       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1270     return false;
1271
1272   // TODO: Support Store.
1273   if (!MI.mayLoad())
1274     return false;
1275
1276   if (AnchorList.count(&MI))
1277     return false;
1278
1279   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1280
1281   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1282     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
1283     return false;
1284   }
1285
1286   // Step1: Find the base-registers and a 64bit constant offset.
1287   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1288   MemAddress MAddr;
1289   if (Visited.find(&MI) == Visited.end()) {
1290     processBaseWithConstOffset(Base, MAddr);
1291     Visited[&MI] = MAddr;
1292   } else
1293     MAddr = Visited[&MI];
1294
1295   if (MAddr.Offset == 0) {
1296     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
1297                          " constant offsets that can be promoted.\n";);
1298     return false;
1299   }
1300
1301   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
1302              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1303
1304   // Step2: Traverse through MI's basic block and find an anchor(that has the
1305   // same base-registers) with the highest 13bit distance from MI's offset.
1306   // E.g. (64bit loads)
1307   // bb:
1308   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
1309   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
1310   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
1311   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
1312   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1313   //
1314   // Starting from the first load, the optimization will try to find a new base
1315   // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
1316   // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
1317   // as the new-base(anchor) because of the maximum distance which can
1318   // accomodate more intermediate bases presumeably.
1319   //
1320   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1321   // (&a + 8192) for load1, load2, load4.
1322   //   addr = &a + 8192
1323   //   load1 = load(addr,       -4096)
1324   //   load2 = load(addr,       -2048)
1325   //   load3 = load(addr,       0)
1326   //   load4 = load(addr,       2048)
1327   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1328   //
1329   MachineInstr *AnchorInst = nullptr;
1330   MemAddress AnchorAddr;
1331   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1332   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1333
1334   MachineBasicBlock *MBB = MI.getParent();
1335   MachineBasicBlock::iterator E = MBB->end();
1336   MachineBasicBlock::iterator MBBI = MI.getIterator();
1337   ++MBBI;
1338   const SITargetLowering *TLI =
1339     static_cast<const SITargetLowering *>(STM->getTargetLowering());
1340
1341   for ( ; MBBI != E; ++MBBI) {
1342     MachineInstr &MINext = *MBBI;
1343     // TODO: Support finding an anchor(with same base) from store addresses or
1344     // any other load addresses where the opcodes are different.
1345     if (MINext.getOpcode() != MI.getOpcode() ||
1346         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1347       continue;
1348
1349     const MachineOperand &BaseNext =
1350       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1351     MemAddress MAddrNext;
1352     if (Visited.find(&MINext) == Visited.end()) {
1353       processBaseWithConstOffset(BaseNext, MAddrNext);
1354       Visited[&MINext] = MAddrNext;
1355     } else
1356       MAddrNext = Visited[&MINext];
1357
1358     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1359         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1360         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1361         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1362       continue;
1363
1364     InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1365
1366     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1367     TargetLoweringBase::AddrMode AM;
1368     AM.HasBaseReg = true;
1369     AM.BaseOffs = Dist;
1370     if (TLI->isLegalGlobalAddressingMode(AM) &&
1371         (uint32_t)std::abs(Dist) > MaxDist) {
1372       MaxDist = std::abs(Dist);
1373
1374       AnchorAddr = MAddrNext;
1375       AnchorInst = &MINext;
1376     }
1377   }
1378
1379   if (AnchorInst) {
1380     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
1381                AnchorInst->dump());
1382     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
1383                <<  AnchorAddr.Offset << "\n\n");
1384
1385     // Instead of moving up, just re-compute anchor-instruction's base address.
1386     unsigned Base = computeBase(MI, AnchorAddr);
1387
1388     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1389     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
1390
1391     for (auto P : InstsWCommonBase) {
1392       TargetLoweringBase::AddrMode AM;
1393       AM.HasBaseReg = true;
1394       AM.BaseOffs = P.second - AnchorAddr.Offset;
1395
1396       if (TLI->isLegalGlobalAddressingMode(AM)) {
1397         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
1398                    dbgs() << ")"; P.first->dump());
1399         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1400         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
1401       }
1402     }
1403     AnchorList.insert(AnchorInst);
1404     return true;
1405   }
1406
1407   return false;
1408 }
1409
1410 // Scan through looking for adjacent LDS operations with constant offsets from
1411 // the same base register. We rely on the scheduler to do the hard work of
1412 // clustering nearby loads, and assume these are all adjacent.
1413 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
1414   bool Modified = false;
1415
1416   // Contain the list
1417   MemInfoMap Visited;
1418   // Contains the list of instructions for which constant offsets are being
1419   // promoted to the IMM.
1420   SmallPtrSet<MachineInstr *, 4> AnchorList;
1421
1422   for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
1423     MachineInstr &MI = *I;
1424
1425     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1426       Modified = true;
1427
1428     // Don't combine if volatile.
1429     if (MI.hasOrderedMemoryRef()) {
1430       ++I;
1431       continue;
1432     }
1433
1434     const unsigned Opc = MI.getOpcode();
1435
1436     CombineInfo CI;
1437     CI.I = I;
1438     CI.InstClass = getInstClass(Opc);
1439
1440     switch (CI.InstClass) {
1441     default:
1442       break;
1443     case DS_READ:
1444       CI.EltSize =
1445           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
1446                                                                           : 4;
1447       if (findMatchingInst(CI)) {
1448         Modified = true;
1449         I = mergeRead2Pair(CI);
1450       } else {
1451         ++I;
1452       }
1453       continue;
1454     case DS_WRITE:
1455       CI.EltSize =
1456           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
1457                                                                             : 4;
1458       if (findMatchingInst(CI)) {
1459         Modified = true;
1460         I = mergeWrite2Pair(CI);
1461       } else {
1462         ++I;
1463       }
1464       continue;
1465     case S_BUFFER_LOAD_IMM:
1466       CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
1467       if (findMatchingInst(CI)) {
1468         Modified = true;
1469         I = mergeSBufferLoadImmPair(CI);
1470         OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
1471       } else {
1472         ++I;
1473       }
1474       continue;
1475     case BUFFER_LOAD_OFFEN:
1476     case BUFFER_LOAD_OFFSET:
1477     case BUFFER_LOAD_OFFEN_exact:
1478     case BUFFER_LOAD_OFFSET_exact:
1479       CI.EltSize = 4;
1480       if (findMatchingInst(CI)) {
1481         Modified = true;
1482         I = mergeBufferLoadPair(CI);
1483         OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1484       } else {
1485         ++I;
1486       }
1487       continue;
1488     case BUFFER_STORE_OFFEN:
1489     case BUFFER_STORE_OFFSET:
1490     case BUFFER_STORE_OFFEN_exact:
1491     case BUFFER_STORE_OFFSET_exact:
1492       CI.EltSize = 4;
1493       if (findMatchingInst(CI)) {
1494         Modified = true;
1495         I = mergeBufferStorePair(CI);
1496         OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1497       } else {
1498         ++I;
1499       }
1500       continue;
1501     }
1502
1503     ++I;
1504   }
1505
1506   return Modified;
1507 }
1508
1509 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
1510   if (skipFunction(MF.getFunction()))
1511     return false;
1512
1513   STM = &MF.getSubtarget<GCNSubtarget>();
1514   if (!STM->loadStoreOptEnabled())
1515     return false;
1516
1517   TII = STM->getInstrInfo();
1518   TRI = &TII->getRegisterInfo();
1519
1520   MRI = &MF.getRegInfo();
1521   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1522
1523   assert(MRI->isSSA() && "Must be run on SSA");
1524
1525   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
1526
1527   bool Modified = false;
1528
1529   for (MachineBasicBlock &MBB : MF) {
1530     do {
1531       OptimizeAgain = false;
1532       Modified |= optimizeBlock(MBB);
1533     } while (OptimizeAgain);
1534   }
1535
1536   return Modified;
1537 }