llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp

   1 //===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 /// The pass tries to use the 32-bit encoding for instructions when possible.
   8 //===----------------------------------------------------------------------===//
   9 //
  10
  11 #include "AMDGPU.h"
  12 #include "GCNSubtarget.h"
  13 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  14 #include "Utils/AMDGPUBaseInfo.h"
  15 #include "llvm/ADT/Statistic.h"
  16 #include "llvm/CodeGen/MachineFunctionPass.h"
  17
  18 #define DEBUG_TYPE "si-shrink-instructions"
  19
     // Pass statistics. NumLiteralConstantsFolded is incremented by
     // foldImmediates() each time it folds a move-immediate source into an
     // instruction.
  20 STATISTIC(NumInstructionsShrunk,
  21           "Number of 64-bit instruction reduced to 32-bit.");
  22 STATISTIC(NumLiteralConstantsFolded,
  23           "Number of literal constants folded into 32-bit instructions.");
  24
  25 using namespace llvm;
  26
  27 namespace {
  28
     /// Machine function pass that rewrites AMDGPU machine instructions into
     /// shorter forms (32-bit encodings, SOPK forms, non-NSA MIMG, v_swap_b32,
     /// ...) via the helper methods declared below.
  29 class SIShrinkInstructions : public MachineFunctionPass {
       // Per-function context. All five pointers are assigned at the top of
       // runOnMachineFunction() and are not initialized by the constructor.
  30   MachineFunction *MF;
  31   MachineRegisterInfo *MRI;
  32   const GCNSubtarget *ST;
  33   const SIInstrInfo *TII;
  34   const SIRegisterInfo *TRI;
  35
  36 public:
       // Pass identifier handed to the MachineFunctionPass constructor.
  37   static char ID;
  38
  39 public:
  40   SIShrinkInstructions() : MachineFunctionPass(ID) {
  41   }
  42
       // Folds a move-immediate feeding src0 of a VOP1/VOP2/VOPC instruction
       // into the instruction, optionally retrying after commuting it.
  43   bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
       // False if an explicit register operand is in VGPR_32 but not in
       // VGPR_32_Lo128.
  44   bool shouldShrinkTrue16(MachineInstr &MI) const;
       // Predicates for immediates that fit a signed / unsigned 16-bit field and
       // are not inline constants.
  45   bool isKImmOperand(const MachineOperand &Src) const;
  46   bool isKUImmOperand(const MachineOperand &Src) const;
  47   bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
       // Copies implicit register / regmask operands beyond the instruction
       // descriptor from MI to NewMI.
  48   void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
       // In-place shrinking of individual instruction kinds.
  49   void shrinkScalarCompare(MachineInstr &MI) const;
  50   void shrinkMIMG(MachineInstr &MI) const;
  51   void shrinkMadFma(MachineInstr &MI) const;
  52   bool shrinkScalarLogicOp(MachineInstr &MI) const;
  53   bool tryReplaceDeadSDST(MachineInstr &MI) const;
       // Sub-register aware register access queries over a range of operands
       // (instAccessReg) or over an instruction's uses() / defs().
  54   bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
  55                      Register Reg, unsigned SubReg) const;
  56   bool instReadsReg(const MachineInstr *MI, unsigned Reg,
  57                     unsigned SubReg) const;
  58   bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
  59                        unsigned SubReg) const;
       // Selects the register / sub-register pair for index I of a register
       // that is not 32 bits wide.
  60   TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
  61                                                    unsigned I) const;
       // Erases MI after emitting IMPLICIT_DEFs for its extra def operands.
  62   void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
       // Matches a three-move swap starting at MovT and emits v_swap_b32.
  63   MachineInstr *matchSwap(MachineInstr &MovT) const;
  64
  65   bool runOnMachineFunction(MachineFunction &MF) override;
  66
  67   StringRef getPassName() const override { return "SI Shrink Instructions"; }
  68
       // The pass declares that it preserves the CFG, then defers to the base
       // class for the remaining analysis usage.
  69   void getAnalysisUsage(AnalysisUsage &AU) const override {
  70     AU.setPreservesCFG();
  71     MachineFunctionPass::getAnalysisUsage(AU);
  72   }
  73 };
  74
  75 } // End anonymous namespace.
  76
  77 INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
  78                 "SI Shrink Instructions", false, false)
     // NOTE(review): the two trailing `false` arguments are presumably the
     // CFG-only and is-analysis flags of INITIALIZE_PASS - confirm against the
     // macro definition.
  79
     // Storage for the static pass ID that the constructor passes to
     // MachineFunctionPass.
  80 char SIShrinkInstructions::ID = 0;
  81
  82 FunctionPass *llvm::createSIShrinkInstructionsPass() {
       // Returns a newly heap-allocated, default-constructed pass instance.
  83   return new SIShrinkInstructions;
  84 }
  85
  86 /// This function checks \p MI for operands defined by a move immediate
  87 /// instruction and then folds the literal constant into the instruction if it
  88 /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
     ///
     /// Only src0 is inspected directly: when it is a virtual register whose
     /// unique definition is a move-immediate, the move's source (immediate,
     /// frame index or global address) replaces src0 if TII reports it legal
     /// there. The defining move is erased once it has no non-debug uses left.
     ///
     /// \param TryToCommute when true and src0 could not be folded, \p MI is
     /// commuted and the fold is retried once; the commute is undone on failure.
     /// \returns true if an operand of \p MI was replaced.
  89 bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
  90                                           bool TryToCommute) const {
  91   assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
  92
  93   int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
  94
  95   // Try to fold Src0
  96   MachineOperand &Src0 = MI.getOperand(Src0Idx);
  97   if (Src0.isReg()) {
  98     Register Reg = Src0.getReg();
  99     if (Reg.isVirtual()) {
 100       MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
 101       if (Def && Def->isMoveImmediate()) {
             // Operand 1 of the defining move is used as the value to fold.
 102         MachineOperand &MovSrc = Def->getOperand(1);
 103         bool ConstantFolded = false;
 104
 105         if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
 106           if (MovSrc.isImm()) {
 107             Src0.ChangeToImmediate(MovSrc.getImm());
 108             ConstantFolded = true;
 109           } else if (MovSrc.isFI()) {
 110             Src0.ChangeToFrameIndex(MovSrc.getIndex());
 111             ConstantFolded = true;
 112           } else if (MovSrc.isGlobal()) {
 113             Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
 114                             MovSrc.getTargetFlags());
 115             ConstantFolded = true;
 116           }
 117         }
 118
 119         if (ConstantFolded) {
               // Reg was captured before Src0 was rewritten, so it still names
               // the register the move defined.
 120           if (MRI->use_nodbg_empty(Reg))
 121             Def->eraseFromParent();
 122           ++NumLiteralConstantsFolded;
 123           return true;
 124         }
 125       }
 126     }
 127   }
 128
 129   // We have failed to fold src0, so commute the instruction and try again.
 130   if (TryToCommute && MI.isCommutable()) {
 131     if (TII->commuteInstruction(MI)) {
           // Recurse with TryToCommute = false so the retry cannot commute again.
 132       if (foldImmediates(MI, false))
 133         return true;
 134
 135       // Commute back.
 136       TII->commuteInstruction(MI);
 137     }
 138   }
 139
 140   return false;
 141 }
 142
 143 /// Decide whether \p MI may be shrunk to a True16 encoding. Returns false as
 144 /// soon as an explicit register operand is in VGPR_32 but not VGPR_32_Lo128.
 145 bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
 146   const unsigned NumExplicit = MI.getNumExplicitOperands();
 147   for (unsigned OpNo = 0; OpNo != NumExplicit; ++OpNo) {
 148     const MachineOperand &Opnd = MI.getOperand(OpNo);
 149     if (!Opnd.isReg())
 150       continue;
 151     const Register R = Opnd.getReg();
 152     assert(!R.isVirtual() && "Prior checks should ensure we only shrink "
                                  "True16 Instructions post-RA");
 153     if (AMDGPU::VGPR_32RegClass.contains(R) &&
 154         !AMDGPU::VGPR_32_Lo128RegClass.contains(R))
 155       return false;
 156   }
 157   return true;
 158 }
 159
 160 bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
       // Accept an immediate whose low 32 bits, sign-extended, fit in a signed
       // 16-bit field, provided the operand is not already an inline constant.
 161   const bool FitsSigned16 = isInt<16>(SignExtend64(Src.getImm(), 32));
 162   return FitsSigned16 &&
              !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
 163 }
 164
 165 bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
       // Accept an immediate that fits in an unsigned 16-bit field, provided the
       // operand is not already an inline constant.
 166   const bool FitsUnsigned16 = isUInt<16>(Src.getImm());
 167   return FitsUnsigned16 &&
              !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
 168 }
 169
 170 bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
 171                                                 bool &IsUnsigned) const {
 172   // Classify the immediate as a signed and/or unsigned 16-bit value.
 173   const int64_t Imm = Src.getImm();
 174   const bool FitsSigned = isInt<16>(SignExtend64(Imm, 32));
 175   const bool FitsUnsigned = isUInt<16>(Imm);
 176   if (!FitsSigned && !FitsUnsigned)
 177     return false;
 178
 179   // The signed interpretation wins when both fit. IsUnsigned is written only
 180   // on this path, so it is left untouched when neither form fits.
 181   IsUnsigned = !FitsSigned;
 182   return !TII->isInlineConstant(Src);
 183 }
 184
 185 /// Look for a single instruction that can replace a 32-bit move of the literal
 186 /// \p Src and whose own source operand would be an inline constant.
 187 ///
 188 /// Candidates are tried in this order:
 189 ///  - vector only (\p Scalar is false): if ~Src is an inline constant, the
 190 ///    result is V_NOT_B32_e32 and \p ModifiedImm holds ~Src;
 191 ///  - otherwise, if the bit-reversed Src is an inline constant, the result is
 192 ///    S_BREV_B32 or V_BFREV_B32_e32 and \p ModifiedImm holds the reversed bits.
 193 /// \returns the opcode, or 0 if none applies (\p ModifiedImm may be clobbered).
 194 static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII,
 195                                          const MachineOperand &Src,
 196                                          int32_t &ModifiedImm, bool Scalar) {
 197   if (TII->isInlineConstant(Src))
 198     return 0;
 199
 200   const int32_t Literal = static_cast<int32_t>(Src.getImm());
 201   if (!Scalar) {
 202     // The scalar form is deliberately skipped: S_NOT_B32 clobbers SCC, so its
 203     // liveness would have to be checked first, and s_movk_i32 already covers
 204     // the reasonable values.
 205     ModifiedImm = ~Literal;
 206     if (TII->isInlineConstant(APInt(32, ModifiedImm)))
 207       return AMDGPU::V_NOT_B32_e32;
 208   }
 209
 210   ModifiedImm = reverseBits<int32_t>(Literal);
 211   const bool ReversedIsInline = TII->isInlineConstant(APInt(32, ModifiedImm));
 212   if (!ReversedIsInline)
 213     return 0;
 214   return Scalar ? AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32;
 215 }
 216
 217 /// Append to \p NewMI every operand of \p MI that lies beyond those described
 218 /// by MI's instruction descriptor and is an implicit register or a regmask.
 219 void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
 220                                                 MachineInstr &MI) const {
 221   MachineFunction &Fn = *MI.getMF();
 222   const MCInstrDesc &Desc = MI.getDesc();
 223   unsigned OpIdx = Desc.getNumOperands() + Desc.implicit_uses().size() +
 224                    Desc.implicit_defs().size();
 225   for (const unsigned NumOps = MI.getNumOperands(); OpIdx != NumOps; ++OpIdx) {
 226     const MachineOperand &Extra = MI.getOperand(OpIdx);
 227     const bool IsImplicitReg = Extra.isReg() && Extra.isImplicit();
 228     if (IsImplicitReg || Extra.isRegMask())
 229       NewMI.addOperand(Fn, Extra);
 230   }
 231 }
 232
 233 void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
       // Rewrites a scalar compare in place to its SOPK opcode when operand 0 is
       // a register and operand 1 is an immediate accepted by the 16-bit
       // immediate predicates. MI is left on its original opcode otherwise,
       // although its operands may already have been commuted.
 234   if (!ST->hasSCmpK())
 235     return;
 236
 237   // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
 238   // get constants on the RHS.
 239   if (!MI.getOperand(0).isReg())
 240     TII->commuteInstruction(MI, false, 0, 1);
 241
 242   // cmpk requires src0 to be a register
       // (the commute result is not checked; the operands are re-tested here).
 243   const MachineOperand &Src0 = MI.getOperand(0);
 244   if (!Src0.isReg())
 245     return;
 246
 247   MachineOperand &Src1 = MI.getOperand(1);
 248   if (!Src1.isImm())
 249     return;
 250
       // -1 is treated as "no SOPK opcode for this compare".
 251   int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
 252   if (SOPKOpc == -1)
 253     return;
 254
 255   // eq/ne is special because the imm16 can be treated as signed or unsigned,
 256   // and initially selected to the unsigned versions.
 257   if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
 258     bool HasUImm;
 259     if (isKImmOrKUImmOperand(Src1, HasUImm)) {
 260       if (!HasUImm) {
             // The immediate only fits as signed: switch to the _I32 opcode and
             // store the value sign-extended from 32 bits.
 261         SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
 262           AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
 263         Src1.setImm(SignExtend32(Src1.getImm(), 32));
 264       }
 265
 266       MI.setDesc(TII->get(SOPKOpc));
 267     }
 268
 269     return;
 270   }
 271
 272   const MCInstrDesc &NewDesc = TII->get(SOPKOpc);
 273
       // Zero-extending SOPK opcodes need an unsigned 16-bit immediate, the
       // others a signed one; signed immediates are stored sign-extended.
 274   if ((SIInstrInfo::sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
 275       (!SIInstrInfo::sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
 276     if (!SIInstrInfo::sopkIsZext(SOPKOpc))
 277       Src1.setImm(SignExtend64(Src1.getImm(), 32));
 278     MI.setDesc(NewDesc);
 279   }
 280 }
 281
 282 // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
     //
     // Nothing is changed unless the examined address operands occupy consecutive
     // hardware VGPR indices. On success the opcode is switched to the default
     // encoding, the examined address operands are collapsed into one wide
     // register operand at vaddr0, and an implicit tied operand (present when
     // TFE/LWE is enabled) is re-tied to vdata at its shifted index.
 283 void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
 284   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
 285   if (!Info)
 286     return;
 287
       // Only the two NSA encodings are handled; each maps to the default
       // encoding of the same generation.
 288   uint8_t NewEncoding;
 289   switch (Info->MIMGEncoding) {
 290   case AMDGPU::MIMGEncGfx10NSA:
 291     NewEncoding = AMDGPU::MIMGEncGfx10Default;
 292     break;
 293   case AMDGPU::MIMGEncGfx11NSA:
 294     NewEncoding = AMDGPU::MIMGEncGfx11Default;
 295     break;
 296   default:
 297     return;
 298   }
 299
 300   int VAddr0Idx =
 301       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
 302   unsigned NewAddrDwords = Info->VAddrDwords;
 303   const TargetRegisterClass *RC;
 304
       // Pick the register class matching the address size in dwords. Sizes
       // outside 2..12 fall back to the 512-bit class with a 16-dword address.
 305   if (Info->VAddrDwords == 2) {
 306     RC = &AMDGPU::VReg_64RegClass;
 307   } else if (Info->VAddrDwords == 3) {
 308     RC = &AMDGPU::VReg_96RegClass;
 309   } else if (Info->VAddrDwords == 4) {
 310     RC = &AMDGPU::VReg_128RegClass;
 311   } else if (Info->VAddrDwords == 5) {
 312     RC = &AMDGPU::VReg_160RegClass;
 313   } else if (Info->VAddrDwords == 6) {
 314     RC = &AMDGPU::VReg_192RegClass;
 315   } else if (Info->VAddrDwords == 7) {
 316     RC = &AMDGPU::VReg_224RegClass;
 317   } else if (Info->VAddrDwords == 8) {
 318     RC = &AMDGPU::VReg_256RegClass;
 319   } else if (Info->VAddrDwords == 9) {
 320     RC = &AMDGPU::VReg_288RegClass;
 321   } else if (Info->VAddrDwords == 10) {
 322     RC = &AMDGPU::VReg_320RegClass;
 323   } else if (Info->VAddrDwords == 11) {
 324     RC = &AMDGPU::VReg_352RegClass;
 325   } else if (Info->VAddrDwords == 12) {
 326     RC = &AMDGPU::VReg_384RegClass;
 327   } else {
 328     RC = &AMDGPU::VReg_512RegClass;
 329     NewAddrDwords = 16;
 330   }
 331
 332   unsigned VgprBase = 0;
 333   unsigned NextVgpr = 0;
       // IsUndef stays true only if every examined operand is undef. IsKill stays
       // true only if the address width is unchanged and every operand is a kill.
 334   bool IsUndef = true;
 335   bool IsKill = NewAddrDwords == Info->VAddrDwords;
 336   const unsigned NSAMaxSize = ST->getNSAMaxSize();
 337   const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
 338   const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
       // Walk the address operands and require each one to start at the hardware
       // register index where the previous one ended.
 339   for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
 340     const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
 341     unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
 342     unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
 343     assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");
 344
 345     if (Idx == 0) {
 346       VgprBase = Vgpr;
 347       NextVgpr = Vgpr + Dwords;
 348     } else if (Vgpr == NextVgpr) {
 349       NextVgpr = Vgpr + Dwords;
 350     } else {
 351       return;
 352     }
 353
 354     if (!Op.isUndef())
 355       IsUndef = false;
 356     if (!Op.isKill())
 357       IsKill = false;
 358   }
 359
       // Give up if the combined register would extend past index 255.
 360   if (VgprBase + NewAddrDwords > 256)
 361     return;
 362
 363   // Further check for implicit tied operands - this may be present if TFE is
 364   // enabled
 365   int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
 366   int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
 367   unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
 368   unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
 369   int ToUntie = -1;
 370   if (TFEVal || LWEVal) {
 371     // TFE/LWE is enabled so we need to deal with an implicit tied operand
         // The search starts just past the lwe operand index.
 372     for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
 373       if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
 374           MI.getOperand(i).isImplicit()) {
 375         // This is the tied operand
 376         assert(
 377             ToUntie == -1 &&
 378             "found more than one tied implicit operand when expecting only 1");
 379         ToUntie = i;
 380         MI.untieRegOperand(ToUntie);
 381       }
 382     }
 383   }
 384
 385   unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
 386                                              Info->VDataDwords, NewAddrDwords);
 387   MI.setDesc(TII->get(NewOpcode));
 388   MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
 389   MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
 390   MI.getOperand(VAddr0Idx).setIsKill(IsKill);
 391
       // Each removal shifts the following operands down by one, so the same
       // index is removed EndVAddr - 1 times.
 392   for (unsigned i = 1; i < EndVAddr; ++i)
 393     MI.removeOperand(VAddr0Idx + 1);
 394
       // Re-tie the implicit operand at its new index, which moved down by the
       // number of operands removed above.
 395   if (ToUntie >= 0) {
 396     MI.tieOperands(
 397         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
 398         ToUntie - (EndVAddr - 1));
 399   }
 400 }
 401
 402 // Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
     //
     // The AK forms are chosen when src2 is a non-inline immediate and one of
     // src0/src1 is a VGPR; the MK forms when src2 is a VGPR and one of src0/src1
     // is a non-inline immediate. MI is either rewritten in place or, when src0
     // and src1 must be exchanged, replaced by a newly built instruction.
 403 void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
 404   // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
 405   // there is no reason to try to shrink them.
 406   if (!ST->hasVOP3Literal())
 407     return;
 408
 409   // There is no advantage to doing this pre-RA.
 410   if (!MF->getProperties().hasProperty(
 411           MachineFunctionProperties::Property::NoVRegs))
 412     return;
 413
 414   if (TII->hasAnyModifiersSet(MI))
 415     return;
 416
 417   const unsigned Opcode = MI.getOpcode();
 418   MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
 419   MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
 420   MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
       // INSTRUCTION_LIST_END serves as the "no replacement chosen" sentinel.
 421   unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;
 422
       // Swap is assigned on every path that also assigns NewOpcode, so it has a
       // value whenever it is read below.
 423   bool Swap;
 424
 425   // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
 426   if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
 427     if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
 428       Swap = false;
 429     else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
 430       Swap = true;
 431     else
 432       return;
 433
 434     switch (Opcode) {
 435     default:
 436       llvm_unreachable("Unexpected mad/fma opcode!");
 437     case AMDGPU::V_MAD_F32_e64:
 438       NewOpcode = AMDGPU::V_MADAK_F32;
 439       break;
 440     case AMDGPU::V_FMA_F32_e64:
 441       NewOpcode = AMDGPU::V_FMAAK_F32;
 442       break;
 443     case AMDGPU::V_MAD_F16_e64:
 444       NewOpcode = AMDGPU::V_MADAK_F16;
 445       break;
 446     case AMDGPU::V_FMA_F16_e64:
 447     case AMDGPU::V_FMA_F16_gfx9_e64:
 448       NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
 449                                           : AMDGPU::V_FMAAK_F16;
 450       break;
 451     }
 452   }
 453
 454   // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
 455   if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
 456     if (Src1.isImm() && !TII->isInlineConstant(Src1))
 457       Swap = false;
 458     else if (Src0.isImm() && !TII->isInlineConstant(Src0))
 459       Swap = true;
 460     else
 461       return;
 462
 463     switch (Opcode) {
 464     default:
 465       llvm_unreachable("Unexpected mad/fma opcode!");
 466     case AMDGPU::V_MAD_F32_e64:
 467       NewOpcode = AMDGPU::V_MADMK_F32;
 468       break;
 469     case AMDGPU::V_FMA_F32_e64:
 470       NewOpcode = AMDGPU::V_FMAMK_F32;
 471       break;
 472     case AMDGPU::V_MAD_F16_e64:
 473       NewOpcode = AMDGPU::V_MADMK_F16;
 474       break;
 475     case AMDGPU::V_FMA_F16_e64:
 476     case AMDGPU::V_FMA_F16_gfx9_e64:
 477       NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
 478                                           : AMDGPU::V_FMAMK_F16;
 479       break;
 480     }
 481   }
 482
 483   if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
 484     return;
 485
       // True16 replacements additionally need every register to be encodable.
 486   if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
 487     return;
 488
 489   if (Swap) {
 490     // Swap Src0 and Src1 by building a new instruction.
         // Src0/Src1/Src2 refer to operands of MI, so MI is erased only after
         // they have been added to the new instruction.
 491     BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
 492             MI.getOperand(0).getReg())
 493         .add(Src1)
 494         .add(Src0)
 495         .add(Src2)
 496         .setMIFlags(MI.getFlags());
 497     MI.eraseFromParent();
 498   } else {
 499     TII->removeModOperands(MI);
 500     MI.setDesc(TII->get(NewOpcode));
 501   }
 502 }
 503
 504 /// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
 505 /// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
 506 /// If the inverse of the immediate is legal, use ANDN2, ORN2 or
 507 /// XNOR (as a ^ b == ~(a ^ ~b)).
     ///
     /// Only operand 2 is considered as the literal. When the destination is
     /// still a virtual register, only register allocation hints tying it to the
     /// source register are recorded and the instruction itself is left alone.
 508 /// \returns true if the caller should continue the machine function iterator
 509 bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
 510   unsigned Opc = MI.getOpcode();
 511   const MachineOperand *Dest = &MI.getOperand(0);
 512   MachineOperand *Src0 = &MI.getOperand(1);
 513   MachineOperand *Src1 = &MI.getOperand(2);
 514   MachineOperand *SrcReg = Src0;
 515   MachineOperand *SrcImm = Src1;
 516
 517   if (!SrcImm->isImm() ||
 518       AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
 519     return false;
 520
 521   uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
       // NewImm stays 0 when no replacement opcode is selected below.
 522   uint32_t NewImm = 0;
 523
 524   if (Opc == AMDGPU::S_AND_B32) {
         // A mask with exactly one zero bit clears that bit; its index is the
         // number of trailing one bits.
 525     if (isPowerOf2_32(~Imm)) {
 526       NewImm = llvm::countr_one(Imm);
 527       Opc = AMDGPU::S_BITSET0_B32;
 528     } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
 529       NewImm = ~Imm;
 530       Opc = AMDGPU::S_ANDN2_B32;
 531     }
 532   } else if (Opc == AMDGPU::S_OR_B32) {
         // A mask with exactly one set bit sets that bit; its index is the number
         // of trailing zero bits.
 533     if (isPowerOf2_32(Imm)) {
 534       NewImm = llvm::countr_zero(Imm);
 535       Opc = AMDGPU::S_BITSET1_B32;
 536     } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
 537       NewImm = ~Imm;
 538       Opc = AMDGPU::S_ORN2_B32;
 539     }
 540   } else if (Opc == AMDGPU::S_XOR_B32) {
 541     if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
 542       NewImm = ~Imm;
 543       Opc = AMDGPU::S_XNOR_B32;
 544     }
 545   } else {
 546     llvm_unreachable("unexpected opcode");
 547   }
 548
 549   if (NewImm != 0) {
         // Virtual destination: only hint that dest and source should share a
         // register, and report that to the caller.
 550     if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
 551       MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
 552       MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
 553       return true;
 554     }
 555
         // The rewrite itself is done only when source and destination are the
         // same register.
 556     if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
           // Capture the flags before operand 1 is overwritten below.
 557       const bool IsUndef = SrcReg->isUndef();
 558       const bool IsKill = SrcReg->isKill();
 559       MI.setDesc(TII->get(Opc));
 560       if (Opc == AMDGPU::S_BITSET0_B32 ||
 561           Opc == AMDGPU::S_BITSET1_B32) {
             // Operand 1 becomes the bit index; operand 2 becomes the register
             // input tied to the destination.
 562         Src0->ChangeToImmediate(NewImm);
 563         // Remove the immediate and add the tied input.
 564         MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
 565                                           /*isImp*/ false, IsKill,
 566                                           /*isDead*/ false, IsUndef);
 567         MI.tieOperands(0, 2);
 568       } else {
 569         SrcImm->setImm(NewImm);
 570       }
 571     }
 572   }
 573
 574   return false;
 575 }
 576
 577 // Sub-register aware counterpart of MachineInstr::readsRegister and
 578 // modifiesRegister: true if any register operand in R touches Reg/SubReg.
 579 bool SIShrinkInstructions::instAccessReg(
 580     iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
 581     unsigned SubReg) const {
 582   for (const MachineOperand &Opnd : R) {
 583     if (!Opnd.isReg())
 584       continue;
 585
 586     const Register OpReg = Opnd.getReg();
 587     bool Hit = false;
 588     if (Reg.isPhysical() && OpReg.isPhysical())
 589       Hit = TRI->regsOverlap(Reg, OpReg);
 590     else if (OpReg == Reg && Reg.isVirtual())
 591       Hit = (TRI->getSubRegIndexLaneMask(SubReg) &
 592              TRI->getSubRegIndexLaneMask(Opnd.getSubReg())).any();
 593     if (Hit)
 594       return true;
 595   }
 596   return false;
 597 }
 598
 599 bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
 600                                         unsigned SubReg) const {
       // Query only the operands exposed by MI->uses().
 601   return instAccessReg(MI->uses(), Register(Reg), SubReg);
 602 }
 603
 604 bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
 605                                            unsigned SubReg) const {
       // Query only the operands exposed by MI->defs().
 606   return instAccessReg(MI->defs(), Register(Reg), SubReg);
 607 }
 608
 609 TargetInstrInfo::RegSubRegPair
 610 SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
 611                                         unsigned I) const {
 612   // A 32-bit register is returned unchanged, whatever I is.
 613   if (TRI->getRegSizeInBits(Reg, *MRI) == 32)
 614     return TargetInstrInfo::RegSubRegPair(Reg, Sub);
 615   if (Reg.isPhysical())
 616     Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
 617   else
 618     Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
 619   return TargetInstrInfo::RegSubRegPair(Reg, Sub);
 620 }
 621
 622 void SIShrinkInstructions::dropInstructionKeepingImpDefs(
 623     MachineInstr &MI) const {
 624   // Visit only the operands that lie beyond MI's instruction descriptor.
 625   const MCInstrDesc &Desc = MI.getDesc();
 626   unsigned OpIdx = Desc.getNumOperands() + Desc.implicit_uses().size() +
 627                    Desc.implicit_defs().size();
 628   for (const unsigned NumOps = MI.getNumOperands(); OpIdx != NumOps; ++OpIdx) {
 629     const MachineOperand &Extra = MI.getOperand(OpIdx);
 630     if (Extra.isDef())
 631       BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
 632               TII->get(AMDGPU::IMPLICIT_DEF), Extra.getReg());
 633   }
 634
 635   // Every extra def now has an IMPLICIT_DEF placed before MI; remove MI.
 636   MI.eraseFromParent();
 637 }
 638
 639 // Match:
 640 // mov t, x
 641 // mov x, y
 642 // mov y, t
 643 //
 644 // =>
 645 //
 646 // mov t, x (t is potentially dead and move eliminated)
 647 // v_swap_b32 x, y
 648 //
 649 // Returns a pointer to the next valid instruction if it was able to create
     // v_swap_b32, and nullptr otherwise.
 650 //
 651 // This shall not be done too early not to prevent possible folding which may
 652 // remove matched moves, and this should preferably be done before RA to
 653 // release saved registers and also possibly after RA which can insert copies
 654 // too.
 655 //
 656 // This is really just a generic peephole that is not a canonical shrinking,
 657 // although requirements match the pass placement and it reduces code size too.
     //
     // MovT must be a V_MOV_B32_e32 or COPY (asserted). The search for "mov y, t"
     // looks at no more than SearchLimit following instructions and stops after
     // the instruction that kills t.
 658 MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
 659   assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
 660          MovT.getOpcode() == AMDGPU::COPY);
 661
 662   Register T = MovT.getOperand(0).getReg();
 663   unsigned Tsub = MovT.getOperand(0).getSubReg();
 664   MachineOperand &Xop = MovT.getOperand(1);
 665
 666   if (!Xop.isReg())
 667     return nullptr;
 668   Register X = Xop.getReg();
 669   unsigned Xsub = Xop.getSubReg();
 670
       // Number of V_SWAP_B32 instructions emitted below.
       // NOTE(review): presumably getOpSize() is in bytes, making this a dword
       // count - confirm.
 671   unsigned Size = TII->getOpSize(MovT, 0) / 4;
 672
 673   if (!TRI->isVGPR(*MRI, X))
 674     return nullptr;
 675
 676   const unsigned SearchLimit = 16;
 677   unsigned Count = 0;
 678   bool KilledT = false;
 679   for (auto Iter = std::next(MovT.getIterator()),
 680             E = MovT.getParent()->instr_end();
 681        Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {
 682
 683     MachineInstr *MovY = &*Iter;
         // KilledT is tested by the loop condition, so the killing instruction
         // itself is still examined as a candidate.
 684     KilledT = MovY->killsRegister(T, TRI);
 685
         // Candidate for "mov y, t": a mov/copy whose source is exactly T/Tsub.
 686     if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
 687          MovY->getOpcode() != AMDGPU::COPY) ||
 688         !MovY->getOperand(1).isReg()        ||
 689         MovY->getOperand(1).getReg() != T   ||
 690         MovY->getOperand(1).getSubReg() != Tsub)
 691       continue;
 692
 693     Register Y = MovY->getOperand(0).getReg();
 694     unsigned Ysub = MovY->getOperand(0).getSubReg();
 695
 696     if (!TRI->isVGPR(*MRI, Y))
 697       continue;
 698
         // Scan the instructions strictly between MovT and MovY for the single
         // "mov x, y".
 699     MachineInstr *MovX = nullptr;
 700     for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
 701          I != IY; ++I) {
           // Reject if anything reads X, writes Y or T, or writes X again after
           // the candidate MovX.
 702       if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
 703           instModifiesReg(&*I, T, Tsub) ||
 704           (MovX && instModifiesReg(&*I, X, Xsub))) {
 705         MovX = nullptr;
 706         break;
 707       }
           // Instructions that do not read Y are skipped, unless they write X
           // before MovX has been found.
 708       if (!instReadsReg(&*I, Y, Ysub)) {
 709         if (!MovX && instModifiesReg(&*I, X, Xsub)) {
 710           MovX = nullptr;
 711           break;
 712         }
 713         continue;
 714       }
           // A reader of Y must be the only one and must be a mov/copy that
           // writes exactly X/Xsub.
 715       if (MovX ||
 716           (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
 717            I->getOpcode() != AMDGPU::COPY) ||
 718           I->getOperand(0).getReg() != X ||
 719           I->getOperand(0).getSubReg() != Xsub) {
 720         MovX = nullptr;
 721         break;
 722       }
 723
           // For multi-part moves, do not record a candidate that carries more
           // implicit operands than 0 (COPY) or 1 (otherwise).
 724       if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
 725         continue;
 726
 727       MovX = &*I;
 728     }
 729
 730     if (!MovX)
 731       continue;
 732
 733     LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);
 734
         // Emit one V_SWAP_B32 per 32-bit piece, inserted before MovX.
 735     for (unsigned I = 0; I < Size; ++I) {
 736       TargetInstrInfo::RegSubRegPair X1, Y1;
 737       X1 = getSubRegForIndex(X, Xsub, I);
 738       Y1 = getSubRegForIndex(Y, Ysub, I);
 739       MachineBasicBlock &MBB = *MovT.getParent();
 740       auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
 741                          TII->get(AMDGPU::V_SWAP_B32))
 742         .addDef(X1.Reg, 0, X1.SubReg)
 743         .addDef(Y1.Reg, 0, Y1.SubReg)
 744         .addReg(Y1.Reg, 0, Y1.SubReg)
 745         .addReg(X1.Reg, 0, X1.SubReg).getInstr();
 746       if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
 747         // Drop implicit EXEC.
             // It is replaced by a copy of all of MovX's implicit operands.
 748         MIB->removeOperand(MIB->getNumExplicitOperands());
 749         MIB->copyImplicitOps(*MBB.getParent(), *MovX);
 750       }
 751     }
 752     MovX->eraseFromParent();
 753     dropInstructionKeepingImpDefs(*MovY);
         // Computed after MovX and MovY are gone and before MovT may be dropped.
 754     MachineInstr *Next = &*std::next(MovT.getIterator());
 755
 756     if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
 757       dropInstructionKeepingImpDefs(MovT);
 758     } else {
           // MovT stays, but X is now used by the swap, so MovT must no longer
           // mark X (or an overlapping implicit operand) as killed.
 759       Xop.setIsKill(false);
 760       for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I ) {
 761         unsigned OpNo = MovT.getNumExplicitOperands() + I;
 762         const MachineOperand &Op = MovT.getOperand(OpNo);
 763         if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
 764           MovT.removeOperand(OpNo);
 765       }
 766     }
 767
 768     return Next;
 769   }
 770
 771   return nullptr;
 772 }
 773
 774 // Redirect an sdst with no non-debug uses to the null SGPR; true on change.
 775 bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
 776   // Gated on the subtarget reporting hasGFX10_3Insts().
 777   if (!ST->hasGFX10_3Insts())
 778     return false;
 779   MachineOperand *const SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
 780   if (SDst == nullptr)
 781     return false;
 782
 783   const Register OldReg = SDst->getReg();
 784   const bool IsDeadVReg = !OldReg.isPhysical() && MRI->use_nodbg_empty(OldReg);
 785   if (IsDeadVReg)
 786     SDst->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
 787   return IsDeadVReg;
 788 }
 789
 790 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
 791   if (skipFunction(MF.getFunction()))
 792     return false;
 793
 794   this->MF = &MF;
 795   MRI = &MF.getRegInfo();
 796   ST = &MF.getSubtarget<GCNSubtarget>();
 797   TII = ST->getInstrInfo();
 798   TRI = &TII->getRegisterInfo();
 799
 800   unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
 801
 802   std::vector<unsigned> I1Defs;
 803
 804   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
 805                                                   BI != BE; ++BI) {
 806
 807     MachineBasicBlock &MBB = *BI;
 808     MachineBasicBlock::iterator I, Next;
 809     for (I = MBB.begin(); I != MBB.end(); I = Next) {
 810       Next = std::next(I);
 811       MachineInstr &MI = *I;
 812
 813       if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
 814         // If this has a literal constant source that is the same as the
 815         // reversed bits of an inline immediate, replace with a bitreverse of
 816         // that constant. This saves 4 bytes in the common case of materializing
 817         // sign bits.
 818
 819         // Test if we are after regalloc. We only want to do this after any
 820         // optimizations happen because this will confuse them.
 821         // XXX - not exactly a check for post-regalloc run.
 822         MachineOperand &Src = MI.getOperand(1);
 823         if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
 824           int32_t ModImm;
 825           unsigned ModOpcode =
 826               canModifyToInlineImmOp32(TII, Src, ModImm, /*Scalar=*/false);
 827           if (ModOpcode != 0) {
 828             MI.setDesc(TII->get(ModOpcode));
 829             Src.setImm(static_cast<int64_t>(ModImm));
 830             continue;
 831           }
 832         }
 833       }
 834
 835       if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
 836                             MI.getOpcode() == AMDGPU::COPY)) {
 837         if (auto *NextMI = matchSwap(MI)) {
 838           Next = NextMI->getIterator();
 839           continue;
 840         }
 841       }
 842
 843       // Try to use S_ADDK_I32 and S_MULK_I32.
 844       if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
 845           MI.getOpcode() == AMDGPU::S_MUL_I32) {
 846         const MachineOperand *Dest = &MI.getOperand(0);
 847         MachineOperand *Src0 = &MI.getOperand(1);
 848         MachineOperand *Src1 = &MI.getOperand(2);
 849
 850         if (!Src0->isReg() && Src1->isReg()) {
 851           if (TII->commuteInstruction(MI, false, 1, 2))
 852             std::swap(Src0, Src1);
 853         }
 854
 855         // FIXME: This could work better if hints worked with subregisters. If
 856         // we have a vector add of a constant, we usually don't get the correct
 857         // allocation due to the subregister usage.
 858         if (Dest->getReg().isVirtual() && Src0->isReg()) {
 859           MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
 860           MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
 861           continue;
 862         }
 863
 864         if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
 865           if (Src1->isImm() && isKImmOperand(*Src1)) {
 866             unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
 867               AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
 868
 869             Src1->setImm(SignExtend64(Src1->getImm(), 32));
 870             MI.setDesc(TII->get(Opc));
 871             MI.tieOperands(0, 1);
 872           }
 873         }
 874       }
 875
 876       // Try to use s_cmpk_*
 877       if (MI.isCompare() && TII->isSOPC(MI)) {
 878         shrinkScalarCompare(MI);
 879         continue;
 880       }
 881
 882       // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
 883       if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
 884         const MachineOperand &Dst = MI.getOperand(0);
 885         MachineOperand &Src = MI.getOperand(1);
 886
 887         if (Src.isImm() && Dst.getReg().isPhysical()) {
 888           unsigned ModOpc;
 889           int32_t ModImm;
 890           if (isKImmOperand(Src)) {
 891             MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
 892             Src.setImm(SignExtend64(Src.getImm(), 32));
 893           } else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm,
 894                                                         /*Scalar=*/true))) {
 895             MI.setDesc(TII->get(ModOpc));
 896             Src.setImm(static_cast<int64_t>(ModImm));
 897           }
 898         }
 899
 900         continue;
 901       }
 902
 903       // Shrink scalar logic operations.
 904       if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
 905           MI.getOpcode() == AMDGPU::S_OR_B32 ||
 906           MI.getOpcode() == AMDGPU::S_XOR_B32) {
 907         if (shrinkScalarLogicOp(MI))
 908           continue;
 909       }
 910
 911       if (TII->isMIMG(MI.getOpcode()) &&
 912           ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
 913           MF.getProperties().hasProperty(
 914               MachineFunctionProperties::Property::NoVRegs)) {
 915         shrinkMIMG(MI);
 916         continue;
 917       }
 918
 919       if (!TII->isVOP3(MI))
 920         continue;
 921
 922       if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
 923           MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
 924           MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
 925           MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
 926           MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) {
 927         shrinkMadFma(MI);
 928         continue;
 929       }
 930
 931       if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
 932         // If there is no chance we will shrink it and use VCC as sdst to get
 933         // a 32 bit form try to replace dead sdst with NULL.
 934         tryReplaceDeadSDST(MI);
 935         continue;
 936       }
 937
 938       if (!TII->canShrink(MI, *MRI)) {
 939         // Try commuting the instruction and see if that enables us to shrink
 940         // it.
 941         if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
 942             !TII->canShrink(MI, *MRI)) {
 943           tryReplaceDeadSDST(MI);
 944           continue;
 945         }
 946       }
 947
 948       int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
 949
 950       if (TII->isVOPC(Op32)) {
 951         MachineOperand &Op0 = MI.getOperand(0);
 952         if (Op0.isReg()) {
 953           // Exclude VOPCX instructions as these don't explicitly write a
 954           // dst.
 955           Register DstReg = Op0.getReg();
 956           if (DstReg.isVirtual()) {
 957             // VOPC instructions can only write to the VCC register. We can't
 958             // force them to use VCC here, because this is only one register and
 959             // cannot deal with sequences which would require multiple copies of
 960             // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
 961             //
 962             // So, instead of forcing the instruction to write to VCC, we
 963             // provide a hint to the register allocator to use VCC and then we
 964             // will run this pass again after RA and shrink it if it outputs to
 965             // VCC.
 966             MRI->setRegAllocationHint(DstReg, 0, VCCReg);
 967             continue;
 968           }
 969           if (DstReg != VCCReg)
 970             continue;
 971         }
 972       }
 973
 974       if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
 975         // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
 976         // instructions.
 977         const MachineOperand *Src2 =
 978             TII->getNamedOperand(MI, AMDGPU::OpName::src2);
 979         if (!Src2->isReg())
 980           continue;
 981         Register SReg = Src2->getReg();
 982         if (SReg.isVirtual()) {
 983           MRI->setRegAllocationHint(SReg, 0, VCCReg);
 984           continue;
 985         }
 986         if (SReg != VCCReg)
 987           continue;
 988       }
 989
 990       // Check for the bool flag output for instructions like V_ADD_I32_e64.
 991       const MachineOperand *SDst = TII->getNamedOperand(MI,
 992                                                         AMDGPU::OpName::sdst);
 993
 994       if (SDst) {
 995         bool Next = false;
 996
 997         if (SDst->getReg() != VCCReg) {
 998           if (SDst->getReg().isVirtual())
 999             MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
1000           Next = true;
1001         }
1002
1003         // All of the instructions with carry outs also have an SGPR input in
1004         // src2.
1005         const MachineOperand *Src2 = TII->getNamedOperand(MI,
1006                                                           AMDGPU::OpName::src2);
1007         if (Src2 && Src2->getReg() != VCCReg) {
1008           if (Src2->getReg().isVirtual())
1009             MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
1010           Next = true;
1011         }
1012
1013         if (Next)
1014           continue;
1015       }
1016
1017       // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
1018       // fold an immediate into the shrunk instruction as a literal operand. In
1019       // GFX10 VOP3 instructions can take a literal operand anyway, so there is
1020       // no advantage to doing this.
1021       if (ST->hasVOP3Literal() &&
1022           !MF.getProperties().hasProperty(
1023               MachineFunctionProperties::Property::NoVRegs))
1024         continue;
1025
1026       if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
1027           !shouldShrinkTrue16(MI))
1028         continue;
1029
1030       // We can shrink this instruction
1031       LLVM_DEBUG(dbgs() << "Shrinking " << MI);
1032
1033       MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
1034       ++NumInstructionsShrunk;
1035
1036       // Copy extra operands not present in the instruction definition.
1037       copyExtraImplicitOps(*Inst32, MI);
1038
1039       // Copy deadness from the old explicit vcc def to the new implicit def.
1040       if (SDst && SDst->isDead())
1041         Inst32->findRegisterDefOperand(VCCReg, /*TRI=*/nullptr)->setIsDead();
1042
1043       MI.eraseFromParent();
1044       foldImmediates(*Inst32);
1045
1046       LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
1047     }
1048   }
1049   return false;
1050 }