lib/Target/AMDGPU/SILowerControlFlow.cpp

   1 //===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 /// \file
  10 /// This pass lowers the pseudo control flow instructions to real
  11 /// machine instructions.
  12 ///
  13 /// All control flow is handled using predicated instructions and
  14 /// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
  15 /// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
  16 /// by writting to the 64-bit EXEC register (each bit corresponds to a
  17 /// single vector ALU).  Typically, for predicates, a vector ALU will write
  18 /// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
  19 /// Vector ALU) and then the ScalarALU will AND the VCC register with the
  20 /// EXEC to update the predicates.
  21 ///
  22 /// For example:
  23 /// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2
  24 /// %sgpr0 = SI_IF %vcc
  25 ///   %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0
  26 /// %sgpr0 = SI_ELSE %sgpr0
  27 ///   %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0
  28 /// SI_END_CF %sgpr0
  29 ///
  30 /// becomes:
  31 ///
  32 /// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc  // Save and update the exec mask
  33 /// %sgpr0 = S_XOR_B64 %sgpr0, %exec  // Clear live bits from saved exec mask
  34 /// S_CBRANCH_EXECZ label0            // This instruction is an optional
  35 ///                                   // optimization which allows us to
  36 ///                                   // branch if all the bits of
  37 ///                                   // EXEC are zero.
  38 /// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
  39 ///
  40 /// label0:
  41 /// %sgpr0 = S_OR_SAVEEXEC_B64 %exec   // Restore the exec mask for the Then block
  42 /// %exec = S_XOR_B64 %sgpr0, %exec    // Clear live bits from saved exec mask
  43 /// S_BRANCH_EXECZ label1              // Use our branch optimization
  44 ///                                    // instruction again.
  45 /// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr   // Do the THEN block
  46 /// label1:
  47 /// %exec = S_OR_B64 %exec, %sgpr0     // Re-enable saved exec mask bits
  48 //===----------------------------------------------------------------------===//
  49
  50 #include "AMDGPU.h"
  51 #include "AMDGPUSubtarget.h"
  52 #include "SIInstrInfo.h"
  53 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  54 #include "llvm/ADT/SmallVector.h"
  55 #include "llvm/ADT/StringRef.h"
  56 #include "llvm/CodeGen/LiveIntervals.h"
  57 #include "llvm/CodeGen/MachineBasicBlock.h"
  58 #include "llvm/CodeGen/MachineFunction.h"
  59 #include "llvm/CodeGen/MachineFunctionPass.h"
  60 #include "llvm/CodeGen/MachineInstr.h"
  61 #include "llvm/CodeGen/MachineInstrBuilder.h"
  62 #include "llvm/CodeGen/MachineOperand.h"
  63 #include "llvm/CodeGen/MachineRegisterInfo.h"
  64 #include "llvm/CodeGen/Passes.h"
  65 #include "llvm/CodeGen/SlotIndexes.h"
  66 #include "llvm/CodeGen/TargetRegisterInfo.h"
  67 #include "llvm/MC/MCRegisterInfo.h"
  68 #include "llvm/Pass.h"
  69 #include <cassert>
  70 #include <iterator>
  71
  72 using namespace llvm;
  73
  74 #define DEBUG_TYPE "si-lower-control-flow"
  75
  76 namespace {
  77
  78 class SILowerControlFlow : public MachineFunctionPass {
  79 private:
  80   const SIRegisterInfo *TRI = nullptr;
  81   const SIInstrInfo *TII = nullptr;
  82   LiveIntervals *LIS = nullptr;
  83   MachineRegisterInfo *MRI = nullptr;
  84
  85   void emitIf(MachineInstr &MI);
  86   void emitElse(MachineInstr &MI);
  87   void emitIfBreak(MachineInstr &MI);
  88   void emitLoop(MachineInstr &MI);
  89   void emitEndCf(MachineInstr &MI);
  90
  91   void findMaskOperands(MachineInstr &MI, unsigned OpNo,
  92                         SmallVectorImpl<MachineOperand> &Src) const;
  93
  94   void combineMasks(MachineInstr &MI);
  95
  96 public:
  97   static char ID;
  98
  99   SILowerControlFlow() : MachineFunctionPass(ID) {}
 100
 101   bool runOnMachineFunction(MachineFunction &MF) override;
 102
 103   StringRef getPassName() const override {
 104     return "SI Lower control flow pseudo instructions";
 105   }
 106
 107   void getAnalysisUsage(AnalysisUsage &AU) const override {
 108     // Should preserve the same set that TwoAddressInstructions does.
 109     AU.addPreserved<SlotIndexes>();
 110     AU.addPreserved<LiveIntervals>();
 111     AU.addPreservedID(LiveVariablesID);
 112     AU.addPreservedID(MachineLoopInfoID);
 113     AU.addPreservedID(MachineDominatorsID);
 114     AU.setPreservesCFG();
 115     MachineFunctionPass::getAnalysisUsage(AU);
 116   }
 117 };
 118
 119 } // end anonymous namespace
 120
 121 char SILowerControlFlow::ID = 0;
 122
 123 INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
 124                "SI lower control flow", false, false)
 125
 126 static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
 127   MachineOperand &ImpDefSCC = MI.getOperand(3);
 128   assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
 129
 130   ImpDefSCC.setIsDead(IsDead);
 131 }
 132
 133 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 134
 135 static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
 136                        const SIInstrInfo *TII) {
 137   unsigned SaveExecReg = MI.getOperand(0).getReg();
 138   auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
 139
 140   if (U == MRI->use_instr_nodbg_end() ||
 141       std::next(U) != MRI->use_instr_nodbg_end() ||
 142       U->getOpcode() != AMDGPU::SI_END_CF)
 143     return false;
 144
 145   // Check for SI_KILL_*_TERMINATOR on path from if to endif.
 146   // if there is any such terminator simplififcations are not safe.
 147   auto SMBB = MI.getParent();
 148   auto EMBB = U->getParent();
 149   DenseSet<const MachineBasicBlock*> Visited;
 150   SmallVector<MachineBasicBlock*, 4> Worklist(SMBB->succ_begin(),
 151                                               SMBB->succ_end());
 152
 153   while (!Worklist.empty()) {
 154     MachineBasicBlock *MBB = Worklist.pop_back_val();
 155
 156     if (MBB == EMBB || !Visited.insert(MBB).second)
 157       continue;
 158     for(auto &Term : MBB->terminators())
 159       if (TII->isKillTerminator(Term.getOpcode()))
 160         return false;
 161
 162     Worklist.append(MBB->succ_begin(), MBB->succ_end());
 163   }
 164
 165   return true;
 166 }
 167
 168 void SILowerControlFlow::emitIf(MachineInstr &MI) {
 169   MachineBasicBlock &MBB = *MI.getParent();
 170   const DebugLoc &DL = MI.getDebugLoc();
 171   MachineBasicBlock::iterator I(&MI);
 172
 173   MachineOperand &SaveExec = MI.getOperand(0);
 174   MachineOperand &Cond = MI.getOperand(1);
 175   assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
 176          Cond.getSubReg() == AMDGPU::NoSubRegister);
 177
 178   unsigned SaveExecReg = SaveExec.getReg();
 179
 180   MachineOperand &ImpDefSCC = MI.getOperand(4);
 181   assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
 182
 183   // If there is only one use of save exec register and that use is SI_END_CF,
 184   // we can optimize SI_IF by returning the full saved exec mask instead of
 185   // just cleared bits.
 186   bool SimpleIf = isSimpleIf(MI, MRI, TII);
 187
 188   // Add an implicit def of exec to discourage scheduling VALU after this which
 189   // will interfere with trying to form s_and_saveexec_b64 later.
 190   unsigned CopyReg = SimpleIf ? SaveExecReg
 191                        : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 192   MachineInstr *CopyExec =
 193     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
 194     .addReg(AMDGPU::EXEC)
 195     .addReg(AMDGPU::EXEC, RegState::ImplicitDefine);
 196
 197   unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 198
 199   MachineInstr *And =
 200     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp)
 201     .addReg(CopyReg)
 202     //.addReg(AMDGPU::EXEC)
 203     .addReg(Cond.getReg());
 204   setImpSCCDefDead(*And, true);
 205
 206   MachineInstr *Xor = nullptr;
 207   if (!SimpleIf) {
 208     Xor =
 209       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
 210       .addReg(Tmp)
 211       .addReg(CopyReg);
 212     setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
 213   }
 214
 215   // Use a copy that is a terminator to get correct spill code placement it with
 216   // fast regalloc.
 217   MachineInstr *SetExec =
 218     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC)
 219     .addReg(Tmp, RegState::Kill);
 220
 221   // Insert a pseudo terminator to help keep the verifier happy. This will also
 222   // be used later when inserting skips.
 223   MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
 224                             .add(MI.getOperand(2));
 225
 226   if (!LIS) {
 227     MI.eraseFromParent();
 228     return;
 229   }
 230
 231   LIS->InsertMachineInstrInMaps(*CopyExec);
 232
 233   // Replace with and so we don't need to fix the live interval for condition
 234   // register.
 235   LIS->ReplaceMachineInstrInMaps(MI, *And);
 236
 237   if (!SimpleIf)
 238     LIS->InsertMachineInstrInMaps(*Xor);
 239   LIS->InsertMachineInstrInMaps(*SetExec);
 240   LIS->InsertMachineInstrInMaps(*NewBr);
 241
 242   LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
 243   MI.eraseFromParent();
 244
 245   // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
 246   // hard to add another def here but I'm not sure how to correctly update the
 247   // valno.
 248   LIS->removeInterval(SaveExecReg);
 249   LIS->createAndComputeVirtRegInterval(SaveExecReg);
 250   LIS->createAndComputeVirtRegInterval(Tmp);
 251   if (!SimpleIf)
 252     LIS->createAndComputeVirtRegInterval(CopyReg);
 253 }
 254
 255 void SILowerControlFlow::emitElse(MachineInstr &MI) {
 256   MachineBasicBlock &MBB = *MI.getParent();
 257   const DebugLoc &DL = MI.getDebugLoc();
 258
 259   unsigned DstReg = MI.getOperand(0).getReg();
 260   assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
 261
 262   bool ExecModified = MI.getOperand(3).getImm() != 0;
 263   MachineBasicBlock::iterator Start = MBB.begin();
 264
 265   // We are running before TwoAddressInstructions, and si_else's operands are
 266   // tied. In order to correctly tie the registers, split this into a copy of
 267   // the src like it does.
 268   unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 269   MachineInstr *CopyExec =
 270     BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
 271       .add(MI.getOperand(1)); // Saved EXEC
 272
 273   // This must be inserted before phis and any spill code inserted before the
 274   // else.
 275   unsigned SaveReg = ExecModified ?
 276     MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass) : DstReg;
 277   MachineInstr *OrSaveExec =
 278     BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), SaveReg)
 279     .addReg(CopyReg);
 280
 281   MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
 282
 283   MachineBasicBlock::iterator ElsePt(MI);
 284
 285   if (ExecModified) {
 286     MachineInstr *And =
 287       BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
 288       .addReg(AMDGPU::EXEC)
 289       .addReg(SaveReg);
 290
 291     if (LIS)
 292       LIS->InsertMachineInstrInMaps(*And);
 293   }
 294
 295   MachineInstr *Xor =
 296     BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
 297     .addReg(AMDGPU::EXEC)
 298     .addReg(DstReg);
 299
 300   MachineInstr *Branch =
 301     BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
 302     .addMBB(DestBB);
 303
 304   if (!LIS) {
 305     MI.eraseFromParent();
 306     return;
 307   }
 308
 309   LIS->RemoveMachineInstrFromMaps(MI);
 310   MI.eraseFromParent();
 311
 312   LIS->InsertMachineInstrInMaps(*CopyExec);
 313   LIS->InsertMachineInstrInMaps(*OrSaveExec);
 314
 315   LIS->InsertMachineInstrInMaps(*Xor);
 316   LIS->InsertMachineInstrInMaps(*Branch);
 317
 318   // src reg is tied to dst reg.
 319   LIS->removeInterval(DstReg);
 320   LIS->createAndComputeVirtRegInterval(DstReg);
 321   LIS->createAndComputeVirtRegInterval(CopyReg);
 322   if (ExecModified)
 323     LIS->createAndComputeVirtRegInterval(SaveReg);
 324
 325   // Let this be recomputed.
 326   LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
 327 }
 328
 329 void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
 330   MachineBasicBlock &MBB = *MI.getParent();
 331   const DebugLoc &DL = MI.getDebugLoc();
 332   auto Dst = MI.getOperand(0).getReg();
 333
 334   // Skip ANDing with exec if the break condition is already masked by exec
 335   // because it is a V_CMP in the same basic block. (We know the break
 336   // condition operand was an i1 in IR, so if it is a VALU instruction it must
 337   // be one with a carry-out.)
 338   bool SkipAnding = false;
 339   if (MI.getOperand(1).isReg()) {
 340     if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
 341       SkipAnding = Def->getParent() == MI.getParent()
 342           && SIInstrInfo::isVALU(*Def);
 343     }
 344   }
 345
 346   // AND the break condition operand with exec, then OR that into the "loop
 347   // exit" mask.
 348   MachineInstr *And = nullptr, *Or = nullptr;
 349   if (!SkipAnding) {
 350     And = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
 351              .addReg(AMDGPU::EXEC)
 352              .add(MI.getOperand(1));
 353     Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
 354              .addReg(Dst)
 355              .add(MI.getOperand(2));
 356   } else
 357     Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
 358              .add(MI.getOperand(1))
 359              .add(MI.getOperand(2));
 360
 361   if (LIS) {
 362     if (And)
 363       LIS->InsertMachineInstrInMaps(*And);
 364     LIS->ReplaceMachineInstrInMaps(MI, *Or);
 365   }
 366
 367   MI.eraseFromParent();
 368 }
 369
 370 void SILowerControlFlow::emitLoop(MachineInstr &MI) {
 371   MachineBasicBlock &MBB = *MI.getParent();
 372   const DebugLoc &DL = MI.getDebugLoc();
 373
 374   MachineInstr *AndN2 =
 375       BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
 376           .addReg(AMDGPU::EXEC)
 377           .add(MI.getOperand(0));
 378
 379   MachineInstr *Branch =
 380       BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
 381           .add(MI.getOperand(1));
 382
 383   if (LIS) {
 384     LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
 385     LIS->InsertMachineInstrInMaps(*Branch);
 386   }
 387
 388   MI.eraseFromParent();
 389 }
 390
 391 void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
 392   MachineBasicBlock &MBB = *MI.getParent();
 393   const DebugLoc &DL = MI.getDebugLoc();
 394
 395   MachineBasicBlock::iterator InsPt = MBB.begin();
 396   MachineInstr *NewMI =
 397       BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
 398           .addReg(AMDGPU::EXEC)
 399           .add(MI.getOperand(0));
 400
 401   if (LIS)
 402     LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
 403
 404   MI.eraseFromParent();
 405
 406   if (LIS)
 407     LIS->handleMove(*NewMI);
 408 }
 409
 410 // Returns replace operands for a logical operation, either single result
 411 // for exec or two operands if source was another equivalent operation.
 412 void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
 413        SmallVectorImpl<MachineOperand> &Src) const {
 414   MachineOperand &Op = MI.getOperand(OpNo);
 415   if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
 416     Src.push_back(Op);
 417     return;
 418   }
 419
 420   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
 421   if (!Def || Def->getParent() != MI.getParent() ||
 422       !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
 423     return;
 424
 425   // Make sure we do not modify exec between def and use.
 426   // A copy with implcitly defined exec inserted earlier is an exclusion, it
 427   // does not really modify exec.
 428   for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
 429     if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
 430         !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
 431       return;
 432
 433   for (const auto &SrcOp : Def->explicit_operands())
 434     if (SrcOp.isReg() && SrcOp.isUse() &&
 435         (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
 436         SrcOp.getReg() == AMDGPU::EXEC))
 437       Src.push_back(SrcOp);
 438 }
 439
 440 // Search and combine pairs of equivalent instructions, like
 441 // S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
 442 // S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
 443 // One of the operands is exec mask.
 444 void SILowerControlFlow::combineMasks(MachineInstr &MI) {
 445   assert(MI.getNumExplicitOperands() == 3);
 446   SmallVector<MachineOperand, 4> Ops;
 447   unsigned OpToReplace = 1;
 448   findMaskOperands(MI, 1, Ops);
 449   if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
 450   findMaskOperands(MI, 2, Ops);
 451   if (Ops.size() != 3) return;
 452
 453   unsigned UniqueOpndIdx;
 454   if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
 455   else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
 456   else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
 457   else return;
 458
 459   unsigned Reg = MI.getOperand(OpToReplace).getReg();
 460   MI.RemoveOperand(OpToReplace);
 461   MI.addOperand(Ops[UniqueOpndIdx]);
 462   if (MRI->use_empty(Reg))
 463     MRI->getUniqueVRegDef(Reg)->eraseFromParent();
 464 }
 465
 466 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
 467   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 468   TII = ST.getInstrInfo();
 469   TRI = &TII->getRegisterInfo();
 470
 471   // This doesn't actually need LiveIntervals, but we can preserve them.
 472   LIS = getAnalysisIfAvailable<LiveIntervals>();
 473   MRI = &MF.getRegInfo();
 474
 475   MachineFunction::iterator NextBB;
 476   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
 477        BI != BE; BI = NextBB) {
 478     NextBB = std::next(BI);
 479     MachineBasicBlock &MBB = *BI;
 480
 481     MachineBasicBlock::iterator I, Next, Last;
 482
 483     for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
 484       Next = std::next(I);
 485       MachineInstr &MI = *I;
 486
 487       switch (MI.getOpcode()) {
 488       case AMDGPU::SI_IF:
 489         emitIf(MI);
 490         break;
 491
 492       case AMDGPU::SI_ELSE:
 493         emitElse(MI);
 494         break;
 495
 496       case AMDGPU::SI_IF_BREAK:
 497         emitIfBreak(MI);
 498         break;
 499
 500       case AMDGPU::SI_LOOP:
 501         emitLoop(MI);
 502         break;
 503
 504       case AMDGPU::SI_END_CF:
 505         emitEndCf(MI);
 506         break;
 507
 508       case AMDGPU::S_AND_B64:
 509       case AMDGPU::S_OR_B64:
 510         // Cleanup bit manipulations on exec mask
 511         combineMasks(MI);
 512         Last = I;
 513         continue;
 514
 515       default:
 516         Last = I;
 517         continue;
 518       }
 519
 520       // Replay newly inserted code to combine masks
 521       Next = (Last == MBB.end()) ? MBB.begin() : Last;
 522     }
 523   }
 524
 525   return true;
 526 }