//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires Whole Wavefront
/// Mode (WWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//

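// A conceptual sketch of the resulting code shape (wave64 shown; the actual
// opcodes vary with wave size and context, see toExact()/toWQM() below):
//
//   S_MOV_B64 LiveMask, EXEC          ; prolog: remember the truly live lanes
//   S_WQM_B64 EXEC, EXEC              ; enter WQM for derivative computation
//   IMAGE_SAMPLE ...                  ; needs helper lanes enabled
//   S_AND_B64 EXEC, EXEC, LiveMask    ; drop to Exact for the side effect
//   BUFFER_STORE_DWORD ...            ; must not execute in helper lanes
//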
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateWWM = 0x2,
  StateExact = 0x4,
};

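// The states form a bitmask: analysis can record that an instruction is
// satisfied by several states at once (e.g. StateExact | StateWQM for an
// indifferent instruction), which lets processBlock() defer EXEC switches
// until one specific state is actually required.
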
struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  if (PS.State & StateWQM)
    OS << "WQM";
  if (PS.State & StateWWM) {
    if (PS.State & StateWQM)
      OS << '|';
    OS << "WWM";
  }
  if (PS.State & StateExact) {
    if (PS.State & (StateWQM | StateWWM))
      OS << '|';
    OS << "Exact";
  }

  return OS;
}
#endif

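// Per-instruction analysis state: Needs holds the states this instruction
// must execute in, Disabled the states it must never execute in, and
// OutNeeds the states required somewhere on the paths following it.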
struct InstrInfo {
  char Needs = 0;
  char Disabled = 0;
  char OutNeeds = 0;
};

struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  CallingConv::ID CallingConv;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  bool requiresCorrectState(const MachineInstr &MI) const;

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SaveOrig);
  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SavedOrig);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);
  void lowerCopyInstrs();

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
#endif

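/// Add \p Flag to the needs of \p MI and, if that changes anything, queue
/// \p MI so the new requirement gets propagated further.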
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    Register Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!Register::isVirtualRegister(Reg)) {
      if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
        continue;

      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        // Since we're in machine SSA, we do not need to track physical
        // registers across basic blocks.
        if (Value->isPHIDef())
          continue;

        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                        Worklist);
      }

      continue;
    }

    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
      markInstruction(DefMI, Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::WWM) {
        // The WWM intrinsic doesn't make the same guarantee, and it also needs
        // to be executed in WQM or Exact so that its copy doesn't clobber
        // inactive lanes.
        markInstructionUses(MI, StateWWM, Worklist);
        GlobalFlags |= StateWWM;
        LowerToCopyInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateWWM;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            Register Reg = Inactive.getReg();
            if (Register::isVirtualRegister(Reg)) {
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                markInstruction(DefMI, StateWWM, Worklist);
            }
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateWWM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            Register Reg = MO.getReg();

            if (!Register::isVirtualRegister(Reg) &&
                TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing WWM, even if it does not require any
  // WQM transitions.
  if (II.Needs & StateWWM)
    BI.Needs |= StateWWM;
}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem>& Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

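/// Run the backwards dataflow to a fixed point: seed the worklist via
/// scanInstructions(), then keep re-propagating instructions and blocks
/// until no needs change anymore.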
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

/// Whether \p MI really requires the exec state computed during analysis.
///
/// Scalar instructions must occasionally be marked WQM for correct propagation
/// (e.g. thread masks leading up to branches), but when it comes to actual
/// execution, they don't care about EXEC.
bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
  if (MI.isTerminator())
    return true;

  // Skip instructions that are not affected by EXEC
  if (TII->isScalarUnit(MI))
    return false;

  // Generic instructions such as COPY will either disappear by register
  // coalescing or be lowered to SALU or VALU instructions.
  if (MI.isTransient()) {
    if (MI.getNumExplicitOperands() >= 1) {
      const MachineOperand &Op = MI.getOperand(0);
      if (Op.isReg()) {
        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
          // SGPR instructions are not affected by EXEC
          return false;
        }
      }
    }
  }

  return true;
}

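/// The EXEC manipulations inserted by this pass clobber SCC, so when SCC is
/// live across the chosen insertion point, preserve it in a scratch SGPR with
/// a copy/restore pair and return an iterator pointing at the restore.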
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      SlotIndex Next = S->end.getNextIndex().getBaseIndex();
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

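// Switch from WQM back to the exact live mask. If the WQM exec value is
// needed again later, SaveWQM names a register that captures it via
// S_AND_SAVEEXEC; otherwise EXEC is simply ANDed with the live mask in place.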
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
                   AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64),
                 SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
                   AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
                 Exec)
             .addReg(Exec)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  MachineInstr *MI;

  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
                   AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
                 Exec)
             .addReg(Exec);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SaveOrig) {
  MachineInstr *MI;

  assert(SaveOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
           .addImm(-1);
  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SavedOrig) {
  MachineInstr *MI;

  assert(SavedOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
               ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
           .addReg(SavedOrig);
  LIS->InsertMachineInstrInMaps(*MI);
}

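// Walk one block, tracking the current state (Exact, WQM or WWM) and the
// earliest point at which a mode switch is still safe (FirstWQM/FirstWWM),
// and insert the minimal transitions that satisfy each instruction's needs.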
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
    return;

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  unsigned SavedWQMReg = 0;
  unsigned SavedNonWWMReg = 0;
  bool WQMFromExec = isEntry;
  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonWWMState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (isEntry)
    ++II; // Skip the instruction that saves LiveMask

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from WWM to
  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
  // switch to/from WQM as well.
  MachineBasicBlock::iterator FirstWWM = IE;
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // WWM is disabled by default
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstWWM == IE)
      FirstWWM = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (requiresCorrectState(MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateWWM)
            Needs = StateWWM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave WWM enabled.
        Needs = StateExact | StateWQM | StateWWM;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
        MI.getOperand(3).setImm(1);

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateWWM || Needs == StateWWM) {
        // We must switch to or from WWM
        First = FirstWWM;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM
        First = FirstWQM;
      }

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM,
                           Needs == StateExact || WQMFromExec);

      if (State == StateWWM) {
        assert(SavedNonWWMReg);
        fromWWM(MBB, Before, SavedNonWWMReg);
        State = NonWWMState;
      }

      if (Needs == StateWWM) {
        NonWWMState = State;
        SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
        toWWM(MBB, Before, SavedNonWWMReg);
        State = StateWWM;
      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM))
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);

          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from WWM to a non-WWM state that
          // already matches our needs, but we shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateWWM)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstWWM = IE;
    }

    if (II == IE)
      break;
    II = Next;
  }
}

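// Replace each SI_PS_LIVE pseudo with a copy of the live mask saved in the
// prolog (or of EXEC itself when no separate mask was needed).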
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    Register Dest = MI->getOperand(0).getReg();
    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

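// Lower the WQM/SOFT_WQM/WWM marker pseudos collected during scanning into
// plain copies, or into V_MOVs for VGPR destinations, which also take the
// implicit EXEC use that every VALU instruction carries.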
void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToCopyInstrs) {
    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
      MI->RemoveOperand(i);

    const Register Reg = MI->getOperand(0).getReg();

    if (TRI->isVGPR(*MRI, Reg)) {
      const TargetRegisterClass *regClass = Register::isVirtualRegister(Reg)
                                                ? MRI->getRegClass(Reg)
                                                : TRI->getPhysRegClass(Reg);

      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // And make it implicitly depend on exec (like all VALU movs should do).
      MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    } else {
      MI->setDesc(TII->get(AMDGPU::COPY));
    }
  }
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  CallingConv = MF.getFunction().getCallingConv();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  char GlobalFlags = analyzeFunction(MF);
  unsigned LiveMaskReg = 0;
  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(Exec);
    if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty())
      return !LiveMaskQueries.empty();
  } else {
    // Store a copy of the original live mask when required
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(Exec);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    lowerLiveMaskQueries(LiveMaskReg);

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ?
                AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
              Exec)
          .addReg(Exec);

      lowerCopyInstrs();
      // EntryMI may become invalid here
      return true;
    }
  }

  LLVM_DEBUG(printInfo());

  lowerCopyInstrs();

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

  return true;
}