1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
13 /// The "strict" prefix indicates that inactive lanes do not take part in
14 /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
15 /// always be enabled irrespective of control flow decisions. Conversely in
16 /// non-strict WQM inactive lanes may be affected by control flow decisions.
17 ///
18 /// Whole quad mode is required for derivative computations, but it interferes
19 /// with shader side effects (stores and atomics). This pass ensures that WQM
20 /// is enabled when necessary, but disabled around stores and atomics.
21 ///
22 /// When necessary, this pass creates a function prolog
23 ///
24 /// S_MOV_B64 LiveMask, EXEC
25 /// S_WQM_B64 EXEC, EXEC
26 ///
27 /// to enter WQM at the top of the function and surrounds blocks of Exact
28 /// instructions by
29 ///
30 /// S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 /// ...
32 /// S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 /// S_OR_SAVEEXEC_B64 Tmp, -1
38 /// ...
39 /// S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 /// S_MOV_B64 Tmp, EXEC
46 /// S_WQM_B64 EXEC, EXEC
47 /// ...
48 /// S_MOV_B64 EXEC, Tmp
49 ///
50 /// In order to avoid excessive switching during sequences of Exact
51 /// instructions, the pass first analyzes which instructions must be run in WQM
52 /// (aka which instructions produce values that lead to derivative
53 /// computations).
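/// For example, a plain V_MUL whose result is consumed only as the coordinate
/// of an image sample must itself run in WQM, since the sample reads it from
/// helper lanes when computing derivatives.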
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 /// (1) at the top level (outside of control flow statements, and as long as
60 /// kill hasn't been used), one SGPR can be saved by recovering WQM from
61 /// the LiveMask (this is implemented for the entry block).
62 ///
63 /// (2) when entire regions (e.g. if-else blocks or entire loops) only
64 /// consist of exact and don't-care instructions, the switch only has to
65 /// be done at the entry and exit points rather than potentially in each
66 /// block of the region.
67 ///
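/// As a rough, hypothetical end-to-end illustration (not taken from real
/// compiler output), a pixel shader that samples a texture and then stores the
/// result could be rewritten along the lines of:
///
///    S_MOV_B64 LiveMask, EXEC          ; save the originally live lanes
///    S_WQM_B64 EXEC, EXEC              ; enter WQM for the sample
///    IMAGE_SAMPLE ...                  ; needs helper lanes for derivatives
///    S_AND_B64 EXEC, EXEC, LiveMask    ; return to Exact for the side effect
///    BUFFER_STORE_DWORD ...
///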
68 //===----------------------------------------------------------------------===//
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
85 using namespace llvm;
87 #define DEBUG_TYPE "si-wqm"
89 namespace {
91 enum {
92 StateWQM = 0x1,
93 StateStrictWWM = 0x2,
94 StateStrictWQM = 0x4,
95 StateExact = 0x8,
96 StateStrict = StateStrictWWM | StateStrictWQM,
99 struct PrintState {
100 public:
101 int State;
103 explicit PrintState(int State) : State(State) {}
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
109 static const std::pair<char, const char *> Mapping[] = {
110 std::make_pair(StateWQM, "WQM"),
111 std::make_pair(StateStrictWWM, "StrictWWM"),
112 std::make_pair(StateStrictWQM, "StrictWQM"),
113 std::make_pair(StateExact, "Exact")};
114 char State = PS.State;
115 for (auto M : Mapping) {
116 if (State & M.first) {
117 OS << M.second;
118 State &= ~M.first;
120 if (State)
121 OS << '|';
124 assert(State == 0);
125 return OS;
127 #endif
129 struct InstrInfo {
130 char Needs = 0;
131 char Disabled = 0;
132 char OutNeeds = 0;
135 struct BlockInfo {
136 char Needs = 0;
137 char InNeeds = 0;
138 char OutNeeds = 0;
139 char InitialState = 0;
140 bool NeedsLowering = false;
143 struct WorkItem {
144 MachineBasicBlock *MBB = nullptr;
145 MachineInstr *MI = nullptr;
147 WorkItem() = default;
148 WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
149 WorkItem(MachineInstr *MI) : MI(MI) {}
152 class SIWholeQuadMode : public MachineFunctionPass {
153 private:
154 const SIInstrInfo *TII;
155 const SIRegisterInfo *TRI;
156 const GCNSubtarget *ST;
157 MachineRegisterInfo *MRI;
158 LiveIntervals *LIS;
159 MachineDominatorTree *MDT;
160 MachinePostDominatorTree *PDT;
162 unsigned AndOpc;
163 unsigned AndN2Opc;
164 unsigned XorOpc;
165 unsigned AndSaveExecOpc;
166 unsigned OrSaveExecOpc;
167 unsigned WQMOpc;
168 Register Exec;
169 Register LiveMaskReg;
171 DenseMap<const MachineInstr *, InstrInfo> Instructions;
172 MapVector<MachineBasicBlock *, BlockInfo> Blocks;
174 // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
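// (e.g. the S_AND/S_WQM/ENTER_STRICT_* instruction inserted for a mode switch
// is recorded here so that lowerBlock() can track the active mode while
// walking a block)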
175 DenseMap<const MachineInstr *, char> StateTransition;
177 SmallVector<MachineInstr *, 2> LiveMaskQueries;
178 SmallVector<MachineInstr *, 4> LowerToMovInstrs;
179 SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
180 SmallVector<MachineInstr *, 4> KillInstrs;
182 void printInfo();
184 void markInstruction(MachineInstr &MI, char Flag,
185 std::vector<WorkItem> &Worklist);
186 void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
187 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
188 void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
189 std::vector<WorkItem> &Worklist);
190 void markInstructionUses(const MachineInstr &MI, char Flag,
191 std::vector<WorkItem> &Worklist);
192 char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
193 void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
194 void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
195 char analyzeFunction(MachineFunction &MF);
197 MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
198 MachineBasicBlock::iterator Before);
199 MachineBasicBlock::iterator
200 prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
201 MachineBasicBlock::iterator Last, bool PreferLast,
202 bool SaveSCC);
203 void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
204 Register SaveWQM);
205 void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
206 Register SavedWQM);
207 void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
208 Register SaveOrig, char StrictStateNeeded);
209 void fromStrictMode(MachineBasicBlock &MBB,
210 MachineBasicBlock::iterator Before, Register SavedOrig,
211 char NonStrictState, char CurrentStrictState);
213 MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
215 MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
216 bool IsWQM);
217 MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
219 void lowerBlock(MachineBasicBlock &MBB);
220 void processBlock(MachineBasicBlock &MBB, bool IsEntry);
222 void lowerLiveMaskQueries();
223 void lowerCopyInstrs();
224 void lowerKillInstrs(bool IsWQM);
226 public:
227 static char ID;
229 SIWholeQuadMode() :
230 MachineFunctionPass(ID) { }
232 bool runOnMachineFunction(MachineFunction &MF) override;
234 StringRef getPassName() const override { return "SI Whole Quad Mode"; }
236 void getAnalysisUsage(AnalysisUsage &AU) const override {
237 AU.addRequired<LiveIntervals>();
238 AU.addPreserved<SlotIndexes>();
239 AU.addPreserved<LiveIntervals>();
240 AU.addRequired<MachineDominatorTree>();
241 AU.addPreserved<MachineDominatorTree>();
242 AU.addRequired<MachinePostDominatorTree>();
243 AU.addPreserved<MachinePostDominatorTree>();
244 MachineFunctionPass::getAnalysisUsage(AU);
247 MachineFunctionProperties getClearedProperties() const override {
248 return MachineFunctionProperties().set(
249 MachineFunctionProperties::Property::IsSSA);
253 } // end anonymous namespace
255 char SIWholeQuadMode::ID = 0;
257 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
258 false)
259 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
260 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
261 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
262 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
263 false)
265 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
267 FunctionPass *llvm::createSIWholeQuadModePass() {
268 return new SIWholeQuadMode;
271 #ifndef NDEBUG
272 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
273 for (const auto &BII : Blocks) {
274 dbgs() << "\n"
275 << printMBBReference(*BII.first) << ":\n"
276 << " InNeeds = " << PrintState(BII.second.InNeeds)
277 << ", Needs = " << PrintState(BII.second.Needs)
278 << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
280 for (const MachineInstr &MI : *BII.first) {
281 auto III = Instructions.find(&MI);
282 if (III == Instructions.end())
283 continue;
285 dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
286 << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
290 #endif
292 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
293 std::vector<WorkItem> &Worklist) {
294 InstrInfo &II = Instructions[&MI];
296 assert(!(Flag & StateExact) && Flag != 0);
298 // Remove any disabled states from the flag. The user that required it gets
299 // an undefined value in the helper lanes. For example, this can happen if
300 // the result of an atomic is used by an instruction that requires WQM, where
301 // ignoring the request for WQM is correct as per the relevant specs.
302 Flag &= ~II.Disabled;
304 // Ignore if the flag is already encompassed by the existing needs, or we
305 // just disabled everything.
306 if ((II.Needs & Flag) == Flag)
307 return;
309 LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
310 II.Needs |= Flag;
311 Worklist.push_back(&MI);
314 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
315 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
316 Register Reg, unsigned SubReg, char Flag,
317 std::vector<WorkItem> &Worklist) {
318 LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
320 LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
321 const VNInfo *Value = UseLRQ.valueIn();
322 if (!Value)
323 return;
325 // Note: this code assumes that lane masks on AMDGPU completely
326 // cover registers.
327 const LaneBitmask UseLanes =
328 SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
329 : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
330 : LaneBitmask::getNone());
332 // Perform a depth-first iteration of the LiveRange graph marking defs.
333 // Stop processing of a given branch when all use lanes have been defined.
334 // The first definition stops processing for a physical register.
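// Illustrative example: if the use reads %reg.sub0_sub1, and sub1 is defined
// just above while sub0 arrives through a phi, the walk marks the sub1 def and
// then continues through the phi into each predecessor until every lane of the
// use has been covered.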
335 struct PhiEntry {
336 const VNInfo *Phi;
337 unsigned PredIdx;
338 LaneBitmask DefinedLanes;
340 PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
341 : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
343 using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
344 SmallVector<PhiEntry, 2> PhiStack;
345 SmallSet<VisitKey, 4> Visited;
346 LaneBitmask DefinedLanes;
347 unsigned NextPredIdx = 0; // Only used for processing phi nodes
348 do {
349 const VNInfo *NextValue = nullptr;
350 const VisitKey Key(Value, DefinedLanes);
352 if (!Visited.count(Key)) {
353 Visited.insert(Key);
354 // On the first visit to a phi, start processing its first predecessor
355 NextPredIdx = 0;
358 if (Value->isPHIDef()) {
359 // Each predecessor node in the phi must be processed as a subgraph
360 const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
361 assert(MBB && "Phi-def has no defining MBB");
363 // Find next predecessor to process
364 unsigned Idx = NextPredIdx;
365 auto PI = MBB->pred_begin() + Idx;
366 auto PE = MBB->pred_end();
367 for (; PI != PE && !NextValue; ++PI, ++Idx) {
368 if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
369 if (!Visited.count(VisitKey(VN, DefinedLanes)))
370 NextValue = VN;
374 // If there are more predecessors to process, add the phi to the stack
375 if (PI != PE)
376 PhiStack.emplace_back(Value, Idx, DefinedLanes);
377 } else {
378 MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
379 assert(MI && "Def has no defining instruction");
381 if (Reg.isVirtual()) {
382 // Iterate over all operands to find relevant definitions
383 bool HasDef = false;
384 for (const MachineOperand &Op : MI->operands()) {
385 if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
386 continue;
388 // Compute lanes defined and overlap with use
389 LaneBitmask OpLanes =
390 Op.isUndef() ? LaneBitmask::getAll()
391 : TRI->getSubRegIndexLaneMask(Op.getSubReg());
392 LaneBitmask Overlap = (UseLanes & OpLanes);
394 // Record whether this instruction defined any lanes of the use
395 HasDef |= Overlap.any();
397 // Mark any lanes defined
398 DefinedLanes |= OpLanes;
401 // Check if all lanes of use have been defined
402 if ((DefinedLanes & UseLanes) != UseLanes) {
403 // Definition not complete; need to process input value
404 LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
405 if (const VNInfo *VN = LRQ.valueIn()) {
406 if (!Visited.count(VisitKey(VN, DefinedLanes)))
407 NextValue = VN;
411 // Only mark the instruction if it defines some part of the use
412 if (HasDef)
413 markInstruction(*MI, Flag, Worklist);
414 } else {
415 // For physical registers simply mark the defining instruction
416 markInstruction(*MI, Flag, Worklist);
420 if (!NextValue && !PhiStack.empty()) {
421 // Reach end of chain; revert to processing last phi
422 PhiEntry &Entry = PhiStack.back();
423 NextValue = Entry.Phi;
424 NextPredIdx = Entry.PredIdx;
425 DefinedLanes = Entry.DefinedLanes;
426 PhiStack.pop_back();
429 Value = NextValue;
430 } while (Value);
433 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
434 const MachineOperand &Op, char Flag,
435 std::vector<WorkItem> &Worklist) {
436 assert(Op.isReg());
437 Register Reg = Op.getReg();
439 // Ignore some hardware registers
440 switch (Reg) {
441 case AMDGPU::EXEC:
442 case AMDGPU::EXEC_LO:
443 return;
444 default:
445 break;
448 LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
449 << " for " << MI);
450 if (Reg.isVirtual()) {
451 LiveRange &LR = LIS->getInterval(Reg);
452 markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
453 } else {
454 // Handle physical registers that we need to track; this is mostly relevant
455 // for VCC, which can appear as the (implicit) input of a uniform branch,
456 // e.g. when a loop counter is stored in a VGPR.
457 for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
458 ++RegUnit) {
459 LiveRange &LR = LIS->getRegUnit(*RegUnit);
460 const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
461 if (!Value)
462 continue;
464 markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
469 /// Mark all instructions defining the uses in \p MI with \p Flag.
470 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
471 std::vector<WorkItem> &Worklist) {
472 LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
473 << MI);
475 for (const MachineOperand &Use : MI.uses()) {
476 if (!Use.isReg() || !Use.isUse())
477 continue;
478 markOperand(MI, Use, Flag, Worklist);
482 // Scan instructions to determine which ones require an Exact execmask and
483 // which ones seed WQM requirements.
484 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
485 std::vector<WorkItem> &Worklist) {
486 char GlobalFlags = 0;
487 bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
488 SmallVector<MachineInstr *, 4> SetInactiveInstrs;
489 SmallVector<MachineInstr *, 4> SoftWQMInstrs;
491 // We need to visit the basic blocks in reverse post-order so that we visit
492 // defs before uses, in particular so that we don't accidentally mark an
493 // instruction as needing e.g. WQM before visiting it and realizing it needs
494 // WQM disabled.
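// A typical case is an atomic whose result feeds a sample: visiting the atomic
// first records its Disabled mask before the sample propagates a WQM request
// back to it.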
495 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
496 for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
497 MachineBasicBlock &MBB = **BI;
498 BlockInfo &BBI = Blocks[&MBB];
500 for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
501 MachineInstr &MI = *II;
502 InstrInfo &III = Instructions[&MI];
503 unsigned Opcode = MI.getOpcode();
504 char Flags = 0;
506 if (TII->isWQM(Opcode)) {
507 // If LOD is not supported WQM is not needed.
508 if (!ST->hasExtendedImageInsts())
509 continue;
510 // Sampling instructions don't need to produce results for all pixels
511 // in a quad, they just require all inputs of a quad to have been
512 // computed for derivatives.
513 markInstructionUses(MI, StateWQM, Worklist);
514 GlobalFlags |= StateWQM;
515 continue;
516 } else if (Opcode == AMDGPU::WQM) {
517 // The WQM intrinsic requires its output to have all the helper lanes
518 // correct, so we need it to be in WQM.
519 Flags = StateWQM;
520 LowerToCopyInstrs.push_back(&MI);
521 } else if (Opcode == AMDGPU::SOFT_WQM) {
522 LowerToCopyInstrs.push_back(&MI);
523 SoftWQMInstrs.push_back(&MI);
524 continue;
525 } else if (Opcode == AMDGPU::STRICT_WWM) {
526 // The STRICT_WWM intrinsic doesn't make the same guarantee, and in addition
527 // it needs to be executed in WQM or Exact so that its copy doesn't
528 // clobber inactive lanes.
529 markInstructionUses(MI, StateStrictWWM, Worklist);
530 GlobalFlags |= StateStrictWWM;
531 LowerToMovInstrs.push_back(&MI);
532 continue;
533 } else if (Opcode == AMDGPU::STRICT_WQM) {
534 // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
535 // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in
536 // quads that have at least one active thread.
537 markInstructionUses(MI, StateStrictWQM, Worklist);
538 GlobalFlags |= StateStrictWQM;
539 LowerToMovInstrs.push_back(&MI);
540 continue;
541 } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
542 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
543 III.Disabled = StateStrict;
544 MachineOperand &Inactive = MI.getOperand(2);
545 if (Inactive.isReg()) {
546 if (Inactive.isUndef()) {
547 LowerToCopyInstrs.push_back(&MI);
548 } else {
549 markOperand(MI, Inactive, StateStrictWWM, Worklist);
552 SetInactiveInstrs.push_back(&MI);
553 continue;
554 } else if (TII->isDisableWQM(MI)) {
555 BBI.Needs |= StateExact;
556 if (!(BBI.InNeeds & StateExact)) {
557 BBI.InNeeds |= StateExact;
558 Worklist.push_back(&MBB);
560 GlobalFlags |= StateExact;
561 III.Disabled = StateWQM | StateStrict;
562 continue;
563 } else {
564 if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
565 LiveMaskQueries.push_back(&MI);
566 } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
567 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
568 Opcode == AMDGPU::SI_DEMOTE_I1) {
569 KillInstrs.push_back(&MI);
570 BBI.NeedsLowering = true;
571 } else if (WQMOutputs) {
572 // The function is in machine SSA form, which means that physical
573 // VGPRs correspond to shader inputs and outputs. Inputs are
574 // only used, outputs are only defined.
575 // FIXME: is this still valid?
576 for (const MachineOperand &MO : MI.defs()) {
577 if (!MO.isReg())
578 continue;
580 Register Reg = MO.getReg();
582 if (!Reg.isVirtual() &&
583 TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
584 Flags = StateWQM;
585 break;
590 if (!Flags)
591 continue;
594 markInstruction(MI, Flags, Worklist);
595 GlobalFlags |= Flags;
599 // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
600 // ever used anywhere in the function. This implements the corresponding
601 // semantics of @llvm.amdgcn.set.inactive.
602 // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
603 if (GlobalFlags & StateWQM) {
604 for (MachineInstr *MI : SetInactiveInstrs)
605 markInstruction(*MI, StateWQM, Worklist);
606 for (MachineInstr *MI : SoftWQMInstrs)
607 markInstruction(*MI, StateWQM, Worklist);
610 return GlobalFlags;
613 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
614 std::vector<WorkItem>& Worklist) {
615 MachineBasicBlock *MBB = MI.getParent();
616 InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
617 BlockInfo &BI = Blocks[MBB];
619 // Control flow-type instructions and stores to temporary memory that are
620 // followed by WQM computations must themselves be in WQM.
621 if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
622 (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
623 Instructions[&MI].Needs = StateWQM;
624 II.Needs = StateWQM;
627 // Propagate to block level
628 if (II.Needs & StateWQM) {
629 BI.Needs |= StateWQM;
630 if (!(BI.InNeeds & StateWQM)) {
631 BI.InNeeds |= StateWQM;
632 Worklist.push_back(MBB);
636 // Propagate backwards within block
637 if (MachineInstr *PrevMI = MI.getPrevNode()) {
638 char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
639 if (!PrevMI->isPHI()) {
640 InstrInfo &PrevII = Instructions[PrevMI];
641 if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
642 PrevII.OutNeeds |= InNeeds;
643 Worklist.push_back(PrevMI);
648 // Propagate WQM flag to instruction inputs
649 assert(!(II.Needs & StateExact));
651 if (II.Needs != 0)
652 markInstructionUses(MI, II.Needs, Worklist);
654 // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
655 // not require any WQM transitions.
656 if (II.Needs & StateStrictWWM)
657 BI.Needs |= StateStrictWWM;
658 if (II.Needs & StateStrictWQM)
659 BI.Needs |= StateStrictWQM;
662 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
663 std::vector<WorkItem>& Worklist) {
664 BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
666 // Propagate through instructions
667 if (!MBB.empty()) {
668 MachineInstr *LastMI = &*MBB.rbegin();
669 InstrInfo &LastII = Instructions[LastMI];
670 if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
671 LastII.OutNeeds |= BI.OutNeeds;
672 Worklist.push_back(LastMI);
676 // Predecessor blocks must provide for our WQM/Exact needs.
677 for (MachineBasicBlock *Pred : MBB.predecessors()) {
678 BlockInfo &PredBI = Blocks[Pred];
679 if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
680 continue;
682 PredBI.OutNeeds |= BI.InNeeds;
683 PredBI.InNeeds |= BI.InNeeds;
684 Worklist.push_back(Pred);
687 // All successors must be prepared to accept the same set of WQM/Exact data.
688 for (MachineBasicBlock *Succ : MBB.successors()) {
689 BlockInfo &SuccBI = Blocks[Succ];
690 if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
691 continue;
693 SuccBI.InNeeds |= BI.OutNeeds;
694 Worklist.push_back(Succ);
698 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
699 std::vector<WorkItem> Worklist;
700 char GlobalFlags = scanInstructions(MF, Worklist);
702 while (!Worklist.empty()) {
703 WorkItem WI = Worklist.back();
704 Worklist.pop_back();
706 if (WI.MI)
707 propagateInstruction(*WI.MI, Worklist);
708 else
709 propagateBlock(*WI.MBB, Worklist);
712 return GlobalFlags;
715 MachineBasicBlock::iterator
716 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
717 MachineBasicBlock::iterator Before) {
718 Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
720 MachineInstr *Save =
721 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
722 .addReg(AMDGPU::SCC);
723 MachineInstr *Restore =
724 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
725 .addReg(SaveReg);
727 LIS->InsertMachineInstrInMaps(*Save);
728 LIS->InsertMachineInstrInMaps(*Restore);
729 LIS->createAndComputeVirtRegInterval(SaveReg);
731 return Restore;
734 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
735 MachineInstr *TermMI) {
736 LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
737 << *TermMI << "\n");
739 MachineBasicBlock *SplitBB =
740 BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
742 // Convert last instruction in block to a terminator.
743 // Note: this only covers the expected patterns
744 unsigned NewOpcode = 0;
745 switch (TermMI->getOpcode()) {
746 case AMDGPU::S_AND_B32:
747 NewOpcode = AMDGPU::S_AND_B32_term;
748 break;
749 case AMDGPU::S_AND_B64:
750 NewOpcode = AMDGPU::S_AND_B64_term;
751 break;
752 case AMDGPU::S_MOV_B32:
753 NewOpcode = AMDGPU::S_MOV_B32_term;
754 break;
755 case AMDGPU::S_MOV_B64:
756 NewOpcode = AMDGPU::S_MOV_B64_term;
757 break;
758 default:
759 break;
761 if (NewOpcode)
762 TermMI->setDesc(TII->get(NewOpcode));
764 if (SplitBB != BB) {
765 // Update dominator trees
766 using DomTreeT = DomTreeBase<MachineBasicBlock>;
767 SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
768 for (MachineBasicBlock *Succ : SplitBB->successors()) {
769 DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
770 DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
772 DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
773 if (MDT)
774 MDT->getBase().applyUpdates(DTUpdates);
775 if (PDT)
776 PDT->getBase().applyUpdates(DTUpdates);
778 // Link blocks
779 MachineInstr *MI =
780 BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
781 .addMBB(SplitBB);
782 LIS->InsertMachineInstrInMaps(*MI);
785 return SplitBB;
788 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
789 MachineInstr &MI) {
790 const DebugLoc &DL = MI.getDebugLoc();
791 unsigned Opcode = 0;
793 assert(MI.getOperand(0).isReg());
795 // Comparison is for live lanes; however here we compute the inverse
796 // (killed lanes). This is because VCMP will always generate 0 bits
797 // for inactive lanes so a mask of live lanes would not be correct
798 // inside control flow.
799 // Invert the comparison by swapping the operands and adjusting
800 // the comparison codes.
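// Illustrative example: if lanes stay live when "src0 SETULT src1" holds, the
// killed lanes satisfy the ordered comparison "src0 >= src1", which is emitted
// below as V_CMP_LE_F32 with the two source operands swapped.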
802 switch (MI.getOperand(2).getImm()) {
803 case ISD::SETUEQ:
804 Opcode = AMDGPU::V_CMP_LG_F32_e64;
805 break;
806 case ISD::SETUGT:
807 Opcode = AMDGPU::V_CMP_GE_F32_e64;
808 break;
809 case ISD::SETUGE:
810 Opcode = AMDGPU::V_CMP_GT_F32_e64;
811 break;
812 case ISD::SETULT:
813 Opcode = AMDGPU::V_CMP_LE_F32_e64;
814 break;
815 case ISD::SETULE:
816 Opcode = AMDGPU::V_CMP_LT_F32_e64;
817 break;
818 case ISD::SETUNE:
819 Opcode = AMDGPU::V_CMP_EQ_F32_e64;
820 break;
821 case ISD::SETO:
822 Opcode = AMDGPU::V_CMP_O_F32_e64;
823 break;
824 case ISD::SETUO:
825 Opcode = AMDGPU::V_CMP_U_F32_e64;
826 break;
827 case ISD::SETOEQ:
828 case ISD::SETEQ:
829 Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
830 break;
831 case ISD::SETOGT:
832 case ISD::SETGT:
833 Opcode = AMDGPU::V_CMP_NLT_F32_e64;
834 break;
835 case ISD::SETOGE:
836 case ISD::SETGE:
837 Opcode = AMDGPU::V_CMP_NLE_F32_e64;
838 break;
839 case ISD::SETOLT:
840 case ISD::SETLT:
841 Opcode = AMDGPU::V_CMP_NGT_F32_e64;
842 break;
843 case ISD::SETOLE:
844 case ISD::SETLE:
845 Opcode = AMDGPU::V_CMP_NGE_F32_e64;
846 break;
847 case ISD::SETONE:
848 case ISD::SETNE:
849 Opcode = AMDGPU::V_CMP_NLG_F32_e64;
850 break;
851 default:
852 llvm_unreachable("invalid ISD:SET cond code");
855 // Pick the e32 or e64 encoding depending on whether the condition operand is a VGPR.
856 MachineInstr *VcmpMI;
857 const MachineOperand &Op0 = MI.getOperand(0);
858 const MachineOperand &Op1 = MI.getOperand(1);
859 if (TRI->isVGPR(*MRI, Op0.getReg())) {
860 Opcode = AMDGPU::getVOPe32(Opcode);
861 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
862 } else {
863 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
864 .addReg(AMDGPU::VCC, RegState::Define)
865 .addImm(0) // src0 modifiers
866 .add(Op1)
867 .addImm(0) // src1 modifiers
868 .add(Op0)
869 .addImm(0); // omod
872 // VCC represents lanes killed.
873 Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
875 MachineInstr *MaskUpdateMI =
876 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
877 .addReg(LiveMaskReg)
878 .addReg(VCC);
880 // The state of SCC represents whether any lanes are live in the mask;
881 // if SCC is 0 then no lanes will be alive anymore.
882 MachineInstr *EarlyTermMI =
883 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
885 MachineInstr *ExecMaskMI =
886 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
888 assert(MBB.succ_size() == 1);
889 MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
890 .addMBB(*MBB.succ_begin());
892 // Update live intervals
893 LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
894 MBB.remove(&MI);
896 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
897 LIS->InsertMachineInstrInMaps(*ExecMaskMI);
898 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
899 LIS->InsertMachineInstrInMaps(*NewTerm);
901 return NewTerm;
904 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
905 MachineInstr &MI, bool IsWQM) {
906 const DebugLoc &DL = MI.getDebugLoc();
907 MachineInstr *MaskUpdateMI = nullptr;
909 const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
910 const MachineOperand &Op = MI.getOperand(0);
911 int64_t KillVal = MI.getOperand(1).getImm();
912 MachineInstr *ComputeKilledMaskMI = nullptr;
913 Register CndReg = !Op.isImm() ? Op.getReg() : Register();
914 Register TmpReg;
916 // Is this a static or dynamic kill?
917 if (Op.isImm()) {
918 if (Op.getImm() == KillVal) {
919 // Static: all active lanes are killed
920 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
921 .addReg(LiveMaskReg)
922 .addReg(Exec);
923 } else {
924 // Static: kill does nothing
925 MachineInstr *NewTerm = nullptr;
926 if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
927 LIS->RemoveMachineInstrFromMaps(MI);
928 } else {
929 assert(MBB.succ_size() == 1);
930 NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
931 .addMBB(*MBB.succ_begin());
932 LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
934 MBB.remove(&MI);
935 return NewTerm;
937 } else {
938 if (!KillVal) {
939 // Op represents live lanes after kill,
940 // so exec mask needs to be factored in.
941 TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
942 ComputeKilledMaskMI =
943 BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
944 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
945 .addReg(LiveMaskReg)
946 .addReg(TmpReg);
947 } else {
948 // Op represents lanes to kill
949 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
950 .addReg(LiveMaskReg)
951 .add(Op);
955 // The state of SCC represents whether any lanes are live in the mask;
956 // if SCC is 0 then no lanes will be alive anymore.
957 MachineInstr *EarlyTermMI =
958 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
960 // If we got this far, some lanes are still live;
961 // update EXEC to deactivate lanes as appropriate.
962 MachineInstr *NewTerm;
963 MachineInstr *WQMMaskMI = nullptr;
964 Register LiveMaskWQM;
965 if (IsDemote) {
966 // Demotes deactivate quads that contain only helper lanes
967 LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
968 WQMMaskMI =
969 BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
970 NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
971 .addReg(Exec)
972 .addReg(LiveMaskWQM);
973 } else {
974 // Kills deactivate lanes
975 if (Op.isImm()) {
976 unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
977 NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
978 } else if (!IsWQM) {
979 NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
980 .addReg(Exec)
981 .addReg(LiveMaskReg);
982 } else {
983 unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
984 NewTerm =
985 BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
989 // Update live intervals
990 LIS->RemoveMachineInstrFromMaps(MI);
991 MBB.remove(&MI);
992 assert(EarlyTermMI);
993 assert(MaskUpdateMI);
994 assert(NewTerm);
995 if (ComputeKilledMaskMI)
996 LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
997 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
998 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
999 if (WQMMaskMI)
1000 LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1001 LIS->InsertMachineInstrInMaps(*NewTerm);
1003 if (CndReg) {
1004 LIS->removeInterval(CndReg);
1005 LIS->createAndComputeVirtRegInterval(CndReg);
1007 if (TmpReg)
1008 LIS->createAndComputeVirtRegInterval(TmpReg);
1009 if (LiveMaskWQM)
1010 LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1012 return NewTerm;
1015 // Replace (or supplement) instructions accessing the live mask.
1016 // This can only happen once all the live mask registers have been created
1017 // and the execution state (WQM/StrictWWM/Exact) of instructions is known.
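// In practice this lowers kill and demote pseudo instructions, which may split
// the block after the newly created terminators.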
1018 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1019 auto BII = Blocks.find(&MBB);
1020 if (BII == Blocks.end())
1021 return;
1023 const BlockInfo &BI = BII->second;
1024 if (!BI.NeedsLowering)
1025 return;
1027 LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1029 SmallVector<MachineInstr *, 4> SplitPoints;
1030 char State = BI.InitialState;
1032 auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1033 while (II != IE) {
1034 auto Next = std::next(II);
1035 MachineInstr &MI = *II;
1037 if (StateTransition.count(&MI))
1038 State = StateTransition[&MI];
1040 MachineInstr *SplitPoint = nullptr;
1041 switch (MI.getOpcode()) {
1042 case AMDGPU::SI_DEMOTE_I1:
1043 case AMDGPU::SI_KILL_I1_TERMINATOR:
1044 SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1045 break;
1046 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1047 SplitPoint = lowerKillF32(MBB, MI);
1048 break;
1049 default:
1050 break;
1052 if (SplitPoint)
1053 SplitPoints.push_back(SplitPoint);
1055 II = Next;
1058 // Perform splitting after instruction scan to simplify iteration.
1059 if (!SplitPoints.empty()) {
1060 MachineBasicBlock *BB = &MBB;
1061 for (MachineInstr *MI : SplitPoints) {
1062 BB = splitBlock(BB, MI);
1067 // Return an iterator in the (inclusive) range [First, Last] at which
1068 // instructions can be safely inserted, keeping in mind that some of the
1069 // instructions we want to add necessarily clobber SCC.
1070 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1071 MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1072 MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1073 if (!SaveSCC)
1074 return PreferLast ? Last : First;
1076 LiveRange &LR =
1077 LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
1078 auto MBBE = MBB.end();
1079 SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1080 : LIS->getMBBEndIdx(&MBB);
1081 SlotIndex LastIdx =
1082 Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1083 SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1084 const LiveRange::Segment *S;
1086 for (;;) {
1087 S = LR.getSegmentContaining(Idx);
1088 if (!S)
1089 break;
1091 if (PreferLast) {
1092 SlotIndex Next = S->start.getBaseIndex();
1093 if (Next < FirstIdx)
1094 break;
1095 Idx = Next;
1096 } else {
1097 MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1098 assert(EndMI && "Segment does not end on valid instruction");
1099 auto NextI = std::next(EndMI->getIterator());
1100 if (NextI == MBB.end())
1101 break;
1102 SlotIndex Next = LIS->getInstructionIndex(*NextI);
1103 if (Next > LastIdx)
1104 break;
1105 Idx = Next;
1109 MachineBasicBlock::iterator MBBI;
1111 if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1112 MBBI = MI;
1113 else {
1114 assert(Idx == LIS->getMBBEndIdx(&MBB));
1115 MBBI = MBB.end();
1118 // Move insertion point past any operations modifying EXEC.
1119 // This assumes that the value of SCC defined by any of these operations
1120 // does not need to be preserved.
1121 while (MBBI != Last) {
1122 bool IsExecDef = false;
1123 for (const MachineOperand &MO : MBBI->operands()) {
1124 if (MO.isReg() && MO.isDef()) {
1125 IsExecDef |=
1126 MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1129 if (!IsExecDef)
1130 break;
1131 MBBI++;
1132 S = nullptr;
1135 if (S)
1136 MBBI = saveSCC(MBB, MBBI);
1138 return MBBI;
1141 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1142 MachineBasicBlock::iterator Before,
1143 Register SaveWQM) {
1144 MachineInstr *MI;
1146 if (SaveWQM) {
1147 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
1148 .addReg(LiveMaskReg);
1149 } else {
1150 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
1151 .addReg(Exec)
1152 .addReg(LiveMaskReg);
1155 LIS->InsertMachineInstrInMaps(*MI);
1156 StateTransition[MI] = StateExact;
1159 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1160 MachineBasicBlock::iterator Before,
1161 Register SavedWQM) {
1162 MachineInstr *MI;
1164 if (SavedWQM) {
1165 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1166 .addReg(SavedWQM);
1167 } else {
1168 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1171 LIS->InsertMachineInstrInMaps(*MI);
1172 StateTransition[MI] = StateWQM;
1175 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1176 MachineBasicBlock::iterator Before,
1177 Register SaveOrig, char StrictStateNeeded) {
1178 MachineInstr *MI;
1179 assert(SaveOrig);
1180 assert(StrictStateNeeded == StateStrictWWM ||
1181 StrictStateNeeded == StateStrictWQM);
1183 if (StrictStateNeeded == StateStrictWWM) {
1184 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1185 SaveOrig)
1186 .addImm(-1);
1187 } else {
1188 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1189 SaveOrig)
1190 .addImm(-1);
1192 LIS->InsertMachineInstrInMaps(*MI);
1193 StateTransition[MI] = StrictStateNeeded;
1196 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1197 MachineBasicBlock::iterator Before,
1198 Register SavedOrig, char NonStrictState,
1199 char CurrentStrictState) {
1200 MachineInstr *MI;
1202 assert(SavedOrig);
1203 assert(CurrentStrictState == StateStrictWWM ||
1204 CurrentStrictState == StateStrictWQM);
1206 if (CurrentStrictState == StateStrictWWM) {
1207 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1208 Exec)
1209 .addReg(SavedOrig);
1210 } else {
1211 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1212 Exec)
1213 .addReg(SavedOrig);
1215 LIS->InsertMachineInstrInMaps(*MI);
1216 StateTransition[MI] = NonStrictState;
1219 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1220 auto BII = Blocks.find(&MBB);
1221 if (BII == Blocks.end())
1222 return;
1224 BlockInfo &BI = BII->second;
1226 // This is a non-entry block that is WQM throughout, so no need to do
1227 // anything.
1228 if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1229 BI.InitialState = StateWQM;
1230 return;
1233 LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1234 << ":\n");
1236 Register SavedWQMReg;
1237 Register SavedNonStrictReg;
1238 bool WQMFromExec = IsEntry;
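// In the entry block the WQM mask can be recomputed from the live mask (which
// still equals the original EXEC), so no SavedWQMReg is needed there (see the
// note in the file header).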
1239 char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1240 char NonStrictState = 0;
1241 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1243 auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1244 if (IsEntry) {
1245 // Skip the instruction that saves LiveMask
1246 if (II != IE && II->getOpcode() == AMDGPU::COPY)
1247 ++II;
1250 // This stores the first instruction where it's safe to switch from WQM to
1251 // Exact or vice versa.
1252 MachineBasicBlock::iterator FirstWQM = IE;
1254 // This stores the first instruction where it's safe to switch from Strict
1255 // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1256 // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1257 // be safe to switch to/from WQM as well.
1258 MachineBasicBlock::iterator FirstStrict = IE;
1260 // Record initial state in block information.
1261 BI.InitialState = State;
1263 for (;;) {
1264 MachineBasicBlock::iterator Next = II;
1265 char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1266 char OutNeeds = 0;
1268 if (FirstWQM == IE)
1269 FirstWQM = II;
1271 if (FirstStrict == IE)
1272 FirstStrict = II;
1274 // First, figure out the allowed states (Needs) based on the propagated
1275 // flags.
1276 if (II != IE) {
1277 MachineInstr &MI = *II;
1279 if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1280 auto III = Instructions.find(&MI);
1281 if (III != Instructions.end()) {
1282 if (III->second.Needs & StateStrictWWM)
1283 Needs = StateStrictWWM;
1284 else if (III->second.Needs & StateStrictWQM)
1285 Needs = StateStrictWQM;
1286 else if (III->second.Needs & StateWQM)
1287 Needs = StateWQM;
1288 else
1289 Needs &= ~III->second.Disabled;
1290 OutNeeds = III->second.OutNeeds;
1292 } else {
1293 // If the instruction doesn't actually need a correct EXEC, then we can
1294 // safely leave Strict mode enabled.
1295 Needs = StateExact | StateWQM | StateStrict;
1298 if (MI.isTerminator() && OutNeeds == StateExact)
1299 Needs = StateExact;
1301 ++Next;
1302 } else {
1303 // End of basic block
1304 if (BI.OutNeeds & StateWQM)
1305 Needs = StateWQM;
1306 else if (BI.OutNeeds == StateExact)
1307 Needs = StateExact;
1308 else
1309 Needs = StateWQM | StateExact;
1312 // Now, transition if necessary.
1313 if (!(Needs & State)) {
1314 MachineBasicBlock::iterator First;
1315 if (State == StateStrictWWM || Needs == StateStrictWWM ||
1316 State == StateStrictWQM || Needs == StateStrictWQM) {
1317 // We must switch to or from Strict mode.
1318 First = FirstStrict;
1319 } else {
1320 // We only need to switch to/from WQM, so we can use FirstWQM.
1321 First = FirstWQM;
1324 // Whether we need to save SCC depends on start and end states.
1325 bool SaveSCC = false;
1326 switch (State) {
1327 case StateExact:
1328 case StateStrictWWM:
1329 case StateStrictWQM:
1330 // Exact/Strict -> Strict: save SCC
1331 // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1332 // Exact/Strict -> Exact: no save
1333 SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1334 break;
1335 case StateWQM:
1336 // WQM -> Exact/Strict: save SCC
1337 SaveSCC = !(Needs & StateWQM);
1338 break;
1339 default:
1340 llvm_unreachable("Unknown state");
1341 break;
1343 MachineBasicBlock::iterator Before =
1344 prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1346 if (State & StateStrict) {
1347 assert(State == StateStrictWWM || State == StateStrictWQM);
1348 assert(SavedNonStrictReg);
1349 fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1351 LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1352 SavedNonStrictReg = 0;
1353 State = NonStrictState;
1356 if (Needs & StateStrict) {
1357 NonStrictState = State;
1358 assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1359 assert(!SavedNonStrictReg);
1360 SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1362 toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1363 State = Needs;
1365 } else {
1366 if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
1367 if (!WQMFromExec && (OutNeeds & StateWQM)) {
1368 assert(!SavedWQMReg);
1369 SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1372 toExact(MBB, Before, SavedWQMReg);
1373 State = StateExact;
1374 } else if (State == StateExact && (Needs & StateWQM) &&
1375 !(Needs & StateExact)) {
1376 assert(WQMFromExec == (SavedWQMReg == 0));
1378 toWQM(MBB, Before, SavedWQMReg);
1380 if (SavedWQMReg) {
1381 LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1382 SavedWQMReg = 0;
1384 State = StateWQM;
1385 } else {
1386 // We can get here if we transitioned from StrictWWM to a
1387 // non-StrictWWM state that already matches our needs, so we
1388 // shouldn't need to do anything.
1389 assert(Needs & State);
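// If this instruction constrains the allowed states, later transitions must
// not be hoisted above it, so reset the candidate insertion points.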
1394 if (Needs != (StateExact | StateWQM | StateStrict)) {
1395 if (Needs != (StateExact | StateWQM))
1396 FirstWQM = IE;
1397 FirstStrict = IE;
1400 if (II == IE)
1401 break;
1403 II = Next;
1405 assert(!SavedWQMReg);
1406 assert(!SavedNonStrictReg);
1409 void SIWholeQuadMode::lowerLiveMaskQueries() {
1410 for (MachineInstr *MI : LiveMaskQueries) {
1411 const DebugLoc &DL = MI->getDebugLoc();
1412 Register Dest = MI->getOperand(0).getReg();
1414 MachineInstr *Copy =
1415 BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1416 .addReg(LiveMaskReg);
1418 LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1419 MI->eraseFromParent();
1423 void SIWholeQuadMode::lowerCopyInstrs() {
1424 for (MachineInstr *MI : LowerToMovInstrs) {
1425 assert(MI->getNumExplicitOperands() == 2);
1427 const Register Reg = MI->getOperand(0).getReg();
1428 const unsigned SubReg = MI->getOperand(0).getSubReg();
1430 if (TRI->isVGPR(*MRI, Reg)) {
1431 const TargetRegisterClass *regClass =
1432 Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
1433 if (SubReg)
1434 regClass = TRI->getSubRegClass(regClass, SubReg);
1436 const unsigned MovOp = TII->getMovOpcode(regClass);
1437 MI->setDesc(TII->get(MovOp));
1439 // Check that it already implicitly depends on exec (like all VALU movs
1440 // should do).
1441 assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1442 return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1443 }));
1444 } else {
1445 // Remove early-clobber and exec dependency from simple SGPR copies.
1446 // This allows some to be eliminated during/post RA.
1447 LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1448 if (MI->getOperand(0).isEarlyClobber()) {
1449 LIS->removeInterval(Reg);
1450 MI->getOperand(0).setIsEarlyClobber(false);
1451 LIS->createAndComputeVirtRegInterval(Reg);
1453 int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1454 while (Index >= 0) {
1455 MI->RemoveOperand(Index);
1456 Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1458 MI->setDesc(TII->get(AMDGPU::COPY));
1459 LLVM_DEBUG(dbgs() << " -> " << *MI);
1462 for (MachineInstr *MI : LowerToCopyInstrs) {
1463 if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
1464 MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
1465 assert(MI->getNumExplicitOperands() == 3);
1466 // The only reason we should be here is that V_SET_INACTIVE has
1467 // an undef input, so it is being replaced by a simple copy.
1468 // There should be a second undef source that we should remove.
1469 assert(MI->getOperand(2).isUndef());
1470 MI->RemoveOperand(2);
1471 MI->untieRegOperand(1);
1472 } else {
1473 assert(MI->getNumExplicitOperands() == 2);
1476 MI->setDesc(TII->get(AMDGPU::COPY));
1480 void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1481 for (MachineInstr *MI : KillInstrs) {
1482 MachineBasicBlock *MBB = MI->getParent();
1483 MachineInstr *SplitPoint = nullptr;
1484 switch (MI->getOpcode()) {
1485 case AMDGPU::SI_DEMOTE_I1:
1486 case AMDGPU::SI_KILL_I1_TERMINATOR:
1487 SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1488 break;
1489 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1490 SplitPoint = lowerKillF32(*MBB, *MI);
1491 break;
1492 default:
1493 continue;
1495 if (SplitPoint)
1496 splitBlock(MBB, SplitPoint);
1500 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1501 LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1502 << " ------------- \n");
1503 LLVM_DEBUG(MF.dump(););
1505 Instructions.clear();
1506 Blocks.clear();
1507 LiveMaskQueries.clear();
1508 LowerToCopyInstrs.clear();
1509 LowerToMovInstrs.clear();
1510 KillInstrs.clear();
1511 StateTransition.clear();
1513 ST = &MF.getSubtarget<GCNSubtarget>();
1515 TII = ST->getInstrInfo();
1516 TRI = &TII->getRegisterInfo();
1517 MRI = &MF.getRegInfo();
1518 LIS = &getAnalysis<LiveIntervals>();
1519 MDT = &getAnalysis<MachineDominatorTree>();
1520 PDT = &getAnalysis<MachinePostDominatorTree>();
1522 if (ST->isWave32()) {
1523 AndOpc = AMDGPU::S_AND_B32;
1524 AndN2Opc = AMDGPU::S_ANDN2_B32;
1525 XorOpc = AMDGPU::S_XOR_B32;
1526 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1527 OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
1528 WQMOpc = AMDGPU::S_WQM_B32;
1529 Exec = AMDGPU::EXEC_LO;
1530 } else {
1531 AndOpc = AMDGPU::S_AND_B64;
1532 AndN2Opc = AMDGPU::S_ANDN2_B64;
1533 XorOpc = AMDGPU::S_XOR_B64;
1534 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1535 OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
1536 WQMOpc = AMDGPU::S_WQM_B64;
1537 Exec = AMDGPU::EXEC;
1540 const char GlobalFlags = analyzeFunction(MF);
1541 const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
1543 LiveMaskReg = Exec;
1545 // Shader is simple and does not need any state changes or complex lowering
1546 if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
1547 LowerToMovInstrs.empty() && KillInstrs.empty()) {
1548 lowerLiveMaskQueries();
1549 return !LiveMaskQueries.empty();
1552 MachineBasicBlock &Entry = MF.front();
1553 MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
1555 // Store a copy of the original live mask when required
1556 if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
1557 LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1558 MachineInstr *MI =
1559 BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1560 .addReg(Exec);
1561 LIS->InsertMachineInstrInMaps(*MI);
1564 LLVM_DEBUG(printInfo());
1566 lowerLiveMaskQueries();
1567 lowerCopyInstrs();
1569 // Shader only needs WQM
1570 if (GlobalFlags == StateWQM) {
1571 auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1572 .addReg(Exec);
1573 LIS->InsertMachineInstrInMaps(*MI);
1574 lowerKillInstrs(true);
1575 } else {
1576 for (auto BII : Blocks)
1577 processBlock(*BII.first, BII.first == &Entry);
1578 // Lowering blocks causes block splitting so perform as a second pass.
1579 for (auto BII : Blocks)
1580 lowerBlock(*BII.first);
1583 // Compute live range for live mask
1584 if (LiveMaskReg != Exec)
1585 LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1587 // Physical registers like SCC aren't tracked by default anyway, so just
1588 // removing the ranges we computed is the simplest option for maintaining
1589 // the analysis results.
1590 LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
1592 // If we performed any kills then recompute EXEC
1593 if (!KillInstrs.empty())
1594 LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
1596 return true;