1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
13 /// The "strict" prefix indicates that inactive lanes do not take part in
14 /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
15 /// always be enabled irrespective of control flow decisions. Conversely in
16 /// non-strict WQM inactive lanes may be affected by control flow decisions.
17 ///
18 /// Whole quad mode is required for derivative computations, but it interferes
19 /// with shader side effects (stores and atomics). This pass ensures that WQM
20 /// is enabled when necessary, but disabled around stores and atomics.
21 ///
22 /// When necessary, this pass creates a function prolog
23 ///
24 /// S_MOV_B64 LiveMask, EXEC
25 /// S_WQM_B64 EXEC, EXEC
26 ///
27 /// to enter WQM at the top of the function and surrounds blocks of Exact
28 /// instructions by
29 ///
30 /// S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 /// ...
32 /// S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 /// S_OR_SAVEEXEC_B64 Tmp, -1
38 /// ...
39 /// S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 /// S_MOV_B64 Tmp, EXEC
46 /// S_WQM_B64 EXEC, EXEC
47 /// ...
48 /// S_MOV_B64 EXEC, Tmp
49 ///
50 /// In order to avoid excessive switching during sequences of Exact
51 /// instructions, the pass first analyzes which instructions must be run in WQM
52 /// (aka which instructions produce values that lead to derivative
53 /// computations).
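/// For example, a plain V_MUL whose result is consumed only as the coordinate
/// of an image sample must itself run in WQM, since the sample reads it from
/// helper lanes when computing derivatives.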
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 /// (1) at the top level (outside of control flow statements, and as long as
60 /// kill hasn't been used), one SGPR can be saved by recovering WQM from
61 /// the LiveMask (this is implemented for the entry block).
62 ///
63 /// (2) when entire regions (e.g. if-else blocks or entire loops) only
64 /// consist of exact and don't-care instructions, the switch only has to
65 /// be done at the entry and exit points rather than potentially in each
66 /// block of the region.
67 ///
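/// As a rough, hypothetical end-to-end illustration (not taken from real
/// compiler output), a pixel shader that samples a texture and then stores the
/// result could be rewritten along the lines of:
///
///    S_MOV_B64 LiveMask, EXEC          ; save the originally live lanes
///    S_WQM_B64 EXEC, EXEC              ; enter WQM for the sample
///    IMAGE_SAMPLE ...                  ; needs helper lanes for derivatives
///    S_AND_B64 EXEC, EXEC, LiveMask    ; return to Exact for the side effect
///    BUFFER_STORE_DWORD ...
///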
68 //===----------------------------------------------------------------------===//
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
85 using namespace llvm;
87 #define DEBUG_TYPE "si-wqm"
89 namespace {
91 enum {
92 StateWQM = 0x1,
93 StateStrictWWM = 0x2,
94 StateStrictWQM = 0x4,
95 StateExact = 0x8,
96 StateStrict = StateStrictWWM | StateStrictWQM,
99 struct PrintState {
100 public:
101 int State;
103 explicit PrintState(int State) : State(State) {}
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
109 static const std::pair<char, const char *> Mapping[] = {
110 std::make_pair(StateWQM, "WQM"),
111 std::make_pair(StateStrictWWM, "StrictWWM"),
112 std::make_pair(StateStrictWQM, "StrictWQM"),
113 std::make_pair(StateExact, "Exact")};
114 char State = PS.State;
115 for (auto M : Mapping) {
116 if (State & M.first) {
117 OS << M.second;
118 State &= ~M.first;
120 if (State)
121 OS << '|';
124 assert(State == 0);
125 return OS;
127 #endif
129 struct InstrInfo {
130 char Needs = 0;
131 char Disabled = 0;
132 char OutNeeds = 0;
135 struct BlockInfo {
136 char Needs = 0;
137 char InNeeds = 0;
138 char OutNeeds = 0;
139 char InitialState = 0;
140 bool NeedsLowering = false;
143 struct WorkItem {
144 MachineBasicBlock *MBB = nullptr;
145 MachineInstr *MI = nullptr;
147 WorkItem() = default;
148 WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
149 WorkItem(MachineInstr *MI) : MI(MI) {}
152 class SIWholeQuadMode : public MachineFunctionPass {
153 private:
154 const SIInstrInfo *TII;
155 const SIRegisterInfo *TRI;
156 const GCNSubtarget *ST;
157 MachineRegisterInfo *MRI;
158 LiveIntervals *LIS;
159 MachineDominatorTree *MDT;
160 MachinePostDominatorTree *PDT;
162 unsigned AndOpc;
163 unsigned AndN2Opc;
164 unsigned XorOpc;
165 unsigned AndSaveExecOpc;
166 unsigned OrSaveExecOpc;
167 unsigned WQMOpc;
168 Register Exec;
169 Register LiveMaskReg;
171 DenseMap<const MachineInstr *, InstrInfo> Instructions;
172 MapVector<MachineBasicBlock *, BlockInfo> Blocks;
174 // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
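// (e.g. the S_AND/S_WQM/ENTER_STRICT_* instruction inserted for a mode switch
// is recorded here so that lowerBlock() can track the active mode while
// walking a block)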
175 DenseMap<const MachineInstr *, char> StateTransition;
177 SmallVector<MachineInstr *, 2> LiveMaskQueries;
178 SmallVector<MachineInstr *, 4> LowerToMovInstrs;
179 SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
180 SmallVector<MachineInstr *, 4> KillInstrs;
182 void printInfo();
184 void markInstruction(MachineInstr &MI, char Flag,
185 std::vector<WorkItem> &Worklist);
186 void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
187 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
188 void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
189 std::vector<WorkItem> &Worklist);
190 void markInstructionUses(const MachineInstr &MI, char Flag,
191 std::vector<WorkItem> &Worklist);
192 char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
193 void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
194 void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
195 char analyzeFunction(MachineFunction &MF);
197 MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
198 MachineBasicBlock::iterator Before);
199 MachineBasicBlock::iterator
200 prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
201 MachineBasicBlock::iterator Last, bool PreferLast,
202 bool SaveSCC);
203 void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
204 Register SaveWQM);
205 void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
206 Register SavedWQM);
207 void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
208 Register SaveOrig, char StrictStateNeeded);
209 void fromStrictMode(MachineBasicBlock &MBB,
210 MachineBasicBlock::iterator Before, Register SavedOrig,
211 char NonStrictState, char CurrentStrictState);
213 MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
215 MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
216 bool IsWQM);
217 MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
219 void lowerBlock(MachineBasicBlock &MBB);
220 void processBlock(MachineBasicBlock &MBB, bool IsEntry);
222 void lowerLiveMaskQueries();
223 void lowerCopyInstrs();
224 void lowerKillInstrs(bool IsWQM);
226 public:
227 static char ID;
229 SIWholeQuadMode() :
230 MachineFunctionPass(ID) { }
232 bool runOnMachineFunction(MachineFunction &MF) override;
234 StringRef getPassName() const override { return "SI Whole Quad Mode"; }
236 void getAnalysisUsage(AnalysisUsage &AU) const override {
237 AU.addRequired<LiveIntervals>();
238 AU.addPreserved<SlotIndexes>();
239 AU.addPreserved<LiveIntervals>();
240 AU.addRequired<MachineDominatorTree>();
241 AU.addPreserved<MachineDominatorTree>();
242 AU.addRequired<MachinePostDominatorTree>();
243 AU.addPreserved<MachinePostDominatorTree>();
244 MachineFunctionPass::getAnalysisUsage(AU);
247 MachineFunctionProperties getClearedProperties() const override {
248 return MachineFunctionProperties().set(
249 MachineFunctionProperties::Property::IsSSA);
253 } // end anonymous namespace
255 char SIWholeQuadMode::ID = 0;
257 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
258 false)
259 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
260 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
261 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
262 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
263 false)
265 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
267 FunctionPass *llvm::createSIWholeQuadModePass() {
268 return new SIWholeQuadMode;
271 #ifndef NDEBUG
272 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
273 for (const auto &BII : Blocks) {
274 dbgs() << "\n"
275 << printMBBReference(*BII.first) << ":\n"
276 << " InNeeds = " << PrintState(BII.second.InNeeds)
277 << ", Needs = " << PrintState(BII.second.Needs)
278 << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
280 for (const MachineInstr &MI : *BII.first) {
281 auto III = Instructions.find(&MI);
282 if (III == Instructions.end())
283 continue;
285 dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
286 << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
290 #endif
292 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
293 std::vector<WorkItem> &Worklist) {
294 InstrInfo &II = Instructions[&MI];
296 assert(!(Flag & StateExact) && Flag != 0);
298 // Remove any disabled states from the flag. The user that required it gets
299 // an undefined value in the helper lanes. For example, this can happen if
300 // the result of an atomic is used by an instruction that requires WQM, where
301 // ignoring the request for WQM is correct as per the relevant specs.
302 Flag &= ~II.Disabled;
304 // Ignore if the flag is already encompassed by the existing needs, or we
305 // just disabled everything.
306 if ((II.Needs & Flag) == Flag)
307 return;
309 LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
310 II.Needs |= Flag;
311 Worklist.push_back(&MI);
314 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
315 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
316 Register Reg, unsigned SubReg, char Flag,
317 std::vector<WorkItem> &Worklist) {
318 LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
320 LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
321 const VNInfo *Value = UseLRQ.valueIn();
322 if (!Value)
323 return;
325 // Note: this code assumes that lane masks on AMDGPU completely
326 // cover registers.
327 const LaneBitmask UseLanes =
328 SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
329 : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
330 : LaneBitmask::getNone());
332 // Perform a depth-first iteration of the LiveRange graph marking defs.
333 // Stop processing of a given branch when all use lanes have been defined.
334 // The first definition stops processing for a physical register.
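// Illustrative example: if the use reads %reg.sub0_sub1, and sub1 is defined
// just above while sub0 arrives through a phi, the walk marks the sub1 def and
// then continues through the phi into each predecessor until every lane of the
// use has been covered.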
335 struct PhiEntry {
336 const VNInfo *Phi;
337 unsigned PredIdx;
338 LaneBitmask DefinedLanes;
340 PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
341 : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
343 using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
344 SmallVector<PhiEntry, 2> PhiStack;
345 SmallSet<VisitKey, 4> Visited;
346 LaneBitmask DefinedLanes;
347 unsigned NextPredIdx = 0; // Only used for processing phi nodes
348 do {
349 const VNInfo *NextValue = nullptr;
350 const VisitKey Key(Value, DefinedLanes);
352 if (!Visited.count(Key)) {
353 Visited.insert(Key);
354 // On the first visit to a phi, start processing its first predecessor
355 NextPredIdx = 0;
358 if (Value->isPHIDef()) {
359 // Each predecessor node in the phi must be processed as a subgraph
360 const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
361 assert(MBB && "Phi-def has no defining MBB");
363 // Find next predecessor to process
364 unsigned Idx = NextPredIdx;
365 auto PI = MBB->pred_begin() + Idx;
366 auto PE = MBB->pred_end();
367 for (; PI != PE && !NextValue; ++PI, ++Idx) {
368 if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
369 if (!Visited.count(VisitKey(VN, DefinedLanes)))
370 NextValue = VN;
374 // If there are more predecessors to process, add the phi to the stack
375 if (PI != PE)
376 PhiStack.emplace_back(Value, Idx, DefinedLanes);
377 } else {
378 MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
379 assert(MI && "Def has no defining instruction");
381 if (Reg.isVirtual()) {
382 // Iterate over all operands to find relevant definitions
383 bool HasDef = false;
384 for (const MachineOperand &Op : MI->operands()) {
385 if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
386 continue;
388 // Compute lanes defined and overlap with use
389 LaneBitmask OpLanes =
390 Op.isUndef() ? LaneBitmask::getAll()
391 : TRI->getSubRegIndexLaneMask(Op.getSubReg());
392 LaneBitmask Overlap = (UseLanes & OpLanes);
394 // Record whether this instruction defined any lanes of the use
395 HasDef |= Overlap.any();
397 // Mark any lanes defined
398 DefinedLanes |= OpLanes;
401 // Check if all lanes of use have been defined
402 if ((DefinedLanes & UseLanes) != UseLanes) {
403 // Definition not complete; need to process input value
404 LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
405 if (const VNInfo *VN = LRQ.valueIn()) {
406 if (!Visited.count(VisitKey(VN, DefinedLanes)))
407 NextValue = VN;
411 // Only mark the instruction if it defines some part of the use
412 if (HasDef)
413 markInstruction(*MI, Flag, Worklist);
414 } else {
415 // For physical registers simply mark the defining instruction
416 markInstruction(*MI, Flag, Worklist);
420 if (!NextValue && !PhiStack.empty()) {
421 // Reach end of chain; revert to processing last phi
422 PhiEntry &Entry = PhiStack.back();
423 NextValue = Entry.Phi;
424 NextPredIdx = Entry.PredIdx;
425 DefinedLanes = Entry.DefinedLanes;
426 PhiStack.pop_back();
429 Value = NextValue;
430 } while (Value);
433 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
434 const MachineOperand &Op, char Flag,
435 std::vector<WorkItem> &Worklist) {
436 assert(Op.isReg());
437 Register Reg = Op.getReg();
439 // Ignore some hardware registers
440 switch (Reg) {
441 case AMDGPU::EXEC:
442 case AMDGPU::EXEC_LO:
443 return;
444 default:
445 break;
448 LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
449 << " for " << MI);
450 if (Reg.isVirtual()) {
451 LiveRange &LR = LIS->getInterval(Reg);
452 markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
453 } else {
454 // Handle physical registers that we need to track; this is mostly relevant
455 // for VCC, which can appear as the (implicit) input of a uniform branch,
456 // e.g. when a loop counter is stored in a VGPR.
457 for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
458 ++RegUnit) {
459 LiveRange &LR = LIS->getRegUnit(*RegUnit);
460 const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
461 if (!Value)
462 continue;
464 markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
469 /// Mark all instructions defining the uses in \p MI with \p Flag.
470 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
471 std::vector<WorkItem> &Worklist) {
472 LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
473 << MI);
475 for (const MachineOperand &Use : MI.uses()) {
476 if (!Use.isReg() || !Use.isUse())
477 continue;
478 markOperand(MI, Use, Flag, Worklist);
482 // Scan instructions to determine which ones require an Exact execmask and
483 // which ones seed WQM requirements.
484 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
485 std::vector<WorkItem> &Worklist) {
486 char GlobalFlags = 0;
487 bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
488 SmallVector<MachineInstr *, 4> SetInactiveInstrs;
489 SmallVector<MachineInstr *, 4> SoftWQMInstrs;
491 // We need to visit the basic blocks in reverse post-order so that we visit
492 // defs before uses, in particular so that we don't accidentally mark an
493 // instruction as needing e.g. WQM before visiting it and realizing it needs
494 // WQM disabled.
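// A typical case is an atomic whose result feeds a sample: visiting the atomic
// first records its Disabled mask before the sample propagates a WQM request
// back to it.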
495 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
496 for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
497 MachineBasicBlock &MBB = **BI;
498 BlockInfo &BBI = Blocks[&MBB];
500 for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
501 MachineInstr &MI = *II;
502 InstrInfo &III = Instructions[&MI];
503 unsigned Opcode = MI.getOpcode();
504 char Flags = 0;
506 if (TII->isWQM(Opcode)) {
507 // If LOD is not supported WQM is not needed.
508 if (!ST->hasExtendedImageInsts())
509 continue;
510 // Sampling instructions don't need to produce results for all pixels
511 // in a quad, they just require all inputs of a quad to have been
512 // computed for derivatives.
513 markInstructionUses(MI, StateWQM, Worklist);
514 GlobalFlags |= StateWQM;
515 continue;
516 } else if (Opcode == AMDGPU::WQM) {
517 // The WQM intrinsic requires its output to have all the helper lanes
518 // correct, so we need it to be in WQM.
519 Flags = StateWQM;
520 LowerToCopyInstrs.push_back(&MI);
521 } else if (Opcode == AMDGPU::SOFT_WQM) {
522 LowerToCopyInstrs.push_back(&MI);
523 SoftWQMInstrs.push_back(&MI);
524 continue;
525 } else if (Opcode == AMDGPU::STRICT_WWM) {
526 // The STRICT_WWM intrinsic doesn't make the same guarantee, and in addition
527 // it needs to be executed in WQM or Exact so that its copy doesn't
528 // clobber inactive lanes.
529 markInstructionUses(MI, StateStrictWWM, Worklist);
530 GlobalFlags |= StateStrictWWM;
531 LowerToMovInstrs.push_back(&MI);
532 continue;
533 } else if (Opcode == AMDGPU::STRICT_WQM) {
534 // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
535 // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in
536 // quads that have at least one active thread.
537 markInstructionUses(MI, StateStrictWQM, Worklist);
538 GlobalFlags |= StateStrictWQM;
539 LowerToMovInstrs.push_back(&MI);
540 continue;
541 } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
542 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
543 III.Disabled = StateStrict;
544 MachineOperand &Inactive = MI.getOperand(2);
545 if (Inactive.isReg()) {
546 if (Inactive.isUndef()) {
547 LowerToCopyInstrs.push_back(&MI);
548 } else {
549 markOperand(MI, Inactive, StateStrictWWM, Worklist);
552 SetInactiveInstrs.push_back(&MI);
553 continue;
554 } else if (TII->isDisableWQM(MI)) {
555 BBI.Needs |= StateExact;
556 if (!(BBI.InNeeds & StateExact)) {
557 BBI.InNeeds |= StateExact;
558 Worklist.push_back(&MBB);
560 GlobalFlags |= StateExact;
561 III.Disabled = StateWQM | StateStrict;
562 continue;
563 } else {
564 if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
565 LiveMaskQueries.push_back(&MI);
566 } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
567 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
568 Opcode == AMDGPU::SI_DEMOTE_I1) {
569 KillInstrs.push_back(&MI);
570 BBI.NeedsLowering = true;
571 } else if (WQMOutputs) {
572 // The function is in machine SSA form, which means that physical
573 // VGPRs correspond to shader inputs and outputs. Inputs are
574 // only used, outputs are only defined.
575 // FIXME: is this still valid?
576 for (const MachineOperand &MO : MI.defs()) {
577 if (!MO.isReg())
578 continue;
580 Register Reg = MO.getReg();
582 if (!Reg.isVirtual() &&
583 TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
584 Flags = StateWQM;
585 break;
590 if (!Flags)
591 continue;
594 markInstruction(MI, Flags, Worklist);
595 GlobalFlags |= Flags;
599 // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
600 // ever used anywhere in the function. This implements the corresponding
601 // semantics of @llvm.amdgcn.set.inactive.
602 // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
603 if (GlobalFlags & StateWQM) {
604 for (MachineInstr *MI : SetInactiveInstrs)
605 markInstruction(*MI, StateWQM, Worklist);
606 for (MachineInstr *MI : SoftWQMInstrs)
607 markInstruction(*MI, StateWQM, Worklist);
610 return GlobalFlags;
613 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
614 std::vector<WorkItem>& Worklist) {
615 MachineBasicBlock *MBB = MI.getParent();
616 InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
617 BlockInfo &BI = Blocks[MBB];
619 // Control flow-type instructions and stores to temporary memory that are
620 // followed by WQM computations must themselves be in WQM.
621 if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
622 (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
623 Instructions[&MI].Needs = StateWQM;
624 II.Needs = StateWQM;
627 // Propagate to block level
628 if (II.Needs & StateWQM) {
629 BI.Needs |= StateWQM;
630 if (!(BI.InNeeds & StateWQM)) {
631 BI.InNeeds |= StateWQM;
632 Worklist.push_back(MBB);
636 // Propagate backwards within block
637 if (MachineInstr *PrevMI = MI.getPrevNode()) {
638 char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
639 if (!PrevMI->isPHI()) {
640 InstrInfo &PrevII = Instructions[PrevMI];
641 if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
642 PrevII.OutNeeds |= InNeeds;
643 Worklist.push_back(PrevMI);
648 // Propagate WQM flag to instruction inputs
649 assert(!(II.Needs & StateExact));
651 if (II.Needs != 0)
652 markInstructionUses(MI, II.Needs, Worklist);
654 // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
655 // not require any WQM transitions.
656 if (II.Needs & StateStrictWWM)
657 BI.Needs |= StateStrictWWM;
658 if (II.Needs & StateStrictWQM)
659 BI.Needs |= StateStrictWQM;
662 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
663 std::vector<WorkItem>& Worklist) {
664 BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
666 // Propagate through instructions
667 if (!MBB.empty()) {
668 MachineInstr *LastMI = &*MBB.rbegin();
669 InstrInfo &LastII = Instructions[LastMI];
670 if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
671 LastII.OutNeeds |= BI.OutNeeds;
672 Worklist.push_back(LastMI);
676 // Predecessor blocks must provide for our WQM/Exact needs.
677 for (MachineBasicBlock *Pred : MBB.predecessors()) {
678 BlockInfo &PredBI = Blocks[Pred];
679 if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
680 continue;
682 PredBI.OutNeeds |= BI.InNeeds;
683 PredBI.InNeeds |= BI.InNeeds;
684 Worklist.push_back(Pred);
687 // All successors must be prepared to accept the same set of WQM/Exact data.
688 for (MachineBasicBlock *Succ : MBB.successors()) {
689 BlockInfo &SuccBI = Blocks[Succ];
690 if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
691 continue;
693 SuccBI.InNeeds |= BI.OutNeeds;
694 Worklist.push_back(Succ);
698 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
699 std::vector<WorkItem> Worklist;
700 char GlobalFlags = scanInstructions(MF, Worklist);
702 while (!Worklist.empty()) {
703 WorkItem WI = Worklist.back();
704 Worklist.pop_back();
706 if (WI.MI)
707 propagateInstruction(*WI.MI, Worklist);
708 else
709 propagateBlock(*WI.MBB, Worklist);
712 return GlobalFlags;
715 MachineBasicBlock::iterator
716 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
717 MachineBasicBlock::iterator Before) {
718 Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
720 MachineInstr *Save =
721 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
722 .addReg(AMDGPU::SCC);
723 MachineInstr *Restore =
724 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
725 .addReg(SaveReg);
727 LIS->InsertMachineInstrInMaps(*Save);
728 LIS->InsertMachineInstrInMaps(*Restore);
729 LIS->createAndComputeVirtRegInterval(SaveReg);
731 return Restore;
734 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
735 MachineInstr *TermMI) {
736 LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
737 << *TermMI << "\n");
739 MachineBasicBlock *SplitBB =
740 BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
742 // Convert last instruction in block to a terminator.
743 // Note: this only covers the expected patterns
744 unsigned NewOpcode = 0;
745 switch (TermMI->getOpcode()) {
746 case AMDGPU::S_AND_B32:
747 NewOpcode = AMDGPU::S_AND_B32_term;
748 break;
749 case AMDGPU::S_AND_B64:
750 NewOpcode = AMDGPU::S_AND_B64_term;
751 break;
752 case AMDGPU::S_MOV_B32:
753 NewOpcode = AMDGPU::S_MOV_B32_term;
754 break;
755 case AMDGPU::S_MOV_B64:
756 NewOpcode = AMDGPU::S_MOV_B64_term;
757 break;
758 default:
759 break;
761 if (NewOpcode)
762 TermMI->setDesc(TII->get(NewOpcode));
764 if (SplitBB != BB) {
765 // Update dominator trees
766 using DomTreeT = DomTreeBase<MachineBasicBlock>;
767 SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
768 for (MachineBasicBlock *Succ : SplitBB->successors()) {
769 DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
770 DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
772 DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
773 if (MDT)
774 MDT->getBase().applyUpdates(DTUpdates);
775 if (PDT)
776 PDT->getBase().applyUpdates(DTUpdates);
778 // Link blocks
779 MachineInstr *MI =
780 BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
781 .addMBB(SplitBB);
782 LIS->InsertMachineInstrInMaps(*MI);
785 return SplitBB;
788 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
789 MachineInstr &MI) {
790 const DebugLoc &DL = MI.getDebugLoc();
791 unsigned Opcode = 0;
793 assert(MI.getOperand(0).isReg());
795 // Comparison is for live lanes; however here we compute the inverse
796 // (killed lanes). This is because VCMP will always generate 0 bits
797 // for inactive lanes so a mask of live lanes would not be correct
798 // inside control flow.
799 // Invert the comparison by swapping the operands and adjusting
800 // the comparison codes.
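// Illustrative example: if lanes stay live when "src0 SETULT src1" holds, the
// killed lanes satisfy the ordered comparison "src0 >= src1", which is emitted
// below as V_CMP_LE_F32 with the two source operands swapped.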
802 switch (MI.getOperand(2).getImm()) {
803 case ISD::SETUEQ:
804 Opcode = AMDGPU::V_CMP_LG_F32_e64;
805 break;
806 case ISD::SETUGT:
807 Opcode = AMDGPU::V_CMP_GE_F32_e64;
808 break;
809 case ISD::SETUGE:
810 Opcode = AMDGPU::V_CMP_GT_F32_e64;
811 break;
812 case ISD::SETULT:
813 Opcode = AMDGPU::V_CMP_LE_F32_e64;
814 break;
815 case ISD::SETULE:
816 Opcode = AMDGPU::V_CMP_LT_F32_e64;
817 break;
818 case ISD::SETUNE:
819 Opcode = AMDGPU::V_CMP_EQ_F32_e64;
820 break;
821 case ISD::SETO:
822 Opcode = AMDGPU::V_CMP_O_F32_e64;
823 break;
824 case ISD::SETUO:
825 Opcode = AMDGPU::V_CMP_U_F32_e64;
826 break;
827 case ISD::SETOEQ:
828 case ISD::SETEQ:
829 Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
830 break;
831 case ISD::SETOGT:
832 case ISD::SETGT:
833 Opcode = AMDGPU::V_CMP_NLT_F32_e64;
834 break;
835 case ISD::SETOGE:
836 case ISD::SETGE:
837 Opcode = AMDGPU::V_CMP_NLE_F32_e64;
838 break;
839 case ISD::SETOLT:
840 case ISD::SETLT:
841 Opcode = AMDGPU::V_CMP_NGT_F32_e64;
842 break;
843 case ISD::SETOLE:
844 case ISD::SETLE:
845 Opcode = AMDGPU::V_CMP_NGE_F32_e64;
846 break;
847 case ISD::SETONE:
848 case ISD::SETNE:
849 Opcode = AMDGPU::V_CMP_NLG_F32_e64;
850 break;
851 default:
852 llvm_unreachable("invalid ISD:SET cond code");
855 // Pick the e32 or e64 encoding depending on whether the condition operand is a VGPR.
856 MachineInstr *VcmpMI;
857 const MachineOperand &Op0 = MI.getOperand(0);
858 const MachineOperand &Op1 = MI.getOperand(1);
859 if (TRI->isVGPR(*MRI, Op0.getReg())) {
860 Opcode = AMDGPU::getVOPe32(Opcode);
861 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
862 } else {
863 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
864 .addReg(AMDGPU::VCC, RegState::Define)
865 .addImm(0) // src0 modifiers
866 .add(Op1)
867 .addImm(0) // src1 modifiers
868 .add(Op0)
869 .addImm(0); // omod
872 // VCC represents lanes killed.
873 Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
875 MachineInstr *MaskUpdateMI =
876 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
877 .addReg(LiveMaskReg)
878 .addReg(VCC);
880 // The state of SCC represents whether any lanes are live in the mask;
881 // if SCC is 0 then no lanes will be alive anymore.
882 MachineInstr *EarlyTermMI =
883 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
885 MachineInstr *ExecMaskMI =
886 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
888 assert(MBB.succ_size() == 1);
889 MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
890 .addMBB(*MBB.succ_begin());
892 // Update live intervals
893 LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
894 MBB.remove(&MI);
896 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
897 LIS->InsertMachineInstrInMaps(*ExecMaskMI);
898 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
899 LIS->InsertMachineInstrInMaps(*NewTerm);
901 return NewTerm;
904 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
905 MachineInstr &MI, bool IsWQM) {
906 const DebugLoc &DL = MI.getDebugLoc();
907 MachineInstr *MaskUpdateMI = nullptr;
909 const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
910 const MachineOperand &Op = MI.getOperand(0);
911 int64_t KillVal = MI.getOperand(1).getImm();
912 MachineInstr *ComputeKilledMaskMI = nullptr;
913 Register CndReg = !Op.isImm() ? Op.getReg() : Register();
914 Register TmpReg;
916 // Is this a static or dynamic kill?
917 if (Op.isImm()) {
918 if (Op.getImm() == KillVal) {
919 // Static: all active lanes are killed
920 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
921 .addReg(LiveMaskReg)
922 .addReg(Exec);
923 } else {
924 // Static: kill does nothing
925 MachineInstr *NewTerm = nullptr;
926 if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
927 LIS->RemoveMachineInstrFromMaps(MI);
928 } else {
929 assert(MBB.succ_size() == 1);
930 NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
931 .addMBB(*MBB.succ_begin());
932 LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
934 MBB.remove(&MI);
935 return NewTerm;
937 } else {
938 if (!KillVal) {
939 // Op represents live lanes after kill,
940 // so exec mask needs to be factored in.
941 TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
942 ComputeKilledMaskMI =
943 BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
944 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
945 .addReg(LiveMaskReg)
946 .addReg(TmpReg);
947 } else {
948 // Op represents lanes to kill
949 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
950 .addReg(LiveMaskReg)
951 .add(Op);
955 // The state of SCC represents whether any lanes are live in the mask;
956 // if SCC is 0 then no lanes will be alive anymore.
957 MachineInstr *EarlyTermMI =
958 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
960 // If we got this far, some lanes are still live;
961 // update EXEC to deactivate lanes as appropriate.
962 MachineInstr *NewTerm;
963 MachineInstr *WQMMaskMI = nullptr;
964 Register LiveMaskWQM;
965 if (IsDemote) {
966 // Demotes deactivate quads that contain only helper lanes
967 LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
968 WQMMaskMI =
969 BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
970 NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
971 .addReg(Exec)
972 .addReg(LiveMaskWQM);
973 } else {
974 // Kills deactivate lanes
975 if (Op.isImm()) {
976 unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
977 NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
978 } else if (!IsWQM) {
979 NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
980 .addReg(Exec)
981 .addReg(LiveMaskReg);
982 } else {
983 unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
984 NewTerm =
985 BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
989 // Update live intervals
990 LIS->RemoveMachineInstrFromMaps(MI);
991 MBB.remove(&MI);
992 assert(EarlyTermMI);
993 assert(MaskUpdateMI);
994 assert(NewTerm);
995 if (ComputeKilledMaskMI)
996 LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
997 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
998 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
999 if (WQMMaskMI)
1000 LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1001 LIS->InsertMachineInstrInMaps(*NewTerm);
1003 if (CndReg) {
1004 LIS->removeInterval(CndReg);
1005 LIS->createAndComputeVirtRegInterval(CndReg);
1007 if (TmpReg)
1008 LIS->createAndComputeVirtRegInterval(TmpReg);
1009 if (LiveMaskWQM)
1010 LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1012 return NewTerm;
1015 // Replace (or supplement) instructions accessing the live mask.
1016 // This can only happen once all the live mask registers have been created
1017 // and the execution state (WQM/StrictWWM/Exact) of instructions is known.
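// In practice this lowers kill and demote pseudo instructions, which may split
// the block after the newly created terminators.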
1018 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1019 auto BII = Blocks.find(&MBB);
1020 if (BII == Blocks.end())
1021 return;
1023 const BlockInfo &BI = BII->second;
1024 if (!BI.NeedsLowering)
1025 return;
1027 LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1029 SmallVector<MachineInstr *, 4> SplitPoints;
1030 char State = BI.InitialState;
1032 auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1033 while (II != IE) {
1034 auto Next = std::next(II);
1035 MachineInstr &MI = *II;
1037 if (StateTransition.count(&MI))
1038 State = StateTransition[&MI];
1040 MachineInstr *SplitPoint = nullptr;
1041 switch (MI.getOpcode()) {
1042 case AMDGPU::SI_DEMOTE_I1:
1043 case AMDGPU::SI_KILL_I1_TERMINATOR:
1044 SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1045 break;
1046 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1047 SplitPoint = lowerKillF32(MBB, MI);
1048 break;
1049 default:
1050 break;
1052 if (SplitPoint)
1053 SplitPoints.push_back(SplitPoint);
1055 II = Next;
1058 // Perform splitting after instruction scan to simplify iteration.
1059 if (!SplitPoints.empty()) {
1060 MachineBasicBlock *BB = &MBB;
1061 for (MachineInstr *MI : SplitPoints) {
1062 BB = splitBlock(BB, MI);
1067 // Return an iterator in the (inclusive) range [First, Last] at which
1068 // instructions can be safely inserted, keeping in mind that some of the
1069 // instructions we want to add necessarily clobber SCC.
1070 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1071 MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1072 MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1073 if (!SaveSCC)
1074 return PreferLast ? Last : First;
1076 LiveRange &LR =
1077 LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
1078 auto MBBE = MBB.end();
1079 SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1080 : LIS->getMBBEndIdx(&MBB);
1081 SlotIndex LastIdx =
1082 Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1083 SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1084 const LiveRange::Segment *S;
1086 for (;;) {
1087 S = LR.getSegmentContaining(Idx);
1088 if (!S)
1089 break;
1091 if (PreferLast) {
1092 SlotIndex Next = S->start.getBaseIndex();
1093 if (Next < FirstIdx)
1094 break;
1095 Idx = Next;
1096 } else {
1097 MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1098 assert(EndMI && "Segment does not end on valid instruction");
1099 auto NextI = std::next(EndMI->getIterator());
1100 if (NextI == MBB.end())
1101 break;
1102 SlotIndex Next = LIS->getInstructionIndex(*NextI);
1103 if (Next > LastIdx)
1104 break;
1105 Idx = Next;
1109 MachineBasicBlock::iterator MBBI;
1111 if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1112 MBBI = MI;
1113 else {
1114 assert(Idx == LIS->getMBBEndIdx(&MBB));
1115 MBBI = MBB.end();
1118 // Move insertion point past any operations modifying EXEC.
1119 // This assumes that the value of SCC defined by any of these operations
1120 // does not need to be preserved.
1121 while (MBBI != Last) {
1122 bool IsExecDef = false;
1123 for (const MachineOperand &MO : MBBI->operands()) {
1124 if (MO.isReg() && MO.isDef()) {
1125 IsExecDef |=
1126 MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1129 if (!IsExecDef)
1130 break;
1131 MBBI++;
1132 S = nullptr;
1135 if (S)
1136 MBBI = saveSCC(MBB, MBBI);
1138 return MBBI;
1141 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1142 MachineBasicBlock::iterator Before,
1143 Register SaveWQM) {
1144 MachineInstr *MI;
1146 if (SaveWQM) {
1147 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
1148 .addReg(LiveMaskReg);
1149 } else {
1150 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
1151 .addReg(Exec)
1152 .addReg(LiveMaskReg);
1155 LIS->InsertMachineInstrInMaps(*MI);
1156 StateTransition[MI] = StateExact;
1159 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1160 MachineBasicBlock::iterator Before,
1161 Register SavedWQM) {
1162 MachineInstr *MI;
1164 if (SavedWQM) {
1165 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1166 .addReg(SavedWQM);
1167 } else {
1168 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1171 LIS->InsertMachineInstrInMaps(*MI);
1172 StateTransition[MI] = StateWQM;
1175 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1176 MachineBasicBlock::iterator Before,
1177 Register SaveOrig, char StrictStateNeeded) {
1178 MachineInstr *MI;
1179 assert(SaveOrig);
1180 assert(StrictStateNeeded == StateStrictWWM ||
1181 StrictStateNeeded == StateStrictWQM);
1183 if (StrictStateNeeded == StateStrictWWM) {
1184 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1185 SaveOrig)
1186 .addImm(-1);
1187 } else {
1188 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1189 SaveOrig)
1190 .addImm(-1);
1192 LIS->InsertMachineInstrInMaps(*MI);
1193 StateTransition[MI] = StrictStateNeeded;
1196 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1197 MachineBasicBlock::iterator Before,
1198 Register SavedOrig, char NonStrictState,
1199 char CurrentStrictState) {
1200 MachineInstr *MI;
1202 assert(SavedOrig);
1203 assert(CurrentStrictState == StateStrictWWM ||
1204 CurrentStrictState == StateStrictWQM);
1206 if (CurrentStrictState == StateStrictWWM) {
1207 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1208 Exec)
1209 .addReg(SavedOrig);
1210 } else {
1211 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1212 Exec)
1213 .addReg(SavedOrig);
1215 LIS->InsertMachineInstrInMaps(*MI);
1216 StateTransition[MI] = NonStrictState;
1219 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1220 auto BII = Blocks.find(&MBB);
1221 if (BII == Blocks.end())
1222 return;
1224 BlockInfo &BI = BII->second;
1226 // This is a non-entry block that is WQM throughout, so no need to do
1227 // anything.
1228 if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1229 BI.InitialState = StateWQM;
1230 return;
1233 LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1234 << ":\n");
1236 Register SavedWQMReg;
1237 Register SavedNonStrictReg;
1238 bool WQMFromExec = IsEntry;
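// In the entry block the WQM mask can be recomputed from the live mask (which
// still equals the original EXEC), so no SavedWQMReg is needed there (see the
// note in the file header).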
1239 char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1240 char NonStrictState = 0;
1241 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1243 auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1244 if (IsEntry) {
1245 // Skip the instruction that saves LiveMask
1246 if (II != IE && II->getOpcode() == AMDGPU::COPY)
1247 ++II;
1250 // This stores the first instruction where it's safe to switch from WQM to
1251 // Exact or vice versa.
1252 MachineBasicBlock::iterator FirstWQM = IE;
1254 // This stores the first instruction where it's safe to switch from Strict
1255 // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1256 // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1257 // be safe to switch to/from WQM as well.
1258 MachineBasicBlock::iterator FirstStrict = IE;
1260 // Record initial state in block information.
1261 BI.InitialState = State;
1263 for (;;) {
1264 MachineBasicBlock::iterator Next = II;
1265 char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1266 char OutNeeds = 0;
1268 if (FirstWQM == IE)
1269 FirstWQM = II;
1271 if (FirstStrict == IE)
1272 FirstStrict = II;
1274 // First, figure out the allowed states (Needs) based on the propagated
1275 // flags.
1276 if (II != IE) {
1277 MachineInstr &MI = *II;
1279 if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1280 auto III = Instructions.find(&MI);
1281 if (III != Instructions.end()) {
1282 if (III->second.Needs & StateStrictWWM)
1283 Needs = StateStrictWWM;
1284 else if (III->second.Needs & StateStrictWQM)
1285 Needs = StateStrictWQM;
1286 else if (III->second.Needs & StateWQM)
1287 Needs = StateWQM;
1288 else
1289 Needs &= ~III->second.Disabled;
1290 OutNeeds = III->second.OutNeeds;
1292 } else {
1293 // If the instruction doesn't actually need a correct EXEC, then we can
1294 // safely leave Strict mode enabled.
1295 Needs = StateExact | StateWQM | StateStrict;
1298 if (MI.isTerminator() && OutNeeds == StateExact)
1299 Needs = StateExact;
1301 ++Next;
1302 } else {
1303 // End of basic block
1304 if (BI.OutNeeds & StateWQM)
1305 Needs = StateWQM;
1306 else if (BI.OutNeeds == StateExact)
1307 Needs = StateExact;
1308 else
1309 Needs = StateWQM | StateExact;
1312 // Now, transition if necessary.
1313 if (!(Needs & State)) {
1314 MachineBasicBlock::iterator First;
1315 if (State == StateStrictWWM || Needs == StateStrictWWM ||
1316 State == StateStrictWQM || Needs == StateStrictWQM) {
1317 // We must switch to or from Strict mode.
1318 First = FirstStrict;
1319 } else {
1320 // We only need to switch to/from WQM, so we can use FirstWQM.
1321 First = FirstWQM;
1324 // Whether we need to save SCC depends on start and end states.
1325 bool SaveSCC = false;
1326 switch (State) {
1327 case StateExact:
1328 case StateStrictWWM:
1329 case StateStrictWQM:
1330 // Exact/Strict -> Strict: save SCC
1331 // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1332 // Exact/Strict -> Exact: no save
1333 SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1334 break;
1335 case StateWQM:
1336 // WQM -> Exact/Strict: save SCC
1337 SaveSCC = !(Needs & StateWQM);
1338 break;
1339 default:
1340 llvm_unreachable("Unknown state");
1341 break;
1343 MachineBasicBlock::iterator Before =
1344 prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1346 if (State & StateStrict) {
1347 assert(State == StateStrictWWM || State == StateStrictWQM);
1348 assert(SavedNonStrictReg);
1349 fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1351 LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1352 SavedNonStrictReg = 0;
1353 State = NonStrictState;
1356 if (Needs & StateStrict) {
1357 NonStrictState = State;
1358 assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1359 assert(!SavedNonStrictReg);
1360 SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1362 toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1363 State = Needs;
1365 } else {
1366 if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
1367 if (!WQMFromExec && (OutNeeds & StateWQM)) {
1368 assert(!SavedWQMReg);
1369 SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1372 toExact(MBB, Before, SavedWQMReg);
1373 State = StateExact;
1374 } else if (State == StateExact && (Needs & StateWQM) &&
1375 !(Needs & StateExact)) {
1376 assert(WQMFromExec == (SavedWQMReg == 0));
1378 toWQM(MBB, Before, SavedWQMReg);
1380 if (SavedWQMReg) {
1381 LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1382 SavedWQMReg = 0;
1384 State = StateWQM;
1385 } else {
1386 // We can get here if we transitioned from StrictWWM to a
1387 // non-StrictWWM state that already matches our needs, so we
1388 // shouldn't need to do anything.
1389 assert(Needs & State);
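// If this instruction constrains the allowed states, later transitions must
// not be hoisted above it, so reset the candidate insertion points.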
1394 if (Needs != (StateExact | StateWQM | StateStrict)) {
1395 if (Needs != (StateExact | StateWQM))
1396 FirstWQM = IE;
1397 FirstStrict = IE;
1400 if (II == IE)
1401 break;
1403 II = Next;
1405 assert(!SavedWQMReg);
1406 assert(!SavedNonStrictReg);
1409 void SIWholeQuadMode::lowerLiveMaskQueries() {
1410 for (MachineInstr *MI : LiveMaskQueries) {
1411 const DebugLoc &DL = MI->getDebugLoc();
1412 Register Dest = MI->getOperand(0).getReg();
1414 MachineInstr *Copy =
1415 BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1416 .addReg(LiveMaskReg);
1418 LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1419 MI->eraseFromParent();
1423 void SIWholeQuadMode::lowerCopyInstrs() {
1424 for (MachineInstr *MI : LowerToMovInstrs) {
1425 assert(MI->getNumExplicitOperands() == 2);
1427 const Register Reg = MI->getOperand(0).getReg();
1428 const unsigned SubReg = MI->getOperand(0).getSubReg();
1430 if (TRI->isVGPR(*MRI, Reg)) {
1431 const TargetRegisterClass *regClass =
1432 Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
1433 if (SubReg)
1434 regClass = TRI->getSubRegClass(regClass, SubReg);
1436 const unsigned MovOp = TII->getMovOpcode(regClass);
1437 MI->setDesc(TII->get(MovOp));
1439 // Check that it already implicitly depends on exec (like all VALU movs
1440 // should do).
1441 assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1442 return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1443 }));
1444 } else {
1445 // Remove early-clobber and exec dependency from simple SGPR copies.
1446 // This allows some to be eliminated during/post RA.
1447 LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1448 if (MI->getOperand(0).isEarlyClobber()) {
1449 LIS->removeInterval(Reg);
1450 MI->getOperand(0).setIsEarlyClobber(false);
1451 LIS->createAndComputeVirtRegInterval(Reg);
1453 int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1454 while (Index >= 0) {
1455 MI->RemoveOperand(Index);
1456 Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1458 MI->setDesc(TII->get(AMDGPU::COPY));
1459 LLVM_DEBUG(dbgs() << " -> " << *MI);
1462 for (MachineInstr *MI : LowerToCopyInstrs) {
1463 if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
1464 MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
1465 assert(MI->getNumExplicitOperands() == 3);
1466 // The only reason we should be here is that V_SET_INACTIVE has
1467 // an undef input, so it is being replaced by a simple copy.
1468 // There should be a second undef source that we should remove.
1469 assert(MI->getOperand(2).isUndef());
1470 MI->RemoveOperand(2);
1471 MI->untieRegOperand(1);
1472 } else {
1473 assert(MI->getNumExplicitOperands() == 2);
1476 MI->setDesc(TII->get(AMDGPU::COPY));
1480 void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1481 for (MachineInstr *MI : KillInstrs) {
1482 MachineBasicBlock *MBB = MI->getParent();
1483 MachineInstr *SplitPoint = nullptr;
1484 switch (MI->getOpcode()) {
1485 case AMDGPU::SI_DEMOTE_I1:
1486 case AMDGPU::SI_KILL_I1_TERMINATOR:
1487 SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1488 break;
1489 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1490 SplitPoint = lowerKillF32(*MBB, *MI);
1491 break;
1492 default:
1493 continue;
1495 if (SplitPoint)
1496 splitBlock(MBB, SplitPoint);
1500 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1501 LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1502 << " ------------- \n");
1503 LLVM_DEBUG(MF.dump(););
1505 Instructions.clear();
1506 Blocks.clear();
1507 LiveMaskQueries.clear();
1508 LowerToCopyInstrs.clear();
1509 LowerToMovInstrs.clear();
1510 KillInstrs.clear();
1511 StateTransition.clear();
1513 ST = &MF.getSubtarget<GCNSubtarget>();
1515 TII = ST->getInstrInfo();
1516 TRI = &TII->getRegisterInfo();
1517 MRI = &MF.getRegInfo();
1518 LIS = &getAnalysis<LiveIntervals>();
1519 MDT = &getAnalysis<MachineDominatorTree>();
1520 PDT = &getAnalysis<MachinePostDominatorTree>();
1522 if (ST->isWave32()) {
1523 AndOpc = AMDGPU::S_AND_B32;
1524 AndN2Opc = AMDGPU::S_ANDN2_B32;
1525 XorOpc = AMDGPU::S_XOR_B32;
1526 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1527 OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
1528 WQMOpc = AMDGPU::S_WQM_B32;
1529 Exec = AMDGPU::EXEC_LO;
1530 } else {
1531 AndOpc = AMDGPU::S_AND_B64;
1532 AndN2Opc = AMDGPU::S_ANDN2_B64;
1533 XorOpc = AMDGPU::S_XOR_B64;
1534 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1535 OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
1536 WQMOpc = AMDGPU::S_WQM_B64;
1537 Exec = AMDGPU::EXEC;
1540 const char GlobalFlags = analyzeFunction(MF);
1541 const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
1543 LiveMaskReg = Exec;
1545 // Shader is simple and does not need any state changes or complex lowering
1546 if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
1547 LowerToMovInstrs.empty() && KillInstrs.empty()) {
1548 lowerLiveMaskQueries();
1549 return !LiveMaskQueries.empty();
1552 MachineBasicBlock &Entry = MF.front();
1553 MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
1555 // Store a copy of the original live mask when required
1556 if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
1557 LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1558 MachineInstr *MI =
1559 BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1560 .addReg(Exec);
1561 LIS->InsertMachineInstrInMaps(*MI);
1564 LLVM_DEBUG(printInfo());
1566 lowerLiveMaskQueries();
1567 lowerCopyInstrs();
1569 // Shader only needs WQM
1570 if (GlobalFlags == StateWQM) {
1571 auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1572 .addReg(Exec);
1573 LIS->InsertMachineInstrInMaps(*MI);
1574 lowerKillInstrs(true);
1575 } else {
1576 for (auto BII : Blocks)
1577 processBlock(*BII.first, BII.first == &Entry);
1578 // Lowering blocks causes block splitting so perform as a second pass.
1579 for (auto BII : Blocks)
1580 lowerBlock(*BII.first);
1583 // Compute live range for live mask
1584 if (LiveMaskReg != Exec)
1585 LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1587 // Physical registers like SCC aren't tracked by default anyway, so just
1588 // removing the ranges we computed is the simplest option for maintaining
1589 // the analysis results.
1590 LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
1592 // If we performed any kills then recompute EXEC
1593 if (!KillInstrs.empty())
1594 LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
1596 return true;