//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode (strict or non-strict)
/// for pixel shaders, and strict whole wavefront mode for all programs.
///
/// The "strict" prefix indicates that inactive lanes do not take part in
/// control flow; specifically, an inactive lane enabled by strict WQM/WWM will
/// always be enabled irrespective of control flow decisions. Conversely, in
/// non-strict WQM inactive lanes may take part in control flow decisions.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary, but disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires strict whole
/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
/// we use a similar save and restore mechanism and force whole quad mode for
/// those instructions:
///
///   S_MOV_B64 Tmp, EXEC
///   S_WQM_B64 EXEC, EXEC
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateStrictWWM = 0x2,
  StateStrictWQM = 0x4,
  StateExact = 0x8,
  StateStrict = StateStrictWWM | StateStrictWQM,
};
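
// Helper for pretty-printing a state bitmask in debug output.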
struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  static const std::pair<char, const char *> Mapping[] = {
      std::make_pair(StateWQM, "WQM"),
      std::make_pair(StateStrictWWM, "StrictWWM"),
      std::make_pair(StateStrictWQM, "StrictWQM"),
      std::make_pair(StateExact, "Exact")};
  char State = PS.State;
  for (auto M : Mapping) {
    if (State & M.first) {
      OS << M.second;
      State &= ~M.first;

      if (State)
        OS << '|';
    }
  }
  return OS;
}

struct InstrInfo {
  char Needs = 0;
  char Disabled = 0;
  char OutNeeds = 0;
};

struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
  char InitialState = 0;
  bool NeedsLowering = false;
};
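
// A unit of work for the analysis worklist: either a whole basic block or a
// single instruction whose flags need (re)propagation.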
struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;

  unsigned AndOpc;
  unsigned AndN2Opc;
  unsigned XorOpc;
  unsigned AndSaveExecOpc;
  unsigned OrSaveExecOpc;
  unsigned WQMOpc;
  Register Exec;
  Register LiveMaskReg;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  MapVector<MachineBasicBlock *, BlockInfo> Blocks;

  // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
  DenseMap<const MachineInstr *, char> StateTransition;

  SmallVector<MachineInstr *, 2> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
  SmallVector<MachineInstr *, 4> KillInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
  void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
                   std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               Register SaveWQM);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             Register SavedWQM);
  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                    Register SaveOrig, char StrictStateNeeded);
  void fromStrictMode(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Before, Register SavedOrig,
                      char NonStrictState, char CurrentStrictState);

  MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);

  MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
                            bool IsWQM);
  MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);

  void lowerBlock(MachineBasicBlock &MBB);
  void processBlock(MachineBasicBlock &MBB, bool IsEntry);

  void lowerLiveMaskQueries();
  void lowerCopyInstrs();
  void lowerKillInstrs(bool IsWQM);

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    AU.addPreserved<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getClearedProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
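
// Add Flag to the needs of MI; if this changes anything, queue MI so the new
// requirement is propagated to its inputs and neighbours.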
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               Register Reg, unsigned SubReg, char Flag,
                               std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);

  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
  const VNInfo *Value = UseLRQ.valueIn();
  if (!Value)
    return;

  // Note: this code assumes that lane masks on AMDGPU completely
  // cover registers.
  const LaneBitmask UseLanes =
      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
             : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
                                : LaneBitmask::getNone());

  // Perform a depth-first iteration of the LiveRange graph marking defs.
  // Stop processing of a given branch when all use lanes have been defined.
  // The first definition stops processing for a physical register.
  struct PhiEntry {
    const VNInfo *Phi;
    unsigned PredIdx;
    LaneBitmask DefinedLanes;

    PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
        : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
  };
  using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
  SmallVector<PhiEntry, 2> PhiStack;
  SmallSet<VisitKey, 4> Visited;
  LaneBitmask DefinedLanes;
  unsigned NextPredIdx = 0; // Only used for processing phi nodes
  do {
    const VNInfo *NextValue = nullptr;
    const VisitKey Key(Value, DefinedLanes);

    if (!Visited.count(Key)) {
      // On first visit to a phi then start processing first predecessor
      NextPredIdx = 0;
      Visited.insert(Key);
    }

    if (Value->isPHIDef()) {
      // Each predecessor node in the phi must be processed as a subgraph
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");

      // Find next predecessor to process
      unsigned Idx = NextPredIdx;
      auto PI = MBB->pred_begin() + Idx;
      auto PE = MBB->pred_end();
      for (; PI != PE && !NextValue; ++PI, ++Idx) {
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
            NextValue = VN;
        }
      }

      // If there are more predecessors to process; add phi to stack
      if (PI != PE)
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");

      if (Reg.isVirtual()) {
        // Iterate over all operands to find relevant definitions
        bool HasDef = false;
        for (const MachineOperand &Op : MI->operands()) {
          if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
            continue;

          // Compute lanes defined and overlap with use
          LaneBitmask OpLanes =
              Op.isUndef() ? LaneBitmask::getAll()
                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
          LaneBitmask Overlap = (UseLanes & OpLanes);

          // Record if this instruction defined any of use
          HasDef |= Overlap.any();

          // Mark any lanes defined
          DefinedLanes |= OpLanes;
        }

        // Check if all lanes of use have been defined
        if ((DefinedLanes & UseLanes) != UseLanes) {
          // Definition not complete; need to process input value
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
              NextValue = VN;
          }
        }

        // Only mark the instruction if it defines some part of the use
        if (HasDef)
          markInstruction(*MI, Flag, Worklist);
      } else {
        // For physical registers simply mark the defining instruction
        markInstruction(*MI, Flag, Worklist);
      }
    }

    if (!NextValue && !PhiStack.empty()) {
      // Reach end of chain; revert to processing last phi
      PhiEntry &Entry = PhiStack.back();
      NextValue = Entry.Phi;
      NextPredIdx = Entry.PredIdx;
      DefinedLanes = Entry.DefinedLanes;
      PhiStack.pop_back();
    }

    Value = NextValue;
  } while (Value);
}
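
// Mark all defs reached from the register operand Op of MI, dispatching to
// markDefs for both virtual registers and tracked physical registers.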
void SIWholeQuadMode::markOperand(const MachineInstr &MI,
                                  const MachineOperand &Op, char Flag,
                                  std::vector<WorkItem> &Worklist) {
  assert(Op.isReg());
  Register Reg = Op.getReg();

  // Ignore some hardware registers
  switch (Reg) {
  case AMDGPU::EXEC:
  case AMDGPU::EXEC_LO:
    return;
  default:
    break;
  }

  LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
                    << " for " << MI);
  if (Reg.isVirtual()) {
    LiveRange &LR = LIS->getInterval(Reg);
    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
  } else {
    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
         ++RegUnit) {
      LiveRange &LR = LIS->getRegUnit(*RegUnit);
      const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
      if (!Value)
        continue;

      markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
    }
  }
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
                    << MI);

  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;
    markOperand(MI, Use, Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // If LOD is not supported WQM is not needed.
        if (!ST->hasExtendedImageInsts())
          continue;
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
        // it needs to be executed in WQM or Exact so that its copy doesn't
        // clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::STRICT_WQM) {
        // STRICT_WQM is similar to STRICTWWM, but instead of enabling all
        // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in
        // quads that have at least one active thread.
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateStrict;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
      }

      if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
        LiveMaskQueries.push_back(&MI);
      } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                 Opcode == AMDGPU::SI_DEMOTE_I1) {
        KillInstrs.push_back(&MI);
        BBI.NeedsLowering = true;
      } else if (WQMOutputs) {
        // The function is in machine SSA form, which means that physical
        // VGPRs correspond to shader inputs and outputs. Inputs are
        // only used, outputs are only defined.
        // FIXME: is this still valid?
        for (const MachineOperand &MO : MI.defs()) {
          if (!MO.isReg())
            continue;

          Register Reg = MO.getReg();

          if (!Reg.isVirtual() &&
              TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
            Flags = StateWQM;
            break;
          }
        }
      }

      if (!Flags)
        continue;

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}
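
// Propagate the needs of a single instruction: up to its block, backwards to
// the previous instruction, and into its inputs.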
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
  // not require any WQM transitions.
  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
}
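
// Propagate block-level needs across CFG edges to predecessors and
// successors, and into the block's last instruction.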
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}
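
// Scan the function once, then iterate the worklist to a fixed point.
// Returns the union of all states needed anywhere in the function.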
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}
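
// Save SCC to a scratch SGPR and restore it at Before, returning an iterator
// at the restore copy so the caller can insert in between the two copies.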
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}
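
// Split BB at TermMI, converting the split point into a real terminator and
// keeping the dominator trees and live intervals up to date.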
MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
                                               MachineInstr *TermMI) {
  LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
                    << *TermMI << "\n");

  MachineBasicBlock *SplitBB =
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);

  // Convert last instruction in block to a terminator.
  // Note: this only covers the expected patterns
  unsigned NewOpcode = 0;
  switch (TermMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
    NewOpcode = AMDGPU::S_AND_B32_term;
    break;
  case AMDGPU::S_AND_B64:
    NewOpcode = AMDGPU::S_AND_B64_term;
    break;
  case AMDGPU::S_MOV_B32:
    NewOpcode = AMDGPU::S_MOV_B32_term;
    break;
  case AMDGPU::S_MOV_B64:
    NewOpcode = AMDGPU::S_MOV_B64_term;
    break;
  default:
    break;
  }
  if (NewOpcode)
    TermMI->setDesc(TII->get(NewOpcode));

  if (SplitBB != BB) {
    // Update dominator trees
    using DomTreeT = DomTreeBase<MachineBasicBlock>;
    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
    for (MachineBasicBlock *Succ : SplitBB->successors()) {
      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    }
    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    if (MDT)
      MDT->getBase().applyUpdates(DTUpdates);
    if (PDT)
      PDT->getBase().applyUpdates(DTUpdates);

    // Link blocks
    MachineInstr *MI =
        BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(SplitBB);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  return SplitBB;
}
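
// Lower SI_KILL_F32_COND_IMM_TERMINATOR: emit an inverted V_CMP computing the
// killed lanes, clear them from the live mask and EXEC, and terminate the
// wave early if no lanes remain.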
MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
                                            MachineInstr &MI) {
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Opcode = 0;

  assert(MI.getOperand(0).isReg());

  // Comparison is for live lanes; however here we compute the inverse
  // (killed lanes). This is because VCMP will always generate 0 bits
  // for inactive lanes so a mask of live lanes would not be correct
  // inside control flow.
  // Invert the comparison by swapping the operands and adjusting
  // the comparison codes.

  switch (MI.getOperand(2).getImm()) {
  case ISD::SETUEQ:
    Opcode = AMDGPU::V_CMP_LG_F32_e64;
    break;
  case ISD::SETUGT:
    Opcode = AMDGPU::V_CMP_GE_F32_e64;
    break;
  case ISD::SETUGE:
    Opcode = AMDGPU::V_CMP_GT_F32_e64;
    break;
  case ISD::SETULT:
    Opcode = AMDGPU::V_CMP_LE_F32_e64;
    break;
  case ISD::SETULE:
    Opcode = AMDGPU::V_CMP_LT_F32_e64;
    break;
  case ISD::SETUNE:
    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
    break;
  case ISD::SETUO:
    Opcode = AMDGPU::V_CMP_O_F32_e64;
    break;
  case ISD::SETO:
    Opcode = AMDGPU::V_CMP_U_F32_e64;
    break;
  case ISD::SETOEQ:
  case ISD::SETEQ:
    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
    break;
  case ISD::SETOGT:
  case ISD::SETGT:
    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
    break;
  case ISD::SETOGE:
  case ISD::SETGE:
    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
    break;
  case ISD::SETOLT:
  case ISD::SETLT:
    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
    break;
  case ISD::SETOLE:
  case ISD::SETLE:
    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
    break;
  case ISD::SETONE:
  case ISD::SETNE:
    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
    break;
  default:
    llvm_unreachable("invalid ISD:SET cond code");
  }

  // Pick opcode based on comparison type.
  MachineInstr *VcmpMI;
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);
  if (TRI->isVGPR(*MRI, Op0.getReg())) {
    Opcode = AMDGPU::getVOPe32(Opcode);
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
  } else {
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
                 .addReg(AMDGPU::VCC, RegState::Define)
                 .addImm(0) // src0 modifiers
                 .add(Op1)
                 .addImm(0) // src1 modifiers
                 .add(Op0)
                 .addImm(0); // clamp
  }

  // VCC represents lanes killed.
  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  MachineInstr *MaskUpdateMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
          .addReg(LiveMaskReg)
          .addReg(VCC);

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  MachineInstr *ExecMaskMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);

  assert(MBB.succ_size() == 1);
  MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                              .addMBB(*MBB.succ_begin());

  // Update live intervals
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  return NewTerm;
}
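
// Lower SI_KILL_I1_TERMINATOR and SI_DEMOTE_I1: fold the (possibly constant)
// kill condition into the live mask, then update EXEC, treating demotes
// specially so quads that still contain helper lanes stay active.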
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
                                           MachineInstr &MI, bool IsWQM) {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineInstr *MaskUpdateMI = nullptr;

  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(Exec);
    } else {
      // Static: kill does nothing
      MachineInstr *NewTerm = nullptr;
      if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        assert(MBB.succ_size() == 1);
        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                      .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return NewTerm;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after kill,
      // so exec mask needs to be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI =
          BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // In the case we got this far some lanes are still live,
  // update EXEC to deactivate lanes as appropriate.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demote - deactivate quads with only helper lanes
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI =
        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                  .addReg(Exec)
                  .addReg(LiveMaskWQM);
  } else {
    // Kill - deactivate lanes no longer in the live mask
    if (Op.isImm()) {
      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
    } else if (!IsWQM) {
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
                    .addReg(Exec)
                    .addReg(LiveMaskReg);
    } else {
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
    }
  }

  // Update live intervals
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  assert(MaskUpdateMI);

  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}

// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;
  if (!BI.NeedsLowering)
    return;

  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");

  SmallVector<MachineInstr *, 4> SplitPoints;
  char State = BI.InitialState;

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  while (II != IE) {
    auto Next = std::next(II);
    MachineInstr &MI = *II;

    if (StateTransition.count(&MI))
      State = StateTransition[&MI];

    MachineInstr *SplitPoint = nullptr;
    switch (MI.getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(MBB, MI);
      break;
    default:
      break;
    }
    if (SplitPoint)
      SplitPoints.push_back(SplitPoint);

    II = Next;
  }

  // Perform splitting after instruction scan to simplify iteration.
  if (!SplitPoints.empty()) {
    MachineBasicBlock *BB = &MBB;
    for (MachineInstr *MI : SplitPoints) {
      BB = splitBlock(BB, MI);
    }
  }
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR =
      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->operands()) {
      if (MO.isReg() && MO.isDef()) {
        IsExecDef |=
            MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
      }
    }
    if (!IsExecDef)
      break;
    MBBI++;
    S = nullptr;
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}
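
// Switch between Exact and WQM: toExact ANDs EXEC with the live mask
// (optionally saving the WQM EXEC in SaveWQM); toWQM restores a saved mask
// or recomputes WQM from EXEC.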
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              Register SaveWQM) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
             .addReg(Exec)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateExact;
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            Register SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateWQM;
}
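
// Enter StrictWWM or StrictWQM, saving the current EXEC in SaveOrig so the
// matching fromStrictMode can restore it.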
void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator Before,
                                   Register SaveOrig, char StrictStateNeeded) {
  MachineInstr *MI;
  assert(SaveOrig);
  assert(StrictStateNeeded == StateStrictWWM ||
         StrictStateNeeded == StateStrictWQM);

  if (StrictStateNeeded == StateStrictWWM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
                 SaveOrig)
             .addImm(-1);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
                 SaveOrig)
             .addImm(-1);
  }
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StrictStateNeeded;
}

void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Before,
                                     Register SavedOrig, char NonStrictState,
                                     char CurrentStrictState) {
  MachineInstr *MI;

  assert(SavedOrig);
  assert(CurrentStrictState == StateStrictWWM ||
         CurrentStrictState == StateStrictWQM);

  if (CurrentStrictState == StateStrictWWM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
                 Exec)
             .addReg(SavedOrig);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
                 Exec)
             .addReg(SavedOrig);
  }
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = NonStrictState;
}
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  Register SavedWQMReg;
  Register SavedNonStrictReg;
  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (IsEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY)
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from Strict
  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
  // be safe to switch to/from WQM as well.
  MachineBasicBlock::iterator FirstStrict = IE;

  // Record initial state in block information.
  BI.InitialState = State;

  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstStrict == IE)
      FirstStrict = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave Strict mode enabled.
        Needs = StateExact | StateWQM | StateStrict;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        // We must switch to or from Strict mode.
        First = FirstStrict;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM.
        First = FirstWQM;
      }

      // Whether we need to save SCC depends on start and end states.
      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        // Exact/Strict -> Strict: save SCC
        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
        // Exact/Strict -> Exact: no save
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        // WQM -> Exact/Strict: save SCC
        SaveSCC = !(Needs & StateWQM);
        break;
      default:
        llvm_unreachable("Unknown state");
        break;
      }
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);

      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
        SavedNonStrictReg = 0;
        State = NonStrictState;
      }

      if (Needs & StateStrict) {
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
        State = Needs;

      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, but we
          // shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }

  assert(!SavedWQMReg);
  assert(!SavedNonStrictReg);
}
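
// Replace SI_PS_LIVE and SI_LIVE_MASK queries with copies of the live mask.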
void SIWholeQuadMode::lowerLiveMaskQueries() {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    Register Dest = MI->getOperand(0).getReg();

    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}
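
// Rewrite WQM/SOFT_WQM/STRICT_* pseudos into plain moves or copies now that
// the required EXEC manipulation has been inserted around them.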
void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();
    const unsigned SubReg = MI->getOperand(0).getSubReg();

    if (TRI->isVGPR(*MRI, Reg)) {
      const TargetRegisterClass *regClass =
          Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
      if (SubReg)
        regClass = TRI->getSubRegClass(regClass, SubReg);

      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // Check that it already implicitly depends on exec (like all VALU movs
      // should do).
      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));
    } else {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      while (Index >= 0) {
        MI->RemoveOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << "  -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
      assert(MI->getNumExplicitOperands() == 3);
      // The only reason we should be here is that V_SET_INACTIVE has an undef
      // input, so it is being replaced by a simple copy. There should be a
      // second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->RemoveOperand(2);
      MI->untieRegOperand(1);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    MI->setDesc(TII->get(AMDGPU::COPY));
  }
}

void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
  for (MachineInstr *MI : KillInstrs) {
    MachineBasicBlock *MBB = MI->getParent();
    MachineInstr *SplitPoint = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MBB, *MI);
      break;
    default:
      continue;
    }
    if (SplitPoint)
      splitBlock(MBB, SplitPoint);
  }
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump(););

  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  StateTransition.clear();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();
  MDT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }

  const char GlobalFlags = analyzeFunction(MF);
  const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());

  LiveMaskReg = Exec;

  // Shader is simple and does not need any state changes or complex lowering
  if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
      LowerToMovInstrs.empty() && KillInstrs.empty()) {
    lowerLiveMaskQueries();
    return !LiveMaskQueries.empty();
  }

  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

  // Store a copy of the original live mask when required
  if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  LLVM_DEBUG(printInfo());

  lowerLiveMaskQueries();
  lowerCopyInstrs();

  // Shader only needs WQM
  if (GlobalFlags == StateWQM) {
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
                  .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
  } else {
    for (auto BII : Blocks)
      processBlock(*BII.first, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto BII : Blocks)
      lowerBlock(*BII.first);
  }

  // Compute live range for live mask
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));

  // If we performed any kills then recompute EXEC
  if (!KillInstrs.empty())
    LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));

  return true;
}