//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions with
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires Whole Wavefront
/// Mode (WWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of Exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
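///
/// As an editorial illustration (not exhaustive), a pixel shader that samples
/// a texture and then stores the result would be rewritten roughly as
///
///   S_MOV_B64 LiveMask, EXEC          ; save the set of live lanes
///   S_WQM_B64 EXEC, EXEC              ; enter WQM for the derivatives
///   IMAGE_SAMPLE ...                  ; requires helper lanes
///   S_AND_B64 EXEC, EXEC, LiveMask    ; back to Exact before side effects
///   BUFFER_STORE_DWORD ...            ; helper lanes must not store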
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <vector>
using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateWWM = 0x2,
  StateExact = 0x4,
};

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  if (PS.State & StateWQM)
    OS << "WQM";
  if (PS.State & StateWWM) {
    if (PS.State & StateWQM)
      OS << '|';
    OS << "WWM";
  }
  if (PS.State & StateExact) {
    if (PS.State & (StateWQM | StateWWM))
      OS << '|';
    OS << "Exact";
  }

  return OS;
}

struct InstrInfo {
  char Needs = 0;
  char Disabled = 0;
  char OutNeeds = 0;
};

struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};
class SIWholeQuadMode : public MachineFunctionPass {
private:
  CallingConv::ID CallingConv;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  bool requiresCorrectState(const MachineInstr &MI) const;

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SaveOrig);
  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SavedOrig);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);
  void lowerCopyInstrs();

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
} // end anonymous namespace
char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  II.Needs |= Flag;

  Worklist.push_back(&MI);
}
/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    Register Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!Register::isVirtualRegister(Reg)) {
      if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
        continue;

      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        // Since we're in machine SSA, we do not need to track physical
        // registers across basic blocks.
        if (Value->isPHIDef())
          continue;

        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                        Worklist);
      }

      continue;
    }

    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
      markInstruction(DefMI, Flag, Worklist);
  }
}
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::WWM) {
        // The WWM intrinsic doesn't make the same guarantee, and in addition
        // it needs to be executed in WQM or Exact so that its copy doesn't
        // clobber inactive lanes.
        markInstructionUses(MI, StateWWM, Worklist);
        GlobalFlags |= StateWWM;
        LowerToCopyInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateWWM;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            Register Reg = Inactive.getReg();
            if (Register::isVirtualRegister(Reg)) {
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                markInstruction(DefMI, StateWWM, Worklist);
            }
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateWWM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            Register Reg = MO.getReg();

            if (!Register::isVirtualRegister(Reg) &&
                TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing WWM, even if it does not require any
  // WQM transitions.
  if (II.Needs & StateWWM)
    BI.Needs |= StateWWM;
}
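
// Illustrative example: if an IMAGE_SAMPLE (a WQM instruction) consumes a
// value produced by a plain VALU add, scanInstructions marks the add via
// markInstructionUses; when the worklist revisits the add, the code above
// propagates the WQM requirement to the add's own inputs and to its block.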
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem>& Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}
/// Whether \p MI really requires the exec state computed during analysis.
///
/// Scalar instructions must occasionally be marked WQM for correct propagation
/// (e.g. thread masks leading up to branches), but when it comes to actual
/// execution, they don't care about EXEC.
bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
  if (MI.isTerminator())
    return true;

  // Skip instructions that are not affected by EXEC
  if (TII->isScalarUnit(MI))
    return false;

  // Generic instructions such as COPY will either disappear by register
  // coalescing or be lowered to SALU or VALU instructions.
  if (MI.isTransient()) {
    if (MI.getNumExplicitOperands() >= 1) {
      const MachineOperand &Op = MI.getOperand(0);
      if (Op.isReg()) {
        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
          // SGPR instructions are not affected by EXEC
          return false;
        }
      }
    }
  }

  return true;
}
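
// The EXEC manipulation we insert (S_WQM, S_AND_SAVEEXEC, ...) clobbers SCC.
// If SCC is live at the chosen insertion point, copy it to an SGPR first and
// copy it back afterwards, so the mode switch is invisible to SCC users.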
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}
// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      SlotIndex Next = S->end.getNextIndex().getBaseIndex();
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
                 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64),
                 SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
                 AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
                 Exec)
             .addReg(Exec)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  MachineInstr *MI;

  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
                 AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
                 Exec)
             .addReg(Exec);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}
void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SaveOrig) {
  MachineInstr *MI;

  assert(SaveOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
           .addImm(-1);
  LIS->InsertMachineInstrInMaps(*MI);
}
void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SavedOrig) {
  MachineInstr *MI;

  assert(SavedOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
               ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
           .addReg(SavedOrig);
  LIS->InsertMachineInstrInMaps(*MI);
}
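
// Note: ENTER_WWM/EXIT_WWM are pseudo-instructions that are only expanded
// into s_or_saveexec/s_mov of EXEC after register allocation; keeping them
// abstract here lets later passes recognize where WWM is entered and left.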
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
    return;

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  unsigned SavedWQMReg = 0;
  unsigned SavedNonWWMReg = 0;
  bool WQMFromExec = isEntry;
  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonWWMState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (isEntry)
    ++II; // Skip the instruction that saves LiveMask

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from WWM to
  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
  // switch to/from WQM as well.
  MachineBasicBlock::iterator FirstWWM = IE;
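
  // The loop below walks the block once, tracking the current mode in State
  // and computing, for each instruction, the set of modes it tolerates
  // (Needs). A switch is emitted only when State is not in that set, and it
  // is placed at the earliest point (FirstWQM/FirstWWM) where it is safe.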
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // WWM is disabled by default
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;
    if (FirstWWM == IE)
      FirstWWM = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (requiresCorrectState(MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateWWM)
            Needs = StateWWM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave WWM enabled.
        Needs = StateExact | StateWQM | StateWWM;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
        MI.getOperand(3).setImm(1);

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateWWM || Needs == StateWWM) {
        // We must switch to or from WWM
        First = FirstWWM;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM
        First = FirstWQM;
      }

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM,
                           Needs == StateExact || WQMFromExec);

      if (State == StateWWM) {
        assert(SavedNonWWMReg);
        fromWWM(MBB, Before, SavedNonWWMReg);
        State = NonWWMState;
      }

      if (Needs == StateWWM) {
        NonWWMState = State;
        SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
        toWWM(MBB, Before, SavedNonWWMReg);
        State = StateWWM;
      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM))
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);

          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from WWM to a non-WWM state
          // that already matches our needs, but we shouldn't need to do
          // anything.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateWWM)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstWWM = IE;
    }

    if (II == IE)
      break;
    II = Next;
  }
}
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    Register Dest = MI->getOperand(0).getReg();
    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}
void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToCopyInstrs) {
    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
      MI->RemoveOperand(i);

    const Register Reg = MI->getOperand(0).getReg();

    if (TRI->isVGPR(*MRI, Reg)) {
      const TargetRegisterClass *regClass = Register::isVirtualRegister(Reg)
                                                ? MRI->getRegClass(Reg)
                                                : TRI->getPhysRegClass(Reg);

      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // And make it implicitly depend on exec (like all VALU movs should do).
      MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    } else {
      MI->setDesc(TII->get(AMDGPU::COPY));
    }
  }
}
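
// For example, a "%dst = WQM %src" pseudo over a VGPR class becomes
// "%dst = V_MOV_B32_e32 %src, implicit $exec", while an SGPR-class pseudo
// becomes a plain COPY; by this point the EXEC transitions have already been
// placed, so the pseudos carry no extra information.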
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  CallingConv = MF.getFunction().getCallingConv();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  char GlobalFlags = analyzeFunction(MF);
  unsigned LiveMaskReg = 0;
  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(Exec);
    if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty())
      return !LiveMaskQueries.empty();
  } else {
    // Store a copy of the original live mask when required
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(Exec);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    lowerLiveMaskQueries(LiveMaskReg);

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ?
              AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
              Exec)
          .addReg(Exec);

      lowerCopyInstrs();
      // EntryMI may become invalid here
      return true;
    }
  }

  LLVM_DEBUG(printInfo());

  lowerCopyInstrs();

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

  return true;
}