//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
//
//===----------------------------------------------------------------------===//
20 #include "AMDGPUSubtarget.h"
21 #include "SIDefines.h"
22 #include "SIInstrInfo.h"
23 #include "SIMachineFunctionInfo.h"
24 #include "SIRegisterInfo.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm/ADT/DenseMap.h"
27 #include "llvm/ADT/DenseSet.h"
28 #include "llvm/ADT/PostOrderIterator.h"
29 #include "llvm/ADT/STLExtras.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/CodeGen/MachineBasicBlock.h"
32 #include "llvm/CodeGen/MachineFunction.h"
33 #include "llvm/CodeGen/MachineFunctionPass.h"
34 #include "llvm/CodeGen/MachineInstr.h"
35 #include "llvm/CodeGen/MachineInstrBuilder.h"
36 #include "llvm/CodeGen/MachineLoopInfo.h"
37 #include "llvm/CodeGen/MachineMemOperand.h"
38 #include "llvm/CodeGen/MachineOperand.h"
39 #include "llvm/CodeGen/MachineRegisterInfo.h"
40 #include "llvm/IR/DebugLoc.h"
41 #include "llvm/Pass.h"
42 #include "llvm/Support/Debug.h"
43 #include "llvm/Support/DebugCounter.h"
44 #include "llvm/Support/ErrorHandling.h"
45 #include "llvm/Support/raw_ostream.h"
using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");
static cl::opt<unsigned> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(0), cl::Hidden);

namespace {
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
// an s_waitcnt instruction needs to be emitted.
#define CNT_MASK(t) (1u << (t))
enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
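
// Illustrative sketch (added for clarity; not part of the original source):
// the pass accumulates "wait needed" state as a bitmask of the counter kinds
// above, e.g. requiring both a vmcnt and an expcnt wait for one instruction:
//   unsigned Need = CNT_MASK(VM_CNT) | CNT_MASK(EXP_CNT); // == 0b101
//   bool NeedsVm = (Need & CNT_MASK(VM_CNT)) != 0;        // true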
using RegInterval = std::pair<signed, signed>;
enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPRs start.
};
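
// Worked example (illustrative only; added for clarity): with the mapping
// above the scoreboard is one flat index space, so
//   v7  -> slot 7                                  (a real VGPR)
//   LDS -> slot SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS  = 256 (the DS slot)
//   s3  -> slot NUM_ALL_VGPRS + 3                  = 260 (SGPRs start at 257)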
#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = (enum WaitEventType)0;                           \
       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
       (w) = (enum WaitEventType)((w) + 1))
// This is a per-basic-block object that maintains current score brackets
// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of events happen in the bracket,
// the wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
class BlockWaitcntBrackets {
public:
  BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
  }

  ~BlockWaitcntBrackets() = default;
  static int32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  }
  void setScoreLB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  }
  void setScoreUB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
      if (ScoreLBs[T] < UB)
        ScoreLBs[T] = UB;
    }
  }
  int32_t getScoreLB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }
  int32_t getScoreUB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }
  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    switch (E) {
    case VMEM_ACCESS:
      return VM_CNT;
    case LDS_ACCESS:
    case GDS_ACCESS:
    case SQ_MESSAGE:
    case SMEM_ACCESS:
      return LGKM_CNT;
    case EXP_GPR_LOCK:
    case GDS_GPR_LOCK:
    case VMW_GPR_LOCK:
    case EXP_POS_ACCESS:
    case EXP_PARAM_ACCESS:
      return EXP_CNT;
    default:
      break;
    }
    llvm_unreachable("unhandled event type");
    return NUM_INST_CNTS;
  }
  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }
  int32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }
  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    memset(EventUBs, 0, sizeof(EventUBs));
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
    memset(SgprScores, 0, sizeof(SgprScores));
  }
  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, int32_t Val);
  void setWaitAtBeginning() { WaitAtBeginning = true; }
  void clearWaitAtBeginning() { WaitAtBeginning = false; }
  bool getWaitAtBeginning() const { return WaitAtBeginning; }
  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  int32_t getEventUB(enum WaitEventType W) const {
    assert(W < NUM_WAIT_EVENTS);
    return EventUBs[W];
  }
  bool counterOutOfOrder(InstCounterType T);
  unsigned int updateByWait(InstCounterType T, int ScoreToWait);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);
  bool hasPendingSMEM() const {
    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
  }
  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }
  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
  void clearWaitcnt() { Waitcnt = nullptr; }
  MachineInstr *getWaitcnt() const { return Waitcnt; }

  bool mixedExpTypes() const { return MixedExpTypes; }
  void setMixedExpTypes(bool MixedExpTypesIn) {
    MixedExpTypes = MixedExpTypesIn;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }
private:
  const GCNSubtarget *ST = nullptr;
  bool WaitAtBeginning = false;
  bool RevisitLoop = false;
  bool MixedExpTypes = false;
  int32_t PostOrder = 0;
  MachineInstr *Waitcnt = nullptr;
  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
  // Remember the last flat memory operation.
  int32_t LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};
// This is a per-loop-region object that records waitcnt status at the end of
// the loop footer from the previous iteration. We also maintain an iteration
// count to track the number of times the loop has been visited. When it
// doesn't converge naturally, we force convergence by inserting an s_waitcnt 0
// at the end of the loop footer.
class LoopWaitcntData {
public:
  LoopWaitcntData() = default;
  ~LoopWaitcntData() = default;

  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  unsigned getIterCnt() { return IterCnt; }

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }

private:
  // The s_waitcnt added at the end of the loop footer to stabilize wait
  // scores at the end of the loop footer.
  MachineInstr *LfWaitcnt = nullptr;
  // Number of iterations the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt = 0;
};
class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const MachineLoopInfo *MLI = nullptr;
  AMDGPU::IsaVersion IV;

  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  DenseSet<MachineInstr *> TrackedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;

  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;

  // ForceEmitZeroWaitcnts: force all waitcnt insts to be s_waitcnt 0
  // because of the amdgpu-waitcnt-forcezero flag.
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
    // The waitcnt information is copied because it changes as the block is
    // traversed.
    KillWaitBrackets.push_back(
        llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
  }
  bool isForceEmitWaitcnt() const {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1))
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }
  void setForceEmitWaitcnt() {
    // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
    // for debug builds, get the debug counter info and adjust if need be.
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
  }
  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  void generateWaitcntInstBefore(MachineInstr &MI,
                                 BlockWaitcntBrackets *ScoreBrackets);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
  unsigned countNumBottomBlocks(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
  bool isWaitcntStronger(unsigned LHS, unsigned RHS);
  unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
};

} // end anonymous namespace
RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a PW operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP registers.
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}
void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                       const SIInstrInfo *TII,
                                       const SIRegisterInfo *TRI,
                                       const MachineRegisterInfo *MRI,
                                       unsigned OpNo, int32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  LLVM_DEBUG({
    const MachineOperand &Opnd = MI->getOperand(OpNo);
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
  });
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}
void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // EventUB and ScoreUB need to be updated regardless of whether this event
  // changes the score of a register or not.
  // Examples include vm_cnt for a buffer-store or lgkm_cnt for a send-message.
  EventUBs[E] = CurrScore;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
    // is required.
    if (!MixedExpTypes) {
      MixedExpTypes = counterOutOfOrder(EXP_CNT);
    }

    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo; // TODO: find the OpNo for this operand.
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}
void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    int LB = getScoreLB(T);
    int UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        int RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        int RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          int RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}
unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
                                                int ScoreToWait) {
  unsigned int NeedWait = 0;
  if (ScoreToWait == -1) {
    // The score to wait is unknown. This implies that it was not encountered
    // during the path of the CFG walk done during the current traversal but
    // may be seen on a different path. Emit an s_waitcnt with a
    // conservative value of 0 for the counter.
    NeedWait = CNT_MASK(T);
    setScoreLB(T, getScoreUB(T));
  } else {
    // If the score of src_operand falls within the bracket, we need an
    // s_waitcnt instruction.
    const int32_t LB = getScoreLB(T);
    const int32_t UB = getScoreUB(T);
    if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
      if ((T == VM_CNT || T == LGKM_CNT) &&
          hasPendingFlat() &&
          !ST->hasFlatLgkmVMemCountInOrder()) {
        // If there is a pending FLAT operation, and this is a VMem or LGKM
        // waitcnt and the target can report early completion, then we need
        // to force a waitcnt 0.
        NeedWait = CNT_MASK(T);
        setScoreLB(T, getScoreUB(T));
      } else if (counterOutOfOrder(T)) {
        // The counter can get decremented out of order when there are
        // multiple types of events in the bracket. Also emit an s_waitcnt
        // with a conservative value of 0 for the counter.
        NeedWait = CNT_MASK(T);
        setScoreLB(T, getScoreUB(T));
      } else {
        NeedWait = CNT_MASK(T);
        setScoreLB(T, ScoreToWait);
      }
    }
  }

  return NeedWait;
}
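
// Worked example (illustrative only; not from the original source): suppose
// getScoreLB(VM_CNT) == 4 and getScoreUB(VM_CNT) == 9. A source register
// whose VM_CNT score is 7 lies inside the bracket (4 < 7 <= 9), so
// updateByWait(VM_CNT, 7) reports a needed wait via CNT_MASK(VM_CNT) and
// raises the lower bound to 7; a score of 3 is already known to have
// completed and produces no wait.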
// Where there are multiple types of events in the bracket of a counter,
// the decrement may go out of order.
bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
  switch (T) {
  case LGKM_CNT: {
    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      // Scalar memory reads can always go out of order.
      return true;
    }
    int NumEventTypes = 0;
    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  case EXP_CNT: {
    // If there has been a mixture of export types, then a waitcnt exp(0) is
    // required.
    if (MixedExpTypes)
      return true;
    int NumEventTypes = 0;
    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  default:
    break;
  }
  return true;
}
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}
static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}
/// Given wait count encodings, check whether LHS is stronger than RHS.
bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
  if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
    return false;
  if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
    return false;
  if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
    return false;
  return true;
}
/// Given wait count encodings, create a new encoding which is stronger
/// than or equal to both.
unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
  unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
                            AMDGPU::decodeVmcnt(IV, RHS));
  unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
                              AMDGPU::decodeLgkmcnt(IV, RHS));
  unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
                             AMDGPU::decodeExpcnt(IV, RHS));
  return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
}
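
// Illustrative note (added for clarity; not part of the original source): a
// smaller field value means "wait until fewer operations are outstanding",
// so the combined encoding takes the minimum of each field. For example,
// combining vmcnt(2) lgkmcnt(0) with vmcnt(0) lgkmcnt(3) yields
// vmcnt(0) lgkmcnt(0), which is at least as strong as either input.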
///  Generate s_waitcnt instruction to be placed before cur_Inst.
///  Instructions of a given type are returned in order,
///  but instructions of different types can complete out of order.
///  We rely on this in-order completion
///  and simply assign a score to the memory access instructions.
///  We keep track of the active "score bracket" to determine
///  if an access of a memory read requires an s_waitcnt
///  and if so what the value of each counter is.
///  The "score bracket" is bound by the lower bound and upper bound
///  scores (*_score_LB and *_score_ub respectively).
void SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
  // To emit, or not to emit - that's the question!
  // Start with an assumption that there is no need to emit.
  unsigned int EmitWaitcnt = 0;

  // No need to wait before a phi. If a phi-move exists, then the wait should
  // have been inserted before the move. If a phi-move does not exist, then
  // the wait should be inserted before the real use. The same is true for
  // sc-merge. It is not a coincidence that all these cases correspond to the
  // instructions that are skipped in the assembling loop.
  bool NeedLineMapping = false; // TODO: Check on this.

  // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug.
  bool ForceEmitZeroWaitcnt = false;

  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  if (MI.isDebugInstr() &&
      // TODO: any other opcode?
      !NeedLineMapping) {
    return;
  }

  // See if an s_waitcnt is forced at block entry, or is needed at
  // program end.
  if (ScoreBrackets->getWaitAtBeginning()) {
    // Note that we have already cleared the state, so we don't need to update
    // it.
    ScoreBrackets->clearWaitAtBeginning();
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      EmitWaitcnt |= CNT_MASK(T);
      ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    }
  }

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    EmitWaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
  }
  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //       with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
        EmitWaitcnt |= CNT_MASK(T);
      }
    }
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
      ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      EmitWaitcnt |= CNT_MASK(VM_CNT);
    }
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
              context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        default:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                               ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;
      }
    }
  }
#endif
1010 // Export & GDS instructions do not read the EXEC mask until after the export
1011 // is granted (which can occur well after the instruction is issued).
1012 // The shader program must flush all EXP operations on the export-count
1013 // before overwriting the EXEC mask.
1015 if (MI
.modifiesRegister(AMDGPU::EXEC
, TRI
)) {
1016 // Export and GDS are tracked individually, either may trigger a waitcnt
1018 EmitWaitcnt
|= ScoreBrackets
->updateByWait(
1019 EXP_CNT
, ScoreBrackets
->getEventUB(EXP_GPR_LOCK
));
1020 EmitWaitcnt
|= ScoreBrackets
->updateByWait(
1021 EXP_CNT
, ScoreBrackets
->getEventUB(EXP_PARAM_ACCESS
));
1022 EmitWaitcnt
|= ScoreBrackets
->updateByWait(
1023 EXP_CNT
, ScoreBrackets
->getEventUB(EXP_POS_ACCESS
));
1024 EmitWaitcnt
|= ScoreBrackets
->updateByWait(
1025 EXP_CNT
, ScoreBrackets
->getEventUB(GDS_GPR_LOCK
));
#if 0 // TODO: the following code to handle CALL.
  // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
  // However, there is a problem with EXP_CNT, because the call cannot
  // easily tell if a register is used in the function, and if it did, then
  // the referring instruction would have to have an S_WAITCNT, which is
  // dependent on all call sites. So instead, force S_WAITCNT for EXP_CNTs
  // before the call.
  if (MI.getOpcode() == SC_CALL) {
    if (ScoreBrackets->getScoreUB(EXP_CNT) >
        ScoreBrackets->getScoreLB(EXP_CNT)) {
      ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
      EmitWaitcnt |= CNT_MASK(EXP_CNT);
    }
  }
#endif
  // FIXME: Should not be relying on memoperands.
  // Look at the source operands of every instruction to see if
  // any of them results from a previous memory operation that affects
  // its current usage. If so, an s_waitcnt instruction needs to be
  // emitted.
  // If the source operand was defined by a load, add the s_waitcnt
  // instruction.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS != AMDGPUAS::LOCAL_ADDRESS)
      continue;
    unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
    // VM_CNT is only relevant to vgpr or LDS.
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
  }

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Op = MI.getOperand(I);
    const MachineRegisterInfo &MRIA = *MRI;
    RegInterval Interval =
        ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
      if (TRI->isVGPR(MRIA, Op.getReg())) {
        // VM_CNT is only relevant to vgpr or LDS.
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
      }
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
    }
  }
  // End of for loop that looks at all source operands to decide vm_wait_cnt
  // and lgk_wait_cnt.
  // Two cases are handled for destination operands:
  // 1) If the destination operand was defined by a load, add the s_waitcnt
  //    instruction to guarantee the right WAW order.
  // 2) If a destination operand was used by a recent export/store instruction,
  //    add an s_waitcnt on exp_cnt to guarantee the WAR order.
  if (MI.mayStore()) {
    // FIXME: Should not be relying on memoperands.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUAS::LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
    }
  }
  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    MachineOperand &Def = MI.getOperand(I);
    const MachineRegisterInfo &MRIA = *MRI;
    RegInterval Interval =
        ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
    for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
      if (TRI->isVGPR(MRIA, Def.getReg())) {
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
      }
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
    }
  } // End of for loop that looks at all dest operands.
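
  // Worked example of the hazards handled above (illustrative only):
  //   buffer_load_dword v0, ...   ; v0 is given a VM_CNT score
  //   v_add_f32 v1, v0, v2        ; RAW on v0 -> source-operand loop above
  //   buffer_load_dword v0, ...   ; WAW on v0 -> destination-operand loop
  // The per-register scores decide how small the emitted vmcnt value may be.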
  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    EmitWaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
  }
  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingSMEM()) {
      // Wait on everything, not just LGKM. vccz reads usually come from
      // terminators, and we always wait on everything at the end of the
      // block, so if we only wait on LGKM here, we might end up with
      // another s_waitcnt inserted right after this if there are non-LGKM
      // instructions still outstanding.
      // FIXME: this is too conservative / the comment is wrong.
      // We don't wait on everything at the end of the block and we combine
      // waitcnts so we should never have back-to-back waitcnts.
      ForceEmitZeroWaitcnt = true;
      EmitWaitcnt = true;
    }
  }
  // Does this operand processing indicate an s_wait counter update?
  if (EmitWaitcnt || IsForceEmitWaitcnt) {
    int CntVal[NUM_INST_CNTS];

    bool UseDefaultWaitcntStrategy = true;
    if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
      // Force all waitcnts to 0.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
      }
      CntVal[VM_CNT] = 0;
      CntVal[EXP_CNT] = 0;
      CntVal[LGKM_CNT] = 0;
      UseDefaultWaitcntStrategy = false;
    }

    if (UseDefaultWaitcntStrategy) {
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (EmitWaitcnt & CNT_MASK(T)) {
          int Delta =
              ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
          int MaxDelta = ScoreBrackets->getWaitCountMax(T);
          if (Delta >= MaxDelta) {
            Delta = -1;
            if (T != EXP_CNT) {
              ScoreBrackets->setScoreLB(
                  T, ScoreBrackets->getScoreUB(T) - MaxDelta);
            }
            EmitWaitcnt &= ~CNT_MASK(T);
          }
          CntVal[T] = Delta;
        } else {
          // If we are not waiting for a particular counter then encode
          // it as -1 which means "don't care."
          CntVal[T] = -1;
        }
      }
    }
    // If we are not waiting on any counter we can skip the wait altogether.
    if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
      MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
      int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
      if (!OldWaitcnt ||
          (AMDGPU::decodeVmcnt(IV, Imm) !=
           (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
          (AMDGPU::decodeExpcnt(IV, Imm) !=
           (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
          (AMDGPU::decodeLgkmcnt(IV, Imm) !=
           (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
        MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
        if (ContainingLoop) {
          MachineBasicBlock *TBB = ContainingLoop->getHeader();
          BlockWaitcntBrackets *ScoreBracket =
              BlockWaitcntBracketsMap[TBB].get();
          if (!ScoreBracket) {
            assert(!BlockVisitedSet.count(TBB));
            BlockWaitcntBracketsMap[TBB] =
                llvm::make_unique<BlockWaitcntBrackets>(ST);
            ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
          }
          ScoreBracket->setRevisitLoop(true);
          LLVM_DEBUG(dbgs() << "set-revisit2: Block"
                            << ContainingLoop->getHeader()->getNumber()
                            << '\n';);
        }
      }
      // Update an existing waitcount, or make a new one.
      unsigned Enc = AMDGPU::encodeWaitcnt(IV,
                         ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
                         ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
                         ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
      // We don't remove waitcnts that existed prior to the waitcnt
      // pass. Check if the waitcnt to-be-inserted can be avoided
      // or if the prev waitcnt can be updated.
      bool insertSWaitInst = true;
      for (MachineBasicBlock::iterator I = MI.getIterator(),
                                       B = MI.getParent()->begin();
           insertSWaitInst && I != B; --I) {
        if (I == MI.getIterator())
          continue;

        switch (I->getOpcode()) {
        case AMDGPU::S_WAITCNT:
          if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
            insertSWaitInst = false;
          else if (!OldWaitcnt) {
            Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
            OldWaitcnt = &*I;
          }
          break;
        // TODO: skip over instructions which never require wait.
        }
        break;
      }
      if (insertSWaitInst) {
        if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
          if (ForceEmitZeroWaitcnts)
            LLVM_DEBUG(
                dbgs()
                << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
          if (IsForceEmitWaitcnt)
            LLVM_DEBUG(dbgs()
                       << "Force emit a s_waitcnt due to debug counter\n");

          OldWaitcnt->getOperand(0).setImm(Enc);
          if (!OldWaitcnt->getParent())
            MI.getParent()->insert(MI, OldWaitcnt);

          LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
                            << "Old Instr: " << MI << '\n'
                            << "New Instr: " << *OldWaitcnt << '\n');
        } else {
          auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                                   MI.getDebugLoc(),
                                   TII->get(AMDGPU::S_WAITCNT))
                               .addImm(Enc);
          TrackedWaitcntSet.insert(SWaitInst);

          LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
                            << "Old Instr: " << MI << '\n'
                            << "New Instr: " << *SWaitInst << '\n');
        }
      }

      if (CntVal[EXP_CNT] == 0) {
        ScoreBrackets->setMixedExpTypes(false);
      }
    }
  }
}
void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
                                             MachineInstr *Waitcnt) {
  if (MBB.empty()) {
    MBB.push_back(Waitcnt);
    return;
  }

  MachineBasicBlock::iterator It = MBB.end();
  MachineInstr *MI = &*(--It);
  if (MI->isBranch()) {
    MBB.insert(It, Waitcnt);
  } else {
    MBB.push_back(Waitcnt);
  }
}
// This is a flat memory operation. Check to see if it has memory
// tokens for both LDS and Memory, and if so mark it as a flat.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }

  return false;
}
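
// Illustrative note (added for clarity; not from the original source): a flat
// access such as
//   flat_store_dword v[0:1], v2
// may carry memory operands in the FLAT or LOCAL address space. If it does,
// it is treated as potentially hitting LDS, and the pending-flat state later
// forces vmcnt/lgkmcnt waits to zero (see updateByWait) on subtargets that
// may retire flat operations out of order.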
void SIInsertWaitcnts::updateEventWaitcntAfter(
    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoad() || Inst.mayStore());

    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // This is a flat memory operation, so note it - it will require
      // that both the VM and LGKM be flushed to zero if it is pending when
      // a VM or LGKM dependency occurs.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP_DONE: {
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}
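
// Summary example (illustrative only; added for clarity): with the
// classification above, a DS instruction with the gds modifier scores
// GDS_ACCESS (plus GDS_GPR_LOCK), a plain DS instruction scores LDS_ACCESS,
// a buffer/image access scores VMEM_ACCESS (plus VMW_GPR_LOCK for stores on
// targets where VMEM writes count against expcnt), and SMRD / s_memtime
// score SMEM_ACCESS.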
// Merge the score brackets of the Block's predecessors;
// this merged score bracket is used when adding waitcnts to the Block.
void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
  int32_t MaxPending[NUM_INST_CNTS] = {0};
  int32_t MaxFlat[NUM_INST_CNTS] = {0};
  bool MixedExpTypes = false;

  // For single basic block loops, we need to retain the Block's
  // score bracket to have accurate Pred info. So, make a copy of Block's
  // score bracket, clear() it (which retains several important bits of info),
  // populate, and then replace en masse. For non-single basic block loops,
  // just clear Block's current score bracket and repopulate in-place.
  bool IsSelfPred;
  std::unique_ptr<BlockWaitcntBrackets> S;

  IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
    != Block.pred_end();
  if (IsSelfPred) {
    S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
    ScoreBrackets = S.get();
  }

  ScoreBrackets->clear();
  // See if there are any uninitialized predecessors. If so, emit an
  // s_waitcnt 0 at the beginning of the block.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.count(Pred);
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      continue;
    }
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int span =
          PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
      MaxPending[T] = std::max(MaxPending[T], span);
      span =
          PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
      MaxFlat[T] = std::max(MaxFlat[T], span);
    }

    MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
  }
1428 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1429 // Also handle kills for exit block.
1430 if (Block
.succ_empty() && !KillWaitBrackets
.empty()) {
1431 for (unsigned int I
= 0; I
< KillWaitBrackets
.size(); I
++) {
1432 for (enum InstCounterType T
= VM_CNT
; T
< NUM_INST_CNTS
;
1433 T
= (enum InstCounterType
)(T
+ 1)) {
1434 int Span
= KillWaitBrackets
[I
]->getScoreUB(T
) -
1435 KillWaitBrackets
[I
]->getScoreLB(T
);
1436 MaxPending
[T
] = std::max(MaxPending
[T
], Span
);
1437 Span
= KillWaitBrackets
[I
]->pendingFlat(T
) -
1438 KillWaitBrackets
[I
]->getScoreLB(T
);
1439 MaxFlat
[T
] = std::max(MaxFlat
[T
], Span
);
1442 MixedExpTypes
|= KillWaitBrackets
[I
]->mixedExpTypes();
  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.count(Pred);
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      continue;
    }

    int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
  }
1463 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1464 if (Block
.succ_empty() && !KillWaitBrackets
.empty()) {
1465 for (unsigned int I
= 0; I
< KillWaitBrackets
.size(); I
++) {
1466 int GDSSpan
= KillWaitBrackets
[I
]->getEventUB(GDS_GPR_LOCK
) -
1467 KillWaitBrackets
[I
]->getScoreLB(EXP_CNT
);
1468 MaxPending
[EXP_CNT
] = std::max(MaxPending
[EXP_CNT
], GDSSpan
);
1469 int EXPSpan
= KillWaitBrackets
[I
]->getEventUB(EXP_GPR_LOCK
) -
1470 KillWaitBrackets
[I
]->getScoreLB(EXP_CNT
);
1471 MaxPending
[EXP_CNT
] = std::max(MaxPending
[EXP_CNT
], EXPSpan
);
  // LC does not (unlike us) add a waitcnt at the beginning. Leaving it as a
  // marker.
  // TODO: how does LC distinguish between function entry and main entry?
  // If this is the entry to a function, force a wait.
  MachineBasicBlock &Entry = Block.getParent()->front();
  if (Entry.getNumber() == Block.getNumber()) {
    ScoreBrackets->setWaitAtBeginning();
    return;
  }
  // Now set the current Block's brackets to the largest ending bracket.
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    ScoreBrackets->setScoreUB(T, MaxPending[T]);
    ScoreBrackets->setScoreLB(T, 0);
    ScoreBrackets->setLastFlat(T, MaxFlat[T]);
  }

  ScoreBrackets->setMixedExpTypes(MixedExpTypes);
  // Set the register scoreboard.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (!BlockVisitedSet.count(Pred)) {
      continue;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    // Now merge the gpr_reg_score information.
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int PredLB = PredScoreBrackets->getScoreLB(T);
      int PredUB = PredScoreBrackets->getScoreUB(T);
      if (PredLB < PredUB) {
        int PredScale = MaxPending[T] - PredUB;
        // Merge vgpr scores.
        for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
          int PredRegScore = PredScoreBrackets->getRegScore(J, T);
          if (PredRegScore <= PredLB)
            continue;
          int NewRegScore = PredScale + PredRegScore;
          ScoreBrackets->setRegScore(
              J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
        }
        // Also need to merge sgpr scores for lgkm_cnt.
        if (T == LGKM_CNT) {
          for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
            int PredRegScore =
                PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
            if (PredRegScore <= PredLB)
              continue;
            int NewRegScore = PredScale + PredRegScore;
            ScoreBrackets->setRegScore(
                J + NUM_ALL_VGPRS, LGKM_CNT,
                std::max(
                    ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
                    NewRegScore));
          }
        }
      }
    }
    // Also merge the WaitEvent information.
    ForAllWaitEventType(W) {
      enum InstCounterType T = PredScoreBrackets->eventCounter(W);
      int PredEventUB = PredScoreBrackets->getEventUB(W);
      if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
        int NewEventUB =
            MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
        if (NewEventUB > 0) {
          ScoreBrackets->setEventUB(
              W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
        }
      }
    }
  }
1554 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1555 // Set the register scoreboard.
1556 if (Block
.succ_empty() && !KillWaitBrackets
.empty()) {
1557 for (unsigned int I
= 0; I
< KillWaitBrackets
.size(); I
++) {
1558 // Now merge the gpr_reg_score information.
1559 for (enum InstCounterType T
= VM_CNT
; T
< NUM_INST_CNTS
;
1560 T
= (enum InstCounterType
)(T
+ 1)) {
1561 int PredLB
= KillWaitBrackets
[I
]->getScoreLB(T
);
1562 int PredUB
= KillWaitBrackets
[I
]->getScoreUB(T
);
1563 if (PredLB
< PredUB
) {
1564 int PredScale
= MaxPending
[T
] - PredUB
;
1565 // Merge vgpr scores.
1566 for (int J
= 0; J
<= KillWaitBrackets
[I
]->getMaxVGPR(); J
++) {
1567 int PredRegScore
= KillWaitBrackets
[I
]->getRegScore(J
, T
);
1568 if (PredRegScore
<= PredLB
)
1570 int NewRegScore
= PredScale
+ PredRegScore
;
1571 ScoreBrackets
->setRegScore(
1572 J
, T
, std::max(ScoreBrackets
->getRegScore(J
, T
), NewRegScore
));
1574 // Also need to merge sgpr scores for lgkm_cnt.
1575 if (T
== LGKM_CNT
) {
1576 for (int J
= 0; J
<= KillWaitBrackets
[I
]->getMaxSGPR(); J
++) {
1578 KillWaitBrackets
[I
]->getRegScore(J
+ NUM_ALL_VGPRS
, LGKM_CNT
);
1579 if (PredRegScore
<= PredLB
)
1581 int NewRegScore
= PredScale
+ PredRegScore
;
1582 ScoreBrackets
->setRegScore(
1583 J
+ NUM_ALL_VGPRS
, LGKM_CNT
,
1585 ScoreBrackets
->getRegScore(J
+ NUM_ALL_VGPRS
, LGKM_CNT
),
1592 // Also merge the WaitEvent information.
1593 ForAllWaitEventType(W
) {
1594 enum InstCounterType T
= KillWaitBrackets
[I
]->eventCounter(W
);
1595 int PredEventUB
= KillWaitBrackets
[I
]->getEventUB(W
);
1596 if (PredEventUB
> KillWaitBrackets
[I
]->getScoreLB(T
)) {
1598 MaxPending
[T
] + PredEventUB
- KillWaitBrackets
[I
]->getScoreUB(T
);
1599 if (NewEventUB
> 0) {
1600 ScoreBrackets
->setEventUB(
1601 W
, std::max(ScoreBrackets
->getEventUB(W
), NewEventUB
));
  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
  // sequencing predecessors, because changes to EXEC require waitcnts due to
  // the delayed nature of these operations.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (!BlockVisitedSet.count(Pred)) {
      continue;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
    if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
                       PredScoreBrackets->getScoreUB(EXP_CNT);
      if (new_gds_ub > 0) {
        ScoreBrackets->setEventUB(
            GDS_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
      }
    }
    int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
    if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
                       PredScoreBrackets->getScoreUB(EXP_CNT);
      if (new_exp_ub > 0) {
        ScoreBrackets->setEventUB(
            EXP_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
      }
    }
  }
  // If this is a single-block loop, update the score brackets. Not needed for
  // other blocks, as we did this in-place.
  if (IsSelfPred) {
    BlockWaitcntBracketsMap[&Block] =
        llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
  }
}
/// Return true if the given basic block is a "bottom" block of a loop.
/// This works even if the loop is discontiguous. This also handles
/// multiple back-edges for the same "header" block of a loop.
bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
                                    const MachineBasicBlock *Block) {
  for (MachineBasicBlock *MBB : Loop->blocks()) {
    if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
      return true;
    }
  }
  return false;
}
1661 /// Count the number of "bottom" basic blocks of a loop.
1662 unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop
*Loop
) {
1664 for (MachineBasicBlock
*MBB
: Loop
->blocks()) {
1665 if (MBB
->isSuccessor(Loop
->getHeader())) {
// Generate s_waitcnt instructions where needed.
void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block) {
  // Initialize the state information.
  mergeInputScoreBrackets(Block);

  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets->dump();
  });
  // Walk over the instructions.
  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;
    // Remove any previously existing waitcnts.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
      // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
      // Remove the waitcnt-pass-generated waitcnts; the pass will add them
      // back as needed.
      if (!TrackedWaitcntSet.count(&Inst))
        ++Iter;
      else {
        ++Iter;
        Inst.removeFromParent();
      }
      ScoreBrackets->setWaitcnt(&Inst);
      continue;
    }
    // Kill instructions generate a conditional branch to the endmain block.
    // Merge the current waitcnt state into the endmain block information.
    // TODO: Are there other flavors of KILL instruction?
    if (Inst.getOpcode() == AMDGPU::KILL) {
      addKillWaitBracket(ScoreBrackets);
    }

    bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) &&
        (!VCCZBugHandledSet.count(&Inst))) {
      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
              ScoreBrackets->getScoreUB(LGKM_CNT) &&
          ScoreBrackets->hasPendingSMEM()) {
        if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
          VCCZBugWorkAround = true;
      }
    }

    // Generate an s_waitcnt instruction to be placed before
    // cur_Inst, if needed.
    generateWaitcntInstBefore(Inst, ScoreBrackets);

    updateEventWaitcntAfter(Inst, ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates a S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0)
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score as if an S_WAITCNT vmcnt(0) had been emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    ScoreBrackets->clearWaitcnt();

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets->dump();
    });

    // Check to see if this is a GWS instruction. If so, and if this is CI or
    // VI, then the generated code sequence will include an S_WAITCNT 0.
    // TODO: Are these the only GWS instructions?
    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
      // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
      ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
      ScoreBrackets->updateByWait(LGKM_CNT,
                                  ScoreBrackets->getScoreUB(LGKM_CNT));
    }

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (VCCZBugWorkAround) {
      // Restore the vccz bit. Any time a value is written to vcc, the vccz
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
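      // The emitted restore is effectively:
      //   s_mov_b64 vcc, vcc
      // a no-op copy that forces the hardware to recompute vccz.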
      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
              AMDGPU::VCC)
          .addReg(AMDGPU::VCC);
      VCCZBugHandledSet.insert(&Inst);
    }

    ++Iter;
  }

  // Check if we need to force convergence at loop footer.
  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    WaitcntData->print();
    LLVM_DEBUG(dbgs() << '\n';);

    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    // placement, but doesn't guarantee convergence for a loop. Each
    // loop should take at most (n+1) iterations for it to converge naturally,
    // where n is the number of bottom blocks. If this threshold is reached and
    // the result hasn't converged, then we force convergence by inserting
    // an s_waitcnt at the end of the loop footer.
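    // For example, a loop with two back-edges (two "bottom" blocks) is given
    // at most three natural iterations before a forced wait is inserted here.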
    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
      // To ensure convergence, the wait events at the loop footer must be no
      // more than those from the previous iteration.
      // As a simplification, instead of tracking individual scores and
      // generating the precise wait count, just wait on 0.
      bool HasPending = false;
      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
          HasPending = true;
          break;
        }
      }

      if (HasPending) {
        if (!SWaitInst) {
          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                          .addImm(0);
          TrackedWaitcntSet.insert(SWaitInst);
#if 0 // TODO: Format the debug output
          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
          OutputTransformAdd(SWaitInst, context);
#endif
        }
#if 0 // TODO: ??
        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
#endif
      }

      if (SWaitInst) {
        LLVM_DEBUG({
          SWaitInst->print(dbgs());
          dbgs() << "\nAdjusted score board:";
          ScoreBrackets->dump();
        });

        // Add this waitcnt to the block. It is either newly created or
        // created in previous iterations and added back since block traversal
        // always removes waitcnts.
        insertWaitcntBeforeCF(Block, SWaitInst);
        WaitcntData->setWaitcnt(SWaitInst);
      }
    }
  }
}

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  IV = AMDGPU::getIsaVersion(ST->getCPU());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1))
    ForceEmitWaitcnt[T] = false;
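
  // The get*cntBitMask helpers return the largest value each S_WAITCNT counter
  // field can encode for this ISA version; these become the hardware limits
  // the pass respects when computing wait counts.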
  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);

  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
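
  // VGPR0..VGPRL and SGPR0..SGPRL are the inclusive hardware encoding ranges
  // of the addressable VGPRs and SGPRs; the pass uses them to map register
  // operands onto its scoring tables.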
  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL =
      RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL =
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;

  TrackedWaitcntSet.clear();
  BlockVisitedSet.clear();
  VCCZBugHandledSet.clear();
  LoopWaitcntDataMap.clear();
  BlockWaitcntProcessedSet.clear();

  // Walk over the blocks in reverse post-order, inserting
  // s_waitcnt where needed.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  bool Modified = false;
  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
           I = RPOT.begin(),
           E = RPOT.end(), J = RPOT.begin();
       I != E;) {
    MachineBasicBlock &MBB = **I;

    BlockVisitedSet.insert(&MBB);

    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    if (!ScoreBrackets) {
      BlockWaitcntBracketsMap[&MBB] =
          llvm::make_unique<BlockWaitcntBrackets>(ST);
      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    }
    ScoreBrackets->setPostOrder(MBB.getNumber());
    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
      LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();

    // If we are walking into the block from before the loop, then guarantee
    // at least 1 re-walk over the loop to propagate the information, even if
    // no S_WAITCNT instructions were generated.
    if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
      unsigned Count = countNumBottomBlocks(ContainingLoop);

      // If the loop has multiple back-edges, and so more than one "bottom"
      // basic block, we have to guarantee a re-walk over every block.
      if ((std::count(BlockWaitcntProcessedSet.begin(),
                      BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
        BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
                          << ContainingLoop->getHeader()->getNumber() << '\n';);
      }
    }
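
    // Note: counting occurrences of the header in BlockWaitcntProcessedSet
    // above (rather than using a single visited flag) lets each additional
    // back-edge request one more pass over the loop body.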

    // Walk over the instructions.
    insertWaitcntInBlock(MF, MBB);

    // Record that waitcnts have been processed at least once for this block.
    BlockWaitcntProcessedSet.push_back(&MBB);

    // See if we want to revisit the loop. If a loop has multiple back-edges,
    // we shouldn't revisit the same "bottom" basic block.
    if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
        std::count(BlockWaitcntProcessedSet.begin(),
                   BlockWaitcntProcessedSet.end(), &MBB) == 1) {
      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
      if (EntrySB && EntrySB->getRevisitLoop()) {
        EntrySB->setRevisitLoop(false);

        int32_t PostOrder = EntrySB->getPostOrder();
        // TODO: Avoid this loop. Find another way to set I.
        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
                 X = RPOT.begin(),
                 Y = RPOT.end();
             X != Y; ++X) {
          MachineBasicBlock &MBBX = **X;
          if (MBBX.getNumber() == PostOrder) {
            I = X;
            break;
          }
        }

        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        WaitcntData->incIterCnt();
        LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
        continue;
      } else {
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        // Loop converged, reset the iteration count. If this loop gets
        // revisited, it must be from an outer loop; the counter will restart,
        // which ensures we don't force convergence on such revisits.
        WaitcntData->resetIterCnt();
      }
    }

    ++I;
  }

  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores, but an explicitly requested flush is probably very rare.
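    //
    // In the worst case this emits one s_dcache_wb directly before every
    // S_ENDPGM / SI_RETURN_TO_EPILOG that is not already preceded by a flush.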
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
           I != E; ++I) {
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))