//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
///
/// TODO: This pass currently keeps one timeline per hardware counter. A more
/// finely-grained approach that keeps one timeline per event type could
/// sometimes get away with generating weaker s_waitcnt instructions. For
/// example, when both SMEM and LDS are in flight and we need to wait for
/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
/// but the pass will currently generate a conservative lgkmcnt(0) because
/// multiple event types are in flight.
///
//===----------------------------------------------------------------------===//
27 #include "GCNSubtarget.h"
28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "Utils/AMDGPUBaseInfo.h"
31 #include "llvm/ADT/MapVector.h"
32 #include "llvm/ADT/PostOrderIterator.h"
33 #include "llvm/CodeGen/MachinePostDominators.h"
34 #include "llvm/InitializePasses.h"
35 #include "llvm/Support/DebugCounter.h"
36 #include "llvm/Support/TargetParser.h"
#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<bool> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(false), cl::Hidden);
namespace {

template <typename EnumT>
class enum_iterator
    : public iterator_facade_base<enum_iterator<EnumT>,
                                  std::forward_iterator_tag, const EnumT> {
  EnumT Value;

public:
  enum_iterator() = default;
  enum_iterator(EnumT Value) : Value(Value) {}

  enum_iterator &operator++() {
    Value = static_cast<EnumT>(Value + 1);
    return *this;
  }

  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }

  EnumT operator*() const { return Value; }
};
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
// s_waitcnt instruction needs to be emitted.

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };

iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
  return make_range(enum_iterator<InstCounterType>(VM_CNT),
                    enum_iterator<InstCounterType>(NUM_INST_CNTS));
}

using RegInterval = std::pair<int, int>;

struct {
  unsigned VmcntMax;
  unsigned ExpcntMax;
  unsigned LgkmcntMax;
  unsigned VscntMax;
} HardwareLimits;

struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;
enum WaitEventType {
  VMEM_ACCESS,       // vector-memory read & write
  VMEM_READ_ACCESS,  // vector-memory read
  VMEM_WRITE_ACCESS, // vector-memory write
  LDS_ACCESS,        // lds read & write
  GDS_ACCESS,        // gds read & write
  SQ_MESSAGE,        // send message
  SMEM_ACCESS,       // scalar-memory read & write
  EXP_GPR_LOCK,      // export holding on its data src
  GDS_GPR_LOCK,      // GDS holding on its data and addr src
  EXP_POS_ACCESS,    // write to export position
  EXP_PARAM_ACCESS,  // write to export parameter
  VMW_GPR_LOCK,      // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};
static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
    (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
    (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
        (1 << SQ_MESSAGE),
    (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
        (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
    (1 << VMEM_WRITE_ACCESS)};
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
  AGPR_OFFSET = 226,      // Maximum programmable ArchVGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
// their results in order -- so there is no need to insert an s_waitcnt between
// two instructions of the same type that write the same vgpr.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
};

VmemType getVmemType(const MachineInstr &Inst) {
  assert(SIInstrInfo::isVMEM(Inst));
  if (!SIInstrInfo::isMIMG(Inst))
    return VMEM_NOSAMPLER;
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
  return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler
             ? VMEM_SAMPLER
             : VMEM_NOSAMPLER;
}
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  switch (T) {
  case VM_CNT:
    Wait.VmCnt = std::min(Wait.VmCnt, Count);
    break;
  case EXP_CNT:
    Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
    break;
  case LGKM_CNT:
    Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
    break;
  case VS_CNT:
    Wait.VsCnt = std::min(Wait.VsCnt, Count);
    break;
  default:
    llvm_unreachable("bad InstCounterType");
  }
}
// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket,
// wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
class WaitcntBrackets {
public:
  WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {}

  static unsigned getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    case VS_CNT:
      return HardwareLimits.VscntMax;
    default:
      break;
    }
    return 0;
  }

  unsigned getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreLBs[T];
  }

  unsigned getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    if (WaitEventMaskForInst[VM_CNT] & (1 << E))
      return VM_CNT;
    if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
      return LGKM_CNT;
    if (WaitEventMaskForInst[VS_CNT] & (1 << E))
      return VS_CNT;
    assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
    return EXP_CNT;
  }

  unsigned getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    assert(T == LGKM_CNT);
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  bool merge(const WaitcntBrackets &Other);

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo) const;

  bool counterOutOfOrder(InstCounterType T) const;
  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void determineWait(InstCounterType T, unsigned ScoreToWait,
                     AMDGPU::Waitcnt &Wait) const;
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &Inst);

  bool hasPending() const { return PendingEvents != 0; }
  bool hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }

  bool hasMixedPendingEvents(InstCounterType T) const {
    unsigned Events = PendingEvents & WaitEventMaskForInst[T];
    // Return true if more than one bit is set in Events.
    return Events & (Events - 1);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  // Return true if there might be pending writes to the specified vgpr by VMEM
  // instructions with types different from V.
  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
    assert(GprNo < NUM_ALL_VGPRS);
    return VgprVmemTypes[GprNo] & ~(1 << V);
  }

  void clearVgprVmemTypes(int GprNo) {
    assert(GprNo < NUM_ALL_VGPRS);
    VgprVmemTypes[GprNo] = 0;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  struct MergeInfo {
    unsigned OldLB;
    unsigned OtherLB;
    unsigned MyShift;
    unsigned OtherShift;
  };
  static bool mergeScore(const MergeInfo &M, unsigned &Score,
                         unsigned OtherScore);

  void setScoreLB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
      if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
        ScoreLBs[T] = UB;
    }
  }

  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      VgprUB = std::max(VgprUB, GprNo);
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, unsigned Val);

  const GCNSubtarget *ST = nullptr;
  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
  unsigned PendingEvents = 0;
  // Remember the last flat memory operation.
  unsigned LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int VgprUB = -1;
  int SgprUB = -1;
  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
  // write to each vgpr.
  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
};
class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  AMDGPU::IsaVersion IV;

  DenseSet<MachineInstr *> TrackedWaitcntSet;
  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
  MachinePostDominatorTree *PDT;

  struct BlockInfo {
    MachineBasicBlock *MBB;
    std::unique_ptr<WaitcntBrackets> Incoming;
    bool Dirty = true;

    explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
  };

  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;

  // ForceEmitZeroWaitcnts: force all waitcnt insts to be s_waitcnt 0
  // because of amdgpu-waitcnt-forcezero flag
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool isForceEmitWaitcnt() const {
    for (auto T : inst_counter_types())
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// For debug builds, get the debug counter info and adjust if need be
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  bool generateWaitcntInstBefore(MachineInstr &MI,
                                 WaitcntBrackets &ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               WaitcntBrackets *ScoreBrackets);
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                            WaitcntBrackets &ScoreBrackets);
  bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                               MachineInstr &OldWaitcntInstr,
                               AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
};

} // end anonymous namespace
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                            const SIInstrInfo *TII,
                                            const MachineRegisterInfo *MRI,
                                            const SIRegisterInfo *TRI,
                                            unsigned OpNo) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!TRI->isInAllocatableClass(Op.getReg()))
    return {-1, -1};

  // A use via a PW operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;

  unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));

  if (TRI->isVectorRegister(*MRI, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    if (TRI->isAGPR(*MRI, Op.getReg()))
      Result.first += AGPR_OFFSET;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
  else
    return {-1, -1};

  const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + ((Size + 16) / 32);

  return Result;
}
void WaitcntBrackets::setExpScore(const MachineInstr *MI,
                                  const SIInstrInfo *TII,
                                  const SIRegisterInfo *TRI,
                                  const MachineRegisterInfo *MRI, unsigned OpNo,
                                  unsigned Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
  assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                    const SIRegisterInfo *TRI,
                                    const MachineRegisterInfo *MRI,
                                    WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = eventCounter(E);
  unsigned CurrScore = getScoreUB(T) + 1;
  if (CurrScore == 0)
    report_fatal_error("InsertWaitcnt score wraparound");
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not.
  // Examples include vm_cnt for buffer-store or lgkm_cnt for send-message.
  PendingEvents |= 1 << E;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      int AddrOpIdx =
          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
      // All GDS operations must protect their address register (same as
      // export.)
      if (AddrOpIdx != -1) {
        setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
      }

      if (Inst.mayStore()) {
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data0) != -1) {
          setExpScore(
              &Inst, TII, TRI, MRI,
              AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
              CurrScore);
        }
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (SIInstrInfo::isAtomicRet(Inst) &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() &&
              TRI->isVectorRegister(*MRI, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(*MRI, DefMO.getReg())) {
            setRegScore(
                TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
                EXP_CNT, CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() &&
            TRI->isVectorRegister(*MRI, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo; //TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
    for (int RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      auto &Op = Inst.getOperand(I);
      if (!Op.isReg() || !Op.isDef())
        continue;
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
      if (T == VM_CNT) {
        if (Interval.first >= NUM_ALL_VGPRS)
          continue;
        if (SIInstrInfo::isVMEM(Inst)) {
          VmemType V = getVmemType(Inst);
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
            VgprVmemTypes[RegNo] |= 1 << V;
        }
      }
      for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}
void WaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (auto T : inst_counter_types()) {
    unsigned LB = getScoreLB(T);
    unsigned UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << UB - LB << "): ";
      break;
    case VS_CNT:
      OS << "    VS_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= VgprUB; J++) {
        unsigned RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        unsigned RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= SgprUB; J++) {
          unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          unsigned RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  simplifyWaitcnt(VM_CNT, Wait.VmCnt);
  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
  simplifyWaitcnt(VS_CNT, Wait.VsCnt);
}
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                      unsigned &Count) const {
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);

  // The number of outstanding events for this type, T, can be calculated
  // as (UB - LB). If the current Count is greater than or equal to the number
  // of outstanding events, then the wait for this counter is redundant.
  if (Count >= UB - LB)
    Count = ~0u;
}
void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
                                    AMDGPU::Waitcnt &Wait) const {
  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) && hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      // Counter can get decremented out-of-order when there
      // are multiple types of event in the bracket. Also emit an s_wait counter
      // with a conservative value of 0 for the counter.
      addWait(Wait, T, 0);
    } else {
      // If a counter has been maxed out avoid overflow by waiting for
      // MAX(CounterType) - 1 instead.
      unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
      addWait(Wait, T, NeededWait);
    }
  }
}
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(VM_CNT, Wait.VmCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
  applyWaitcnt(VS_CNT, Wait.VsCnt);
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UB = getScoreUB(T);
  if (Count >= UB)
    return;
  if (Count != 0) {
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
  } else {
    setScoreLB(T, UB);
    PendingEvents &= ~WaitEventMaskForInst[T];
  }
}
// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory read always can go out of order.
  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
    return true;
  return hasMixedPendingEvents(T);
}
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}
/// Combine consecutive waitcnt instructions that precede \p MI and follow
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
/// by previous passes. Currently this pass conservatively assumes that these
/// preexisting waitcnt are required for correctness.
bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                                               MachineInstr &OldWaitcntInstr,
                                               AMDGPU::Waitcnt &Wait,
                                               const MachineInstr *MI) {
  bool Modified = false;
  MachineInstr *WaitcntInstr = nullptr;
  MachineInstr *WaitcntVsCntInstr = nullptr;
  for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
       &*II != MI; II = NextI, ++NextI) {
    if (II->isMetaInstruction())
      continue;

    if (II->getOpcode() == AMDGPU::S_WAITCNT) {
      // Conservatively update required wait if this waitcnt was added in an
      // earlier pass. In this case it will not exist in the tracked waitcnt
      // set.
      if (!TrackedWaitcntSet.count(&*II)) {
        unsigned IEnc = II->getOperand(0).getImm();
        AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
        Wait = Wait.combined(OldWait);
      }

      // Merge consecutive waitcnt of the same type by erasing multiples.
      if (!WaitcntInstr) {
        WaitcntInstr = &*II;
      } else {
        II->eraseFromParent();
        Modified = true;
      }
    } else {
      assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
      assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      if (!TrackedWaitcntSet.count(&*II)) {
        unsigned OldVSCnt =
            TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
        Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
      }

      if (!WaitcntVsCntInstr) {
        WaitcntVsCntInstr = &*II;
      } else {
        II->eraseFromParent();
        Modified = true;
      }
    }
  }

  // Update the encoding of the merged waitcnt with the required wait.
  if (WaitcntInstr) {
    if (Wait.hasWaitExceptVsCnt()) {
      unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
      unsigned OldEnc = WaitcntInstr->getOperand(0).getImm();
      if (OldEnc != NewEnc) {
        WaitcntInstr->getOperand(0).setImm(NewEnc);
        Modified = true;
      }
      ScoreBrackets.applyWaitcnt(Wait);
      Wait.VmCnt = ~0u;
      Wait.LgkmCnt = ~0u;
      Wait.ExpCnt = ~0u;

      LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
                        << "Old Instr: " << MI
                        << "New Instr: " << *WaitcntInstr << '\n');
    } else {
      WaitcntInstr->eraseFromParent();
      Modified = true;
    }
  }

  if (WaitcntVsCntInstr) {
    if (Wait.hasWaitVsCnt()) {
      assert(ST->hasVscnt());
      unsigned OldVSCnt =
          TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
              ->getImm();
      if (Wait.VsCnt != OldVSCnt) {
        TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
            ->setImm(Wait.VsCnt);
        Modified = true;
      }
      ScoreBrackets.applyWaitcnt(Wait);
      Wait.VsCnt = ~0u;

      LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
                        << "Old Instr: " << MI
                        << "New Instr: " << *WaitcntVsCntInstr << '\n');
    } else {
      WaitcntVsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  return Modified;
}
static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}

/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
  // Currently all conventions wait, but this may not always be the case.
  //
  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
  // sense to omit the wait and do it in the caller.
  return true;
}

/// \returns true if the callee is expected to wait for any outstanding waits
/// before returning.
static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
bool SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
    MachineInstr *OldWaitcntInstr) {
  setForceEmitWaitcnt();

  if (MI.isMetaInstruction())
    return false;

  AMDGPU::Waitcnt Wait;
  bool Modified = false;

  // FIXME: This should have already been handled by the memory legalizer.
  // Removing this currently doesn't affect any lit tests, but we need to
  // verify that nothing was relying on this. The number of buffer invalidates
  // being handled here should not be expanded.
  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
      MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
      MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
    Wait.VmCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //   with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
      (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
    Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    Wait.VmCnt = 0;
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
              context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
              ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        default:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
              ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;
      }
    }
  }
#endif
  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

    if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
      // The function is going to insert a wait on everything in its prolog.
      // This still needs to be careful if the call target is a load (e.g. a GOT
      // load). We also need to check WAW dependency with saved PC.
      Wait = AMDGPU::Waitcnt();

      int CallAddrOpIdx =
          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

      if (MI.getOperand(CallAddrOpIdx).isReg()) {
        RegInterval CallAddrOpInterval =
            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);

        for (int RegNo = CallAddrOpInterval.first;
             RegNo < CallAddrOpInterval.second; ++RegNo)
          ScoreBrackets.determineWait(
              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);

        int RtnAddrOpIdx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
        if (RtnAddrOpIdx != -1) {
          RegInterval RtnAddrOpInterval =
              ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);

          for (int RegNo = RtnAddrOpInterval.first;
               RegNo < RtnAddrOpInterval.second; ++RegNo)
            ScoreBrackets.determineWait(
                LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
        }
      }
    } else {
      // FIXME: Should not be relying on memoperands.
      // Look at the source operands of every instruction to see if
      // any of them results from a previous memory operation that affects
      // its current usage. If so, an s_waitcnt instruction needs to be
      // emitted.
      // If the source operand was defined by a load, add the s_waitcnt
      // instruction.
      //
      // Two cases are handled for destination operands:
      // 1) If the destination operand was defined by a load, add the s_waitcnt
      // instruction to guarantee the right WAW order.
      // 2) If a destination operand was used by a recent export/store
      // instruction, add s_waitcnt on exp_cnt to guarantee the WAR order.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        const Value *Ptr = Memop->getValue();
        if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
          addWait(Wait, LGKM_CNT, 0);
          if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
            SLoadAddresses.erase(Ptr);
        }
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        // VM_CNT is only relevant to vgpr or LDS.
        ScoreBrackets.determineWait(
            VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
        if (Memop->isStore()) {
          ScoreBrackets.determineWait(
              EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
        }
      }

      // Loop over use and def operands.
      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg())
          continue;
        RegInterval Interval =
            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);

        const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
        for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
          if (IsVGPR) {
            // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
            // previous write and this write are the same type of VMEM
            // instruction, in which case they're guaranteed to write their
            // results in order anyway.
            if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
                ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
                                                       getVmemType(MI))) {
              ScoreBrackets.determineWait(
                  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
              ScoreBrackets.clearVgprVmemTypes(RegNo);
            }
            if (Op.isDef()) {
              ScoreBrackets.determineWait(
                  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
            }
          }
          ScoreBrackets.determineWait(
              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
        }
      }
    }
  }

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
    if (ScoreBrackets.getScoreLB(LGKM_CNT) <
            ScoreBrackets.getScoreUB(LGKM_CNT) &&
        ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
      Wait.LgkmCnt = 0;
    }
  }

  // Verify that the wait is actually needed.
  ScoreBrackets.simplifyWaitcnt(Wait);

  if (ForceEmitZeroWaitcnts)
    Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());

  if (ForceEmitWaitcnt[VM_CNT])
    Wait.VmCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[LGKM_CNT])
    Wait.LgkmCnt = 0;
  if (ForceEmitWaitcnt[VS_CNT])
    Wait.VsCnt = 0;

  if (OldWaitcntInstr) {
    // Try to merge the required wait with preexisting waitcnt instructions.
    // Also erase redundant waitcnt.
    Modified =
        applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
  } else {
    // Update waitcnt brackets after determining the required wait.
    ScoreBrackets.applyWaitcnt(Wait);
  }

  // Build new waitcnt instructions unless no wait is needed or the old waitcnt
  // instruction was modified to handle the required wait.
  if (Wait.hasWaitExceptVsCnt()) {
    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
    auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                             MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                         .addImm(Enc);
    TrackedWaitcntSet.insert(SWaitInst);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
                      << "Old Instr: " << MI
                      << "New Instr: " << *SWaitInst << '\n');
  }

  if (Wait.hasWaitVsCnt()) {
    assert(ST->hasVscnt());

    auto SWaitInst =
        BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                TII->get(AMDGPU::S_WAITCNT_VSCNT))
            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
            .addImm(Wait.VsCnt);
    TrackedWaitcntSet.insert(SWaitInst);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
                      << "Old Instr: " << MI
                      << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
// This is a flat memory operation. Check to see if it has memory tokens other
// than LDS. Other address spaces supported by flat memory operations involve
// global memory.
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // All flat instructions use the VMEM counter.
  assert(TII->usesVM_CNT(MI));

  // If there are no memory operands then conservatively assume the flat
  // operation may access VMEM.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves VMEM.
  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
  // (GDS) address space is not supported by flat operations. Therefore, simply
  // return true unless only the LDS address space is found.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    assert(AS != AMDGPUAS::REGION_ADDRESS);
    if (AS != AMDGPUAS::LOCAL_ADDRESS)
      return true;
  }

  return false;
}
// This is a flat memory operation. Check to see if it has memory tokens for
// either LDS or FLAT.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
  if (!TII->usesLGKM_CNT(MI))
    return false;

  // If in tgsplit mode then there can be no use of LDS.
  if (ST->isTgSplitEnabled())
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access LDS.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves LDS.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }

  return false;
}
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoadOrStore());

    int FlatASCount = 0;

    if (mayAccessVMEMThroughFlat(Inst)) {
      ++FlatASCount;
      if (!ST->hasVscnt())
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
      else if (Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst))
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
    }

    if (mayAccessLDSThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }

    // A Flat memory operation must access at least one address space.
    assert(FlatASCount);

    // This is a flat memory operation that accesses both VMEM and LDS, so note
    // it - it will require that both the VM and LGKM be flushed to zero if it
    // is pending when a VM or LGKM dependency occurs.
    if (FlatASCount > 1)
      ScoreBrackets->setPendingFlat();
  } else if (SIInstrInfo::isVMEM(Inst) &&
             !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
    if (!ST->hasVscnt())
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    else if ((Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) ||
             /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
             (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
    else if (Inst.mayStore())
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);

    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
    if (callWaitsOnFunctionReturn(Inst)) {
      // Act as a wait on everything
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
    } else {
      // May need to wait for anything.
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
    }
  } else if (SIInstrInfo::isEXP(Inst)) {
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    }
  }
}
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
                                 unsigned OtherScore) {
  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
  unsigned OtherShifted =
      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
  Score = std::max(MyShifted, OtherShifted);
  return OtherShifted > MyShifted;
}
/// Merge the pending events and associated score brackets of \p Other into
/// this brackets status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  VgprUB = std::max(VgprUB, Other.VgprUB);
  SgprUB = std::max(SgprUB, Other.SgprUB);

  for (auto T : inst_counter_types()) {
    // Merge event flags for this counter
    const bool OldOutOfOrder = counterOutOfOrder(T);
    const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
    const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
    if (OtherEvents & ~OldEvents)
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
    if (NewUB < ScoreLBs[T])
      report_fatal_error("waitcnt score overflow");

    MergeInfo M;
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

    bool RegStrictDom = false;
    for (int J = 0; J <= VgprUB; J++) {
      RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
    }

    if (T == VM_CNT) {
      for (int J = 0; J <= VgprUB; J++) {
        unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
        RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
        VgprVmemTypes[J] = NewVmemTypes;
      }
    }

    if (T == LGKM_CNT) {
      for (int J = 0; J <= SgprUB; J++) {
        RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
      }
    }

    if (RegStrictDom && !OldOutOfOrder)
      StrictDom = true;
  }

  return StrictDom;
}
// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets.dump();
  });

  // Track the correctness of vccz through this basic block. There are two
  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
  // ST->partialVCCWritesUpdateVCCZ().
  bool VCCZCorrect = true;
  if (ST->hasReadVCCZBug()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc and then issued an smem load.
    VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc_lo or vcc_hi.
    VCCZCorrect = false;
  }

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;

    // Track pre-existing waitcnts that were added in earlier iterations or by
    // the memory legalizer.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
        (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         Inst.getOperand(0).isReg() &&
         Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
    OldWaitcntInstr = nullptr;

    // Restore vccz if it's not known to be correct already.
    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);

    // Don't examine operands unless we need to track vccz correctness.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
      if (Inst.definesRegister(AMDGPU::VCC_LO) ||
          Inst.definesRegister(AMDGPU::VCC_HI)) {
        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
      } else if (Inst.definesRegister(AMDGPU::VCC)) {
        // There is a hardware bug on CI/SI where SMRD instruction may corrupt
        // vccz bit, so when we detect that an instruction may read from a
        // corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
        //    operations to complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.getScoreLB(LGKM_CNT) <
                ScoreBrackets.getScoreUB(LGKM_CNT) &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
          // Writes to vcc while there's an outstanding smem read may get
          // clobbered as soon as any read completes.
          VCCZCorrect = false;
        } else {
          // Writes to vcc will fix any incorrect value in vccz.
          VCCZCorrect = true;
        }
      }
    }

    if (TII->isSMRD(Inst)) {
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
        }
      }
      if (ST->hasReadVCCZBug()) {
        // This smem read could complete and clobber vccz at any time.
        VCCZCorrect = false;
      }
    }

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates a S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0)
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
      ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (RestoreVCCZ) {
      // Restore the vccz bit. Any time a value is written to vcc, the vcc
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }

    ++Iter;
  }

  return Modified;
}
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  IV = AMDGPU::getIsaVersion(ST->getCPU());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
  HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;

  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1;

  TrackedWaitcntSet.clear();

  bool Modified = false;

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    MachineBasicBlock::iterator I = EntryBB.begin();
    for (MachineBasicBlock::iterator E = EntryBB.end();
         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
      ;
    BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
    if (ST->hasVscnt())
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
          .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
          .addImm(0);

    Modified = true;
  }

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fix point is reached.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.insert({MBB, BlockInfo(MBB)});

  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(ST);
        else
          *Brackets = WaitcntBrackets(ST);
      }

      Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPending()) {
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : BI.MBB->successors()) {
          auto SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else if (SuccBI.Incoming->merge(*Brackets)) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);

  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores but an explicitly requested flush is probably very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
           ++I) {
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  return Modified;
}