//===- SIInstrInfo.cpp - SI Instruction Information -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPUInstrInfo.h"
#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-instr-info"

#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

namespace llvm::AMDGPU {
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
} // namespace llvm::AMDGPU
// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests
// for long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));
static cl::opt<bool> Fix16BitCopies(
    "amdgpu-fix-16-bit-physreg-copies",
    cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
    : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
/// Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
                                      unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
static bool canRemat(const MachineInstr &MI) {

  if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
      SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
      SIInstrInfo::isSALU(MI))

  if (SIInstrInfo::isSMRD(MI)) {
    return !MI.memoperands_empty() &&
           llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
             return MMO->isLoad() && MMO->isInvariant();
bool SIInstrInfo::isReallyTriviallyReMaterializable(
    const MachineInstr &MI) const {

    // Normally a VALU use of exec would block rematerialization, but an
    // implicit exec read is OK in this case since all VALU instructions
    // have one. We really want all of the generic logic for this except
    // for this special case.

    // Another potential implicit use is the mode register. The core logic
    // of the RA will not attempt rematerialization if mode is set anywhere
    // in the function; otherwise it is safe since mode is not changed.

    // This differs from the generic method, which does not allow
    // rematerialization if there are virtual register uses. We allow this,
    // therefore this method includes SOP instructions as well.
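    // In practice this lets cheap SALU results be recomputed at their use
    // points (when their register operands are still available there)
    // instead of being spilled, which is usually the better trade-off.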
    if (!MI.hasImplicitDef() &&
        MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
        !MI.mayRaiseFPException())

  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
// Returns true if the scalar result of a VALU instruction depends on exec.
static bool resultDependsOnExec(const MachineInstr &MI) {
  // Ignore comparisons which are only used masked with exec.
  // This allows some hoisting/sinking of VALU comparisons.
  if (MI.isCompare()) {
    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    Register DstReg = MI.getOperand(0).getReg();
    if (!DstReg.isVirtual())

    for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
      switch (Use.getOpcode()) {
      case AMDGPU::S_AND_SAVEEXEC_B32:
      case AMDGPU::S_AND_SAVEEXEC_B64:

      case AMDGPU::S_AND_B32:
      case AMDGPU::S_AND_B64:
        if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))

  switch (MI.getOpcode()) {

  case AMDGPU::V_READFIRSTLANE_B32:
bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
  // Any implicit use of exec by VALU is not a real register read.
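  // For example, the implicit exec read on an ordinary VALU add can be
  // ignored when deciding whether the instruction reads live state, whereas
  // something like V_READFIRSTLANE_B32 cannot, because its scalar result
  // genuinely depends on which lanes are active.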
  return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
         isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
                               MachineBasicBlock *SuccToSinkTo,
                               MachineCycleInfo *CI) const {
  // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
  if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)

  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  // Check if sinking MI would create a temporally divergent use.
  for (auto Op : MI.uses()) {
    if (Op.isReg() && Op.getReg().isVirtual() &&
        RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
      MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());

      // SgprDef defined inside cycle
      MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
      if (FromCycle == nullptr)

      MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
      // Check if there is a FromCycle that contains SgprDef's basic block but
      // does not contain SuccToSinkTo and also has a divergent exit condition.
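      // The hazard being avoided: an SGPR value produced inside a divergent
      // loop is only meaningful for the lanes active on that iteration, so
      // sinking a use of it past such a loop's exit could read a value that
      // no longer matches what each lane saw when it left the loop.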
      while (FromCycle && !FromCycle->contains(ToCycle)) {
        SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
        FromCycle->getExitingBlocks(ExitingBlocks);

        // FromCycle has divergent exit condition.
        for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
          if (hasDivergentBranch(ExitingBlock))

        FromCycle = FromCycle->getParentCycle();
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())

  // A mayLoad instruction without a def is not a load. Likely a prefetch.
  if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))

    if (Load0->getOperand(0) != Load1->getOperand(0))

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluded
    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    if (Offset0Idx == -1 || Offset1Idx == -1)

    // XXX - be careful of dataless loads
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    Offset0Idx -= get(Opc0).NumDefs;
    Offset1Idx -= get(Opc1).NumDefs;
    Offset0 = Load0->getConstantOperandVal(Offset0Idx);
    Offset1 = Load1->getConstantOperandVal(Offset1Idx);
  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
        !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))

    unsigned NumOps = getNumOperandsNoGlue(Load0);
    if (NumOps != getNumOperandsNoGlue(Load1))

    if (Load0->getOperand(0) != Load1->getOperand(0))

    // Match register offsets, if both register and immediate offsets present.
    assert(NumOps == 4 || NumOps == 5);
    if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));

    if (!Load0Offset || !Load1Offset)

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    OffIdx0 -= get(Opc0).NumDefs;
    OffIdx1 -= get(Opc1).NumDefs;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))

    Offset0 = Off0->getAsZExtVal();
    Offset1 = Off1->getAsZExtVal();
static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
    const TargetRegisterInfo *TRI) const {
  if (!LdSt.mayLoadOrStore())

  unsigned Opc = LdSt.getOpcode();
  OffsetIsScalable = false;
  const MachineOperand *BaseOp, *OffsetOp;

    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
      // Normal, single offset LDS instruction.
        // DS_CONSUME/DS_APPEND use M0 for the base address.
        // TODO: find the implicit use operand for M0 and use that as BaseOp?
      BaseOps.push_back(BaseOp);
      Offset = OffsetOp->getImm();
      // Get appropriate operand, and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
      Width = getOpSize(LdSt, DataOpIdx);
      // The 2 offset instructions use offset0 and offset1 instead. We can treat
      // these as a load with a single offset if the 2 offsets are consecutive.
      // We will use this for some partially aligned loads.
      const MachineOperand *Offset0Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset0);
      const MachineOperand *Offset1Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset1);

      unsigned Offset0 = Offset0Op->getImm() & 0xff;
      unsigned Offset1 = Offset1Op->getImm() & 0xff;
      if (Offset0 + 1 != Offset1)

      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;

        assert(LdSt.mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;

      BaseOps.push_back(BaseOp);
      Offset = EltSize * Offset0;
      // Get appropriate operand(s), and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1) {
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        Width = getOpSize(LdSt, DataOpIdx);
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
        Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);

        Width = getOpSize(LdSt, DataOpIdx);
  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
    if (!RSrc) // e.g. BUFFER_WBINVL1_VOL

    BaseOps.push_back(RSrc);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp && !BaseOp->isFI())
      BaseOps.push_back(BaseOp);
    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetImm->getImm();
    const MachineOperand *SOffset =
        getNamedOperand(LdSt, AMDGPU::OpName::soffset);

      if (SOffset->isReg())
        BaseOps.push_back(SOffset);

        Offset += SOffset->getImm();

    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);

    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1) // LDS DMA

    Width = getOpSize(LdSt, DataOpIdx);
        isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
    BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      // GFX10 possible NSA encoding.
      for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
        BaseOps.push_back(&LdSt.getOperand(I));

      BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));

    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);

      return false; // no return sampler
    Width = getOpSize(LdSt, DataOpIdx);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    if (!BaseOp) // e.g. S_MEMTIME

    BaseOps.push_back(BaseOp);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetOp ? OffsetOp->getImm() : 0;
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);

    Width = getOpSize(LdSt, DataOpIdx);
    // Instructions have either vaddr or saddr or both or none.
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);

      BaseOps.push_back(BaseOp);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);

      BaseOps.push_back(BaseOp);
    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);

    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1) // LDS DMA

    Width = getOpSize(LdSt, DataOpIdx);
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  ArrayRef<const MachineOperand *> BaseOps1,
                                  const MachineInstr &MI2,
                                  ArrayRef<const MachineOperand *> BaseOps2) {
  // Only examine the first "base" operand of each instruction, on the
  // assumption that it represents the real base address of the memory access.
  // Other operands are typically offsets or indices from this base address.
  if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))

  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())

  auto *MO1 = *MI1.memoperands_begin();
  auto *MO2 = *MI2.memoperands_begin();
  if (MO1->getAddrSpace() != MO2->getAddrSpace())

  const auto *Base1 = MO1->getValue();
  const auto *Base2 = MO2->getValue();
  if (!Base1 || !Base2)

  Base1 = getUnderlyingObject(Base1);
  Base2 = getUnderlyingObject(Base2);

  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))

  return Base1 == Base2;
bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      int64_t Offset1, bool OffsetIsScalable1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      int64_t Offset2, bool OffsetIsScalable2,
                                      unsigned ClusterSize,
                                      unsigned NumBytes) const {
  // If the mem ops (to be clustered) do not have the same base ptr, then they
  // should not be clustered.
  if (!BaseOps1.empty() && !BaseOps2.empty()) {
    const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
    const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
    if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))

  } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
    // If only one base op is empty, they do not have the same base ptr.

  // In order to avoid register pressure, on average, the number of DWORDs
  // loaded together by all clustered mem ops should not exceed 8. This is an
  // empirical value based on certain observations and performance-related
  // experiments.
  // The good thing about this heuristic is that it avoids clustering too many
  // sub-word loads as well as clustering wide loads. Below is a brief summary
  // of how the heuristic behaves for various `LoadSize` values:
  // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
  // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
  // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
  // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
  // (5) LoadSize >= 17: do not cluster
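  // Worked example: with ClusterSize = 4 and NumBytes = 24, LoadSize is 6, so
  // each load rounds up to 2 DWORDs and NumDWORDs is 2 * 4 = 8, which is still
  // allowed; a fifth such load would push NumDWORDs to 10 and stop clustering.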
  const unsigned LoadSize = NumBytes / ClusterSize;
  const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
  return NumDWORDs <= 8;
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into two batches of 16
// stores.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to deal with saying different
// address space loads should be clustered, and ones which might cause bank
// conflicts.
//
// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
  assert(Offset1 > Offset0 &&
         "Second offset should be larger than first offset!");
  // If we have fewer than 16 loads in a row, and the offsets are within 64
  // bytes, then schedule together.

  // A cacheline is 64 bytes (for global memory).
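  // For example, four loads at offsets 0, 16, 32 and 48 from the same base fit
  // within one 64-byte cacheline (48 - 0 < 64 and NumLoads = 4 <= 16), so they
  // are scheduled near each other.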
  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc,
                              const char *Msg = "illegal VGPR to SGPR copy") {
  MachineFunction *MF = MBB.getParent();
  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(IllegalCopy);

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
/// possible to have a direct copy in these cases on GFX908, so an intermediate
/// VGPR copy is required.
static void indirectCopyToAGPR(const SIInstrInfo &TII,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, MCRegister DestReg,
                               MCRegister SrcReg, bool KillSrc,
                               RegScavenger &RS, bool RegsOverlap,
                               Register ImpDefSuperReg = Register(),
                               Register ImpUseSuperReg = Register()) {
  assert((TII.getSubtarget().hasMAIInsts() &&
          !TII.getSubtarget().hasGFX90AInsts()) &&
         "Expected GFX908 subtarget.");

  assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
          AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
         "Source register of the copy should be either an SGPR or an AGPR.");

  assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
         "Destination register of the copy should be an AGPR.");

  const SIRegisterInfo &RI = TII.getRegisterInfo();

  // First try to find a defining accvgpr_write to avoid temporary registers.
  // In the case of copies of overlapping AGPRs, we conservatively do not
  // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
  // an accvgpr_write used for this same copy due to implicit-defs.
  for (auto Def = MI, E = MBB.begin(); Def != E; ) {

    if (!Def->modifiesRegister(SrcReg, &RI))

    if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
        Def->getOperand(0).getReg() != SrcReg)

    MachineOperand &DefOp = Def->getOperand(1);
    assert(DefOp.isReg() || DefOp.isImm());

      bool SafeToPropagate = true;
      // Check that the register source operand is not clobbered before MI.
      // Immediate operands are always safe to propagate.
      for (auto I = Def; I != MI && SafeToPropagate; ++I)
        if (I->modifiesRegister(DefOp.getReg(), &RI))
          SafeToPropagate = false;

      if (!SafeToPropagate)

      DefOp.setIsKill(false);

    MachineInstrBuilder Builder =
        BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)

      Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);

    if (ImpUseSuperReg) {
      Builder.addReg(ImpUseSuperReg,
                     getKillRegState(KillSrc) | RegState::Implicit);
  RS.enterBasicBlockEnd(MBB);
  RS.backward(std::next(MI));

  // Ideally we want to have three registers for a long reg_sequence copy
  // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
  unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,

  // Registers in the sequence are allocated contiguously so we can just
  // use register number to pick one of three round-robin temps.
  unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
  Register Tmp =
      MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
  assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
         "VGPR used for an intermediate copy should have been reserved.");

  // Only loop through if there are any free registers left. We don't want to

    Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
                                                 /* RestoreAfter */ false, 0,
                                                 /* AllowSpill */ false);
    if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)

  // Insert copy to temporary VGPR.
  unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
  if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
    TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;

    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));

  MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
                                       .addReg(SrcReg, getKillRegState(KillSrc));
  if (ImpUseSuperReg) {
    UseBuilder.addReg(ImpUseSuperReg,
                      getKillRegState(KillSrc) | RegState::Implicit);

  MachineInstrBuilder DefBuilder
    = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
      .addReg(Tmp, RegState::Kill);

    DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MI, const DebugLoc &DL,
                           MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
                           const TargetRegisterClass *RC, bool Forward) {
  const SIRegisterInfo &RI = TII.getRegisterInfo();
  ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
  MachineBasicBlock::iterator I = MI;
  MachineInstr *FirstMI = nullptr, *LastMI = nullptr;

  for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
    int16_t SubIdx = BaseIndices[Idx];
    Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
    Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
    assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
    unsigned Opcode = AMDGPU::S_MOV_B32;

    // Is the SGPR aligned? If so, try to combine with the next one.
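    // For example, copying s[4:7] to s[8:11] starts on an even SGPR at both
    // ends, so the expansion can use two S_MOV_B64 instead of four S_MOV_B32.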
    bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
    bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
    if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
      // Can use SGPR64 copy
      unsigned Channel = RI.getChannelFromSubReg(SubIdx);
      SubIdx = RI.getSubRegFromChannel(Channel, 2);
      DestSubReg = RI.getSubReg(DestReg, SubIdx);
      SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
      assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
      Opcode = AMDGPU::S_MOV_B64;

    LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
                 .addReg(SrcReg, RegState::Implicit);

  assert(FirstMI && LastMI);

    std::swap(FirstMI, LastMI);

      MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));

    LastMI->addRegisterKilled(SrcReg, &RI);
void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc,
                              bool RenamableDest, bool RenamableSrc) const {
  const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
  unsigned Size = RI.getRegSizeInBits(*RC);
  const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
  unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);

  // The rest of copyPhysReg assumes Src and Dst are the same size.
  // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
  // we remove Fix16BitCopies and this code block?
  if (Fix16BitCopies) {
    if (((Size == 16) != (SrcSize == 16))) {
      // Non-VGPR Src and Dst will later be expanded back to 32 bits.
      assert(ST.hasTrue16BitInsts());
      MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
      MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);

      if (DestReg == SrcReg) {
        // Identity copy. Insert empty bundle since ExpandPostRA expects an
        BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));

      RC = RI.getPhysRegBaseClass(DestReg);
      Size = RI.getRegSizeInBits(*RC);
      SrcRC = RI.getPhysRegBaseClass(SrcReg);
      SrcSize = RI.getRegSizeInBits(*SrcRC);

  if (RC == &AMDGPU::VGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
                   AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
    BuildMI(MBB, MI, DL, get(Opc), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));

  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
      RC == &AMDGPU::SReg_32RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)

    if (DestReg == AMDGPU::VCC_LO) {
      if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
            .addReg(SrcReg, getKillRegState(KillSrc));

        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
            .addReg(SrcReg, getKillRegState(KillSrc));

    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
873 if (RC
== &AMDGPU::SReg_64RegClass
) {
874 if (SrcReg
== AMDGPU::SCC
) {
875 BuildMI(MBB
, MI
, DL
, get(AMDGPU::S_CSELECT_B64
), DestReg
)
881 if (DestReg
== AMDGPU::VCC
) {
882 if (AMDGPU::SReg_64RegClass
.contains(SrcReg
)) {
883 BuildMI(MBB
, MI
, DL
, get(AMDGPU::S_MOV_B64
), AMDGPU::VCC
)
884 .addReg(SrcReg
, getKillRegState(KillSrc
));
886 // FIXME: Hack until VReg_1 removed.
887 assert(AMDGPU::VGPR_32RegClass
.contains(SrcReg
));
888 BuildMI(MBB
, MI
, DL
, get(AMDGPU::V_CMP_NE_U32_e32
))
890 .addReg(SrcReg
, getKillRegState(KillSrc
));
896 if (!AMDGPU::SReg_64RegClass
.contains(SrcReg
)) {
897 reportIllegalCopy(this, MBB
, MI
, DL
, DestReg
, SrcReg
, KillSrc
);
901 BuildMI(MBB
, MI
, DL
, get(AMDGPU::S_MOV_B64
), DestReg
)
902 .addReg(SrcReg
, getKillRegState(KillSrc
));
906 if (DestReg
== AMDGPU::SCC
) {
907 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
908 // but SelectionDAG emits such copies for i1 sources.
909 if (AMDGPU::SReg_64RegClass
.contains(SrcReg
)) {
910 // This copy can only be produced by patterns
911 // with explicit SCC, which are known to be enabled
912 // only for subtargets with S_CMP_LG_U64 present.
913 assert(ST
.hasScalarCompareEq64());
914 BuildMI(MBB
, MI
, DL
, get(AMDGPU::S_CMP_LG_U64
))
915 .addReg(SrcReg
, getKillRegState(KillSrc
))
918 assert(AMDGPU::SReg_32RegClass
.contains(SrcReg
));
919 BuildMI(MBB
, MI
, DL
, get(AMDGPU::S_CMP_LG_U32
))
920 .addReg(SrcReg
, getKillRegState(KillSrc
))
927 if (RC
== &AMDGPU::AGPR_32RegClass
) {
928 if (AMDGPU::VGPR_32RegClass
.contains(SrcReg
) ||
929 (ST
.hasGFX90AInsts() && AMDGPU::SReg_32RegClass
.contains(SrcReg
))) {
930 BuildMI(MBB
, MI
, DL
, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64
), DestReg
)
931 .addReg(SrcReg
, getKillRegState(KillSrc
));
935 if (AMDGPU::AGPR_32RegClass
.contains(SrcReg
) && ST
.hasGFX90AInsts()) {
936 BuildMI(MBB
, MI
, DL
, get(AMDGPU::V_ACCVGPR_MOV_B32
), DestReg
)
937 .addReg(SrcReg
, getKillRegState(KillSrc
));
941 // FIXME: Pass should maintain scavenger to avoid scan through the block on
944 const bool Overlap
= RI
.regsOverlap(SrcReg
, DestReg
);
945 indirectCopyToAGPR(*this, MBB
, MI
, DL
, DestReg
, SrcReg
, KillSrc
, RS
, Overlap
);
950 assert(AMDGPU::VGPR_16RegClass
.contains(SrcReg
) ||
951 AMDGPU::SReg_LO16RegClass
.contains(SrcReg
) ||
952 AMDGPU::AGPR_LO16RegClass
.contains(SrcReg
));
954 bool IsSGPRDst
= AMDGPU::SReg_LO16RegClass
.contains(DestReg
);
955 bool IsSGPRSrc
= AMDGPU::SReg_LO16RegClass
.contains(SrcReg
);
956 bool IsAGPRDst
= AMDGPU::AGPR_LO16RegClass
.contains(DestReg
);
957 bool IsAGPRSrc
= AMDGPU::AGPR_LO16RegClass
.contains(SrcReg
);
958 bool DstLow
= !AMDGPU::isHi16Reg(DestReg
, RI
);
959 bool SrcLow
= !AMDGPU::isHi16Reg(SrcReg
, RI
);
960 MCRegister NewDestReg
= RI
.get32BitRegister(DestReg
);
961 MCRegister NewSrcReg
= RI
.get32BitRegister(SrcReg
);
965 reportIllegalCopy(this, MBB
, MI
, DL
, DestReg
, SrcReg
, KillSrc
);
969 BuildMI(MBB
, MI
, DL
, get(AMDGPU::S_MOV_B32
), NewDestReg
)
970 .addReg(NewSrcReg
, getKillRegState(KillSrc
));
974 if (IsAGPRDst
|| IsAGPRSrc
) {
975 if (!DstLow
|| !SrcLow
) {
976 reportIllegalCopy(this, MBB
, MI
, DL
, DestReg
, SrcReg
, KillSrc
,
977 "Cannot use hi16 subreg with an AGPR!");
980 copyPhysReg(MBB
, MI
, DL
, NewDestReg
, NewSrcReg
, KillSrc
);
984 if (ST
.hasTrue16BitInsts()) {
989 // Use the smaller instruction encoding if possible.
990 if (AMDGPU::VGPR_16_Lo128RegClass
.contains(DestReg
) &&
991 (IsSGPRSrc
|| AMDGPU::VGPR_16_Lo128RegClass
.contains(SrcReg
))) {
992 BuildMI(MBB
, MI
, DL
, get(AMDGPU::V_MOV_B16_t16_e32
), DestReg
)
995 BuildMI(MBB
, MI
, DL
, get(AMDGPU::V_MOV_B16_t16_e64
), DestReg
)
996 .addImm(0) // src0_modifiers
998 .addImm(0); // op_sel
1003 if (IsSGPRSrc
&& !ST
.hasSDWAScalar()) {
1004 if (!DstLow
|| !SrcLow
) {
1005 reportIllegalCopy(this, MBB
, MI
, DL
, DestReg
, SrcReg
, KillSrc
,
1006 "Cannot use hi16 subreg on VI!");
1009 BuildMI(MBB
, MI
, DL
, get(AMDGPU::V_MOV_B32_e32
), NewDestReg
)
1010 .addReg(NewSrcReg
, getKillRegState(KillSrc
));
1014 auto MIB
= BuildMI(MBB
, MI
, DL
, get(AMDGPU::V_MOV_B32_sdwa
), NewDestReg
)
1015 .addImm(0) // src0_modifiers
1018 .addImm(DstLow
? AMDGPU::SDWA::SdwaSel::WORD_0
1019 : AMDGPU::SDWA::SdwaSel::WORD_1
)
1020 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE
)
1021 .addImm(SrcLow
? AMDGPU::SDWA::SdwaSel::WORD_0
1022 : AMDGPU::SDWA::SdwaSel::WORD_1
)
1023 .addReg(NewDestReg
, RegState::Implicit
| RegState::Undef
);
1024 // First implicit operand is $exec.
1025 MIB
->tieOperands(0, MIB
->getNumOperands() - 1);
1029 if (RC
== RI
.getVGPR64Class() && (SrcRC
== RC
|| RI
.isSGPRClass(SrcRC
))) {
1030 if (ST
.hasMovB64()) {
1031 BuildMI(MBB
, MI
, DL
, get(AMDGPU::V_MOV_B64_e32
), DestReg
)
1032 .addReg(SrcReg
, getKillRegState(KillSrc
));
1035 if (ST
.hasPkMovB32()) {
1036 BuildMI(MBB
, MI
, DL
, get(AMDGPU::V_PK_MOV_B32
), DestReg
)
1037 .addImm(SISrcMods::OP_SEL_1
)
1039 .addImm(SISrcMods::OP_SEL_0
| SISrcMods::OP_SEL_1
)
1041 .addImm(0) // op_sel_lo
1042 .addImm(0) // op_sel_hi
1043 .addImm(0) // neg_lo
1044 .addImm(0) // neg_hi
1046 .addReg(SrcReg
, getKillRegState(KillSrc
) | RegState::Implicit
);
1051 const bool Forward
= RI
.getHWRegIndex(DestReg
) <= RI
.getHWRegIndex(SrcReg
);
1052 if (RI
.isSGPRClass(RC
)) {
1053 if (!RI
.isSGPRClass(SrcRC
)) {
1054 reportIllegalCopy(this, MBB
, MI
, DL
, DestReg
, SrcReg
, KillSrc
);
1057 const bool CanKillSuperReg
= KillSrc
&& !RI
.regsOverlap(SrcReg
, DestReg
);
1058 expandSGPRCopy(*this, MBB
, MI
, DL
, DestReg
, SrcReg
, CanKillSuperReg
, RC
,
1063 unsigned EltSize
= 4;
1064 unsigned Opcode
= AMDGPU::V_MOV_B32_e32
;
1065 if (RI
.isAGPRClass(RC
)) {
1066 if (ST
.hasGFX90AInsts() && RI
.isAGPRClass(SrcRC
))
1067 Opcode
= AMDGPU::V_ACCVGPR_MOV_B32
;
1068 else if (RI
.hasVGPRs(SrcRC
) ||
1069 (ST
.hasGFX90AInsts() && RI
.isSGPRClass(SrcRC
)))
1070 Opcode
= AMDGPU::V_ACCVGPR_WRITE_B32_e64
;
1072 Opcode
= AMDGPU::INSTRUCTION_LIST_END
;
1073 } else if (RI
.hasVGPRs(RC
) && RI
.isAGPRClass(SrcRC
)) {
1074 Opcode
= AMDGPU::V_ACCVGPR_READ_B32_e64
;
1075 } else if ((Size
% 64 == 0) && RI
.hasVGPRs(RC
) &&
1076 (RI
.isProperlyAlignedRC(*RC
) &&
1077 (SrcRC
== RC
|| RI
.isSGPRClass(SrcRC
)))) {
1078 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1079 if (ST
.hasMovB64()) {
1080 Opcode
= AMDGPU::V_MOV_B64_e32
;
1082 } else if (ST
.hasPkMovB32()) {
1083 Opcode
= AMDGPU::V_PK_MOV_B32
;
1088 // For the cases where we need an intermediate instruction/temporary register
1089 // (destination is an AGPR), we need a scavenger.
1091 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1092 // whole block for every handled copy.
1093 std::unique_ptr
<RegScavenger
> RS
;
1094 if (Opcode
== AMDGPU::INSTRUCTION_LIST_END
)
1095 RS
= std::make_unique
<RegScavenger
>();
1097 ArrayRef
<int16_t> SubIndices
= RI
.getRegSplitParts(RC
, EltSize
);
1099 // If there is an overlap, we can't kill the super-register on the last
1100 // instruction, since it will also kill the components made live by this def.
1101 const bool Overlap
= RI
.regsOverlap(SrcReg
, DestReg
);
1102 const bool CanKillSuperReg
= KillSrc
&& !Overlap
;
1104 for (unsigned Idx
= 0; Idx
< SubIndices
.size(); ++Idx
) {
1107 SubIdx
= SubIndices
[Idx
];
1109 SubIdx
= SubIndices
[SubIndices
.size() - Idx
- 1];
1110 Register DestSubReg
= RI
.getSubReg(DestReg
, SubIdx
);
1111 Register SrcSubReg
= RI
.getSubReg(SrcReg
, SubIdx
);
1112 assert(DestSubReg
&& SrcSubReg
&& "Failed to find subregs!");
1114 bool IsFirstSubreg
= Idx
== 0;
1115 bool UseKill
= CanKillSuperReg
&& Idx
== SubIndices
.size() - 1;
1117 if (Opcode
== AMDGPU::INSTRUCTION_LIST_END
) {
1118 Register ImpDefSuper
= IsFirstSubreg
? Register(DestReg
) : Register();
1119 Register ImpUseSuper
= SrcReg
;
1120 indirectCopyToAGPR(*this, MBB
, MI
, DL
, DestSubReg
, SrcSubReg
, UseKill
,
1121 *RS
, Overlap
, ImpDefSuper
, ImpUseSuper
);
1122 } else if (Opcode
== AMDGPU::V_PK_MOV_B32
) {
1123 MachineInstrBuilder MIB
=
1124 BuildMI(MBB
, MI
, DL
, get(AMDGPU::V_PK_MOV_B32
), DestSubReg
)
1125 .addImm(SISrcMods::OP_SEL_1
)
1127 .addImm(SISrcMods::OP_SEL_0
| SISrcMods::OP_SEL_1
)
1129 .addImm(0) // op_sel_lo
1130 .addImm(0) // op_sel_hi
1131 .addImm(0) // neg_lo
1132 .addImm(0) // neg_hi
1134 .addReg(SrcReg
, getKillRegState(UseKill
) | RegState::Implicit
);
1136 MIB
.addReg(DestReg
, RegState::Define
| RegState::Implicit
);
1138 MachineInstrBuilder Builder
=
1139 BuildMI(MBB
, MI
, DL
, get(Opcode
), DestSubReg
).addReg(SrcSubReg
);
1141 Builder
.addReg(DestReg
, RegState::Define
| RegState::Implicit
);
1143 Builder
.addReg(SrcReg
, getKillRegState(UseKill
) | RegState::Implicit
);
1148 int SIInstrInfo::commuteOpcode(unsigned Opcode
) const {
1151 // Try to map original to commuted opcode
1152 NewOpc
= AMDGPU::getCommuteRev(Opcode
);
1154 // Check if the commuted (REV) opcode exists on the target.
1155 return pseudoToMCOpcode(NewOpc
) != -1 ? NewOpc
: -1;
1157 // Try to map commuted to original opcode
1158 NewOpc
= AMDGPU::getCommuteOrig(Opcode
);
1160 // Check if the original (non-REV) opcode exists on the target.
1161 return pseudoToMCOpcode(NewOpc
) != -1 ? NewOpc
: -1;
1166 void SIInstrInfo::materializeImmediate(MachineBasicBlock
&MBB
,
1167 MachineBasicBlock::iterator MI
,
1168 const DebugLoc
&DL
, Register DestReg
,
1169 int64_t Value
) const {
1170 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
1171 const TargetRegisterClass
*RegClass
= MRI
.getRegClass(DestReg
);
1172 if (RegClass
== &AMDGPU::SReg_32RegClass
||
1173 RegClass
== &AMDGPU::SGPR_32RegClass
||
1174 RegClass
== &AMDGPU::SReg_32_XM0RegClass
||
1175 RegClass
== &AMDGPU::SReg_32_XM0_XEXECRegClass
) {
1176 BuildMI(MBB
, MI
, DL
, get(AMDGPU::S_MOV_B32
), DestReg
)
1181 if (RegClass
== &AMDGPU::SReg_64RegClass
||
1182 RegClass
== &AMDGPU::SGPR_64RegClass
||
1183 RegClass
== &AMDGPU::SReg_64_XEXECRegClass
) {
1184 BuildMI(MBB
, MI
, DL
, get(AMDGPU::S_MOV_B64
), DestReg
)
1189 if (RegClass
== &AMDGPU::VGPR_32RegClass
) {
1190 BuildMI(MBB
, MI
, DL
, get(AMDGPU::V_MOV_B32_e32
), DestReg
)
1194 if (RegClass
->hasSuperClassEq(&AMDGPU::VReg_64RegClass
)) {
1195 BuildMI(MBB
, MI
, DL
, get(AMDGPU::V_MOV_B64_PSEUDO
), DestReg
)
1200 unsigned EltSize
= 4;
1201 unsigned Opcode
= AMDGPU::V_MOV_B32_e32
;
1202 if (RI
.isSGPRClass(RegClass
)) {
1203 if (RI
.getRegSizeInBits(*RegClass
) > 32) {
1204 Opcode
= AMDGPU::S_MOV_B64
;
1207 Opcode
= AMDGPU::S_MOV_B32
;
1212 ArrayRef
<int16_t> SubIndices
= RI
.getRegSplitParts(RegClass
, EltSize
);
1213 for (unsigned Idx
= 0; Idx
< SubIndices
.size(); ++Idx
) {
1214 int64_t IdxValue
= Idx
== 0 ? Value
: 0;
1216 MachineInstrBuilder Builder
= BuildMI(MBB
, MI
, DL
,
1217 get(Opcode
), RI
.getSubReg(DestReg
, SubIndices
[Idx
]));
1218 Builder
.addImm(IdxValue
);
1222 const TargetRegisterClass
*
1223 SIInstrInfo::getPreferredSelectRegClass(unsigned Size
) const {
1224 return &AMDGPU::VGPR_32RegClass
;
1227 void SIInstrInfo::insertVectorSelect(MachineBasicBlock
&MBB
,
1228 MachineBasicBlock::iterator I
,
1229 const DebugLoc
&DL
, Register DstReg
,
1230 ArrayRef
<MachineOperand
> Cond
,
1232 Register FalseReg
) const {
1233 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
1234 const TargetRegisterClass
*BoolXExecRC
= RI
.getWaveMaskRegClass();
1235 assert(MRI
.getRegClass(DstReg
) == &AMDGPU::VGPR_32RegClass
&&
1236 "Not a VGPR32 reg");
1238 if (Cond
.size() == 1) {
1239 Register SReg
= MRI
.createVirtualRegister(BoolXExecRC
);
1240 BuildMI(MBB
, I
, DL
, get(AMDGPU::COPY
), SReg
)
1242 BuildMI(MBB
, I
, DL
, get(AMDGPU::V_CNDMASK_B32_e64
), DstReg
)
1248 } else if (Cond
.size() == 2) {
1249 assert(Cond
[0].isImm() && "Cond[0] is not an immediate");
1250 switch (Cond
[0].getImm()) {
1251 case SIInstrInfo::SCC_TRUE
: {
1252 Register SReg
= MRI
.createVirtualRegister(BoolXExecRC
);
1253 BuildMI(MBB
, I
, DL
, get(ST
.isWave32() ? AMDGPU::S_CSELECT_B32
1254 : AMDGPU::S_CSELECT_B64
), SReg
)
1257 BuildMI(MBB
, I
, DL
, get(AMDGPU::V_CNDMASK_B32_e64
), DstReg
)
1265 case SIInstrInfo::SCC_FALSE
: {
1266 Register SReg
= MRI
.createVirtualRegister(BoolXExecRC
);
1267 BuildMI(MBB
, I
, DL
, get(ST
.isWave32() ? AMDGPU::S_CSELECT_B32
1268 : AMDGPU::S_CSELECT_B64
), SReg
)
1271 BuildMI(MBB
, I
, DL
, get(AMDGPU::V_CNDMASK_B32_e64
), DstReg
)
1279 case SIInstrInfo::VCCNZ
: {
1280 MachineOperand RegOp
= Cond
[1];
1281 RegOp
.setImplicit(false);
1282 Register SReg
= MRI
.createVirtualRegister(BoolXExecRC
);
1283 BuildMI(MBB
, I
, DL
, get(AMDGPU::COPY
), SReg
)
1285 BuildMI(MBB
, I
, DL
, get(AMDGPU::V_CNDMASK_B32_e64
), DstReg
)
1293 case SIInstrInfo::VCCZ
: {
1294 MachineOperand RegOp
= Cond
[1];
1295 RegOp
.setImplicit(false);
1296 Register SReg
= MRI
.createVirtualRegister(BoolXExecRC
);
1297 BuildMI(MBB
, I
, DL
, get(AMDGPU::COPY
), SReg
)
1299 BuildMI(MBB
, I
, DL
, get(AMDGPU::V_CNDMASK_B32_e64
), DstReg
)
1307 case SIInstrInfo::EXECNZ
: {
1308 Register SReg
= MRI
.createVirtualRegister(BoolXExecRC
);
1309 Register SReg2
= MRI
.createVirtualRegister(RI
.getBoolRC());
1310 BuildMI(MBB
, I
, DL
, get(ST
.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1311 : AMDGPU::S_OR_SAVEEXEC_B64
), SReg2
)
1313 BuildMI(MBB
, I
, DL
, get(ST
.isWave32() ? AMDGPU::S_CSELECT_B32
1314 : AMDGPU::S_CSELECT_B64
), SReg
)
1317 BuildMI(MBB
, I
, DL
, get(AMDGPU::V_CNDMASK_B32_e64
), DstReg
)
1325 case SIInstrInfo::EXECZ
: {
1326 Register SReg
= MRI
.createVirtualRegister(BoolXExecRC
);
1327 Register SReg2
= MRI
.createVirtualRegister(RI
.getBoolRC());
1328 BuildMI(MBB
, I
, DL
, get(ST
.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1329 : AMDGPU::S_OR_SAVEEXEC_B64
), SReg2
)
1331 BuildMI(MBB
, I
, DL
, get(ST
.isWave32() ? AMDGPU::S_CSELECT_B32
1332 : AMDGPU::S_CSELECT_B64
), SReg
)
1335 BuildMI(MBB
, I
, DL
, get(AMDGPU::V_CNDMASK_B32_e64
), DstReg
)
1341 llvm_unreachable("Unhandled branch predicate EXECZ");
1345 llvm_unreachable("invalid branch predicate");
1348 llvm_unreachable("Can only handle Cond size 1 or 2");
1352 Register
SIInstrInfo::insertEQ(MachineBasicBlock
*MBB
,
1353 MachineBasicBlock::iterator I
,
1355 Register SrcReg
, int Value
) const {
1356 MachineRegisterInfo
&MRI
= MBB
->getParent()->getRegInfo();
1357 Register Reg
= MRI
.createVirtualRegister(RI
.getBoolRC());
1358 BuildMI(*MBB
, I
, DL
, get(AMDGPU::V_CMP_EQ_I32_e64
), Reg
)
1365 Register
SIInstrInfo::insertNE(MachineBasicBlock
*MBB
,
1366 MachineBasicBlock::iterator I
,
1368 Register SrcReg
, int Value
) const {
1369 MachineRegisterInfo
&MRI
= MBB
->getParent()->getRegInfo();
1370 Register Reg
= MRI
.createVirtualRegister(RI
.getBoolRC());
1371 BuildMI(*MBB
, I
, DL
, get(AMDGPU::V_CMP_NE_I32_e64
), Reg
)
1378 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass
*DstRC
) const {
1380 if (RI
.isAGPRClass(DstRC
))
1381 return AMDGPU::COPY
;
1382 if (RI
.getRegSizeInBits(*DstRC
) == 16) {
1383 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1385 return RI
.isSGPRClass(DstRC
) ? AMDGPU::COPY
: AMDGPU::V_MOV_B16_t16_e64
;
1387 if (RI
.getRegSizeInBits(*DstRC
) == 32)
1388 return RI
.isSGPRClass(DstRC
) ? AMDGPU::S_MOV_B32
: AMDGPU::V_MOV_B32_e32
;
1389 if (RI
.getRegSizeInBits(*DstRC
) == 64 && RI
.isSGPRClass(DstRC
))
1390 return AMDGPU::S_MOV_B64
;
1391 if (RI
.getRegSizeInBits(*DstRC
) == 64 && !RI
.isSGPRClass(DstRC
))
1392 return AMDGPU::V_MOV_B64_PSEUDO
;
1393 return AMDGPU::COPY
;
1397 SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize
,
1398 bool IsIndirectSrc
) const {
1399 if (IsIndirectSrc
) {
1400 if (VecSize
<= 32) // 4 bytes
1401 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1
);
1402 if (VecSize
<= 64) // 8 bytes
1403 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2
);
1404 if (VecSize
<= 96) // 12 bytes
1405 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3
);
1406 if (VecSize
<= 128) // 16 bytes
1407 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4
);
1408 if (VecSize
<= 160) // 20 bytes
1409 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5
);
1410 if (VecSize
<= 256) // 32 bytes
1411 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8
);
1412 if (VecSize
<= 288) // 36 bytes
1413 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9
);
1414 if (VecSize
<= 320) // 40 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10
);
1416 if (VecSize
<= 352) // 44 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11
);
1418 if (VecSize
<= 384) // 48 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12
);
1420 if (VecSize
<= 512) // 64 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16
);
1422 if (VecSize
<= 1024) // 128 bytes
1423 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32
);
1425 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1428 if (VecSize
<= 32) // 4 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1
);
1430 if (VecSize
<= 64) // 8 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2
);
1432 if (VecSize
<= 96) // 12 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3
);
1434 if (VecSize
<= 128) // 16 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4
);
1436 if (VecSize
<= 160) // 20 bytes
1437 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5
);
1438 if (VecSize
<= 256) // 32 bytes
1439 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8
);
1440 if (VecSize
<= 288) // 36 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9
);
1442 if (VecSize
<= 320) // 40 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10
);
1444 if (VecSize
<= 352) // 44 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11
);
1446 if (VecSize
<= 384) // 48 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12
);
1448 if (VecSize
<= 512) // 64 bytes
1449 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16
);
1450 if (VecSize
<= 1024) // 128 bytes
1451 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32
);
1453 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1456 static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize
) {
1457 if (VecSize
<= 32) // 4 bytes
1458 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1
;
1459 if (VecSize
<= 64) // 8 bytes
1460 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2
;
1461 if (VecSize
<= 96) // 12 bytes
1462 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3
;
1463 if (VecSize
<= 128) // 16 bytes
1464 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4
;
1465 if (VecSize
<= 160) // 20 bytes
1466 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5
;
1467 if (VecSize
<= 256) // 32 bytes
1468 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8
;
1469 if (VecSize
<= 288) // 36 bytes
1470 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9
;
1471 if (VecSize
<= 320) // 40 bytes
1472 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10
;
1473 if (VecSize
<= 352) // 44 bytes
1474 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11
;
1475 if (VecSize
<= 384) // 48 bytes
1476 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12
;
1477 if (VecSize
<= 512) // 64 bytes
1478 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16
;
1479 if (VecSize
<= 1024) // 128 bytes
1480 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32
;
1482 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1485 static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize
) {
1486 if (VecSize
<= 32) // 4 bytes
1487 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1
;
1488 if (VecSize
<= 64) // 8 bytes
1489 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2
;
1490 if (VecSize
<= 96) // 12 bytes
1491 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3
;
1492 if (VecSize
<= 128) // 16 bytes
1493 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4
;
1494 if (VecSize
<= 160) // 20 bytes
1495 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5
;
1496 if (VecSize
<= 256) // 32 bytes
1497 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8
;
1498 if (VecSize
<= 288) // 36 bytes
1499 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9
;
1500 if (VecSize
<= 320) // 40 bytes
1501 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10
;
1502 if (VecSize
<= 352) // 44 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11
;
1504 if (VecSize
<= 384) // 48 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12
;
1506 if (VecSize
<= 512) // 64 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16
;
1508 if (VecSize
<= 1024) // 128 bytes
1509 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32
;
1511 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1514 static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize
) {
1515 if (VecSize
<= 64) // 8 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1
;
1517 if (VecSize
<= 128) // 16 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2
;
1519 if (VecSize
<= 256) // 32 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4
;
1521 if (VecSize
<= 512) // 64 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8
;
1523 if (VecSize
<= 1024) // 128 bytes
1524 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16
;
1526 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1530 SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize
, unsigned EltSize
,
1531 bool IsSGPR
) const {
1535 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize
));
1537 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize
));
1539 llvm_unreachable("invalid reg indexing elt size");
1543 assert(EltSize
== 32 && "invalid reg indexing elt size");
1544 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize
));
1547 static unsigned getSGPRSpillSaveOpcode(unsigned Size
) {
1550 return AMDGPU::SI_SPILL_S32_SAVE
;
1552 return AMDGPU::SI_SPILL_S64_SAVE
;
1554 return AMDGPU::SI_SPILL_S96_SAVE
;
1556 return AMDGPU::SI_SPILL_S128_SAVE
;
1558 return AMDGPU::SI_SPILL_S160_SAVE
;
1560 return AMDGPU::SI_SPILL_S192_SAVE
;
1562 return AMDGPU::SI_SPILL_S224_SAVE
;
1564 return AMDGPU::SI_SPILL_S256_SAVE
;
1566 return AMDGPU::SI_SPILL_S288_SAVE
;
1568 return AMDGPU::SI_SPILL_S320_SAVE
;
1570 return AMDGPU::SI_SPILL_S352_SAVE
;
1572 return AMDGPU::SI_SPILL_S384_SAVE
;
1574 return AMDGPU::SI_SPILL_S512_SAVE
;
1576 return AMDGPU::SI_SPILL_S1024_SAVE
;
1578 llvm_unreachable("unknown register size");
1582 static unsigned getVGPRSpillSaveOpcode(unsigned Size
) {
1585 return AMDGPU::SI_SPILL_V32_SAVE
;
1587 return AMDGPU::SI_SPILL_V64_SAVE
;
1589 return AMDGPU::SI_SPILL_V96_SAVE
;
1591 return AMDGPU::SI_SPILL_V128_SAVE
;
1593 return AMDGPU::SI_SPILL_V160_SAVE
;
1595 return AMDGPU::SI_SPILL_V192_SAVE
;
1597 return AMDGPU::SI_SPILL_V224_SAVE
;
1599 return AMDGPU::SI_SPILL_V256_SAVE
;
1601 return AMDGPU::SI_SPILL_V288_SAVE
;
1603 return AMDGPU::SI_SPILL_V320_SAVE
;
1605 return AMDGPU::SI_SPILL_V352_SAVE
;
1607 return AMDGPU::SI_SPILL_V384_SAVE
;
1609 return AMDGPU::SI_SPILL_V512_SAVE
;
1611 return AMDGPU::SI_SPILL_V1024_SAVE
;
1613 llvm_unreachable("unknown register size");
1617 static unsigned getAGPRSpillSaveOpcode(unsigned Size
) {
1620 return AMDGPU::SI_SPILL_A32_SAVE
;
1622 return AMDGPU::SI_SPILL_A64_SAVE
;
1624 return AMDGPU::SI_SPILL_A96_SAVE
;
1626 return AMDGPU::SI_SPILL_A128_SAVE
;
1628 return AMDGPU::SI_SPILL_A160_SAVE
;
1630 return AMDGPU::SI_SPILL_A192_SAVE
;
1632 return AMDGPU::SI_SPILL_A224_SAVE
;
1634 return AMDGPU::SI_SPILL_A256_SAVE
;
1636 return AMDGPU::SI_SPILL_A288_SAVE
;
1638 return AMDGPU::SI_SPILL_A320_SAVE
;
1640 return AMDGPU::SI_SPILL_A352_SAVE
;
1642 return AMDGPU::SI_SPILL_A384_SAVE
;
1644 return AMDGPU::SI_SPILL_A512_SAVE
;
1646 return AMDGPU::SI_SPILL_A1024_SAVE
;
1648 llvm_unreachable("unknown register size");
1652 static unsigned getAVSpillSaveOpcode(unsigned Size
) {
1655 return AMDGPU::SI_SPILL_AV32_SAVE
;
1657 return AMDGPU::SI_SPILL_AV64_SAVE
;
1659 return AMDGPU::SI_SPILL_AV96_SAVE
;
1661 return AMDGPU::SI_SPILL_AV128_SAVE
;
1663 return AMDGPU::SI_SPILL_AV160_SAVE
;
1665 return AMDGPU::SI_SPILL_AV192_SAVE
;
1667 return AMDGPU::SI_SPILL_AV224_SAVE
;
1669 return AMDGPU::SI_SPILL_AV256_SAVE
;
1671 return AMDGPU::SI_SPILL_AV288_SAVE
;
1673 return AMDGPU::SI_SPILL_AV320_SAVE
;
1675 return AMDGPU::SI_SPILL_AV352_SAVE
;
1677 return AMDGPU::SI_SPILL_AV384_SAVE
;
1679 return AMDGPU::SI_SPILL_AV512_SAVE
;
1681 return AMDGPU::SI_SPILL_AV1024_SAVE
;
1683 llvm_unreachable("unknown register size");
1687 static unsigned getWWMRegSpillSaveOpcode(unsigned Size
,
1688 bool IsVectorSuperClass
) {
1689 // Currently, there is only 32-bit WWM register spills needed.
1691 llvm_unreachable("unknown wwm register spill size");
1693 if (IsVectorSuperClass
)
1694 return AMDGPU::SI_SPILL_WWM_AV32_SAVE
;
1696 return AMDGPU::SI_SPILL_WWM_V32_SAVE
;
1699 static unsigned getVectorRegSpillSaveOpcode(Register Reg
,
1700 const TargetRegisterClass
*RC
,
1702 const SIRegisterInfo
&TRI
,
1703 const SIMachineFunctionInfo
&MFI
) {
1704 bool IsVectorSuperClass
= TRI
.isVectorSuperClass(RC
);
1706 // Choose the right opcode if spilling a WWM register.
1707 if (MFI
.checkFlag(Reg
, AMDGPU::VirtRegFlag::WWM_REG
))
1708 return getWWMRegSpillSaveOpcode(Size
, IsVectorSuperClass
);
1710 if (IsVectorSuperClass
)
1711 return getAVSpillSaveOpcode(Size
);
1713 return TRI
.isAGPRClass(RC
) ? getAGPRSpillSaveOpcode(Size
)
1714 : getVGPRSpillSaveOpcode(Size
);
1717 void SIInstrInfo::storeRegToStackSlot(
1718 MachineBasicBlock
&MBB
, MachineBasicBlock::iterator MI
, Register SrcReg
,
1719 bool isKill
, int FrameIndex
, const TargetRegisterClass
*RC
,
1720 const TargetRegisterInfo
*TRI
, Register VReg
) const {
1721 MachineFunction
*MF
= MBB
.getParent();
1722 SIMachineFunctionInfo
*MFI
= MF
->getInfo
<SIMachineFunctionInfo
>();
1723 MachineFrameInfo
&FrameInfo
= MF
->getFrameInfo();
1724 const DebugLoc
&DL
= MBB
.findDebugLoc(MI
);
1726 MachinePointerInfo PtrInfo
1727 = MachinePointerInfo::getFixedStack(*MF
, FrameIndex
);
1728 MachineMemOperand
*MMO
= MF
->getMachineMemOperand(
1729 PtrInfo
, MachineMemOperand::MOStore
, FrameInfo
.getObjectSize(FrameIndex
),
1730 FrameInfo
.getObjectAlign(FrameIndex
));
1731 unsigned SpillSize
= TRI
->getSpillSize(*RC
);
1733 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
1734 if (RI
.isSGPRClass(RC
)) {
1735 MFI
->setHasSpilledSGPRs();
1736 assert(SrcReg
!= AMDGPU::M0
&& "m0 should not be spilled");
1737 assert(SrcReg
!= AMDGPU::EXEC_LO
&& SrcReg
!= AMDGPU::EXEC_HI
&&
1738 SrcReg
!= AMDGPU::EXEC
&& "exec should not be spilled");
1740 // We are only allowed to create one new instruction when spilling
1741 // registers, so we need to use pseudo instruction for spilling SGPRs.
1742 const MCInstrDesc
&OpDesc
= get(getSGPRSpillSaveOpcode(SpillSize
));
1744 // The SGPR spill/restore instructions only work on number sgprs, so we need
1745 // to make sure we are using the correct register class.
1746 if (SrcReg
.isVirtual() && SpillSize
== 4) {
1747 MRI
.constrainRegClass(SrcReg
, &AMDGPU::SReg_32_XM0_XEXECRegClass
);
1750 BuildMI(MBB
, MI
, DL
, OpDesc
)
1751 .addReg(SrcReg
, getKillRegState(isKill
)) // data
1752 .addFrameIndex(FrameIndex
) // addr
1754 .addReg(MFI
->getStackPtrOffsetReg(), RegState::Implicit
);
1756 if (RI
.spillSGPRToVGPR())
1757 FrameInfo
.setStackID(FrameIndex
, TargetStackID::SGPRSpill
);
1761 unsigned Opcode
= getVectorRegSpillSaveOpcode(VReg
? VReg
: SrcReg
, RC
,
1762 SpillSize
, RI
, *MFI
);
1763 MFI
->setHasSpilledVGPRs();
1765 BuildMI(MBB
, MI
, DL
, get(Opcode
))
1766 .addReg(SrcReg
, getKillRegState(isKill
)) // data
1767 .addFrameIndex(FrameIndex
) // addr
1768 .addReg(MFI
->getStackPtrOffsetReg()) // scratch_offset
1769 .addImm(0) // offset
1770 .addMemOperand(MMO
);
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_S64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_S96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_S128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_S160_RESTORE;
  case 24:
    return AMDGPU::SI_SPILL_S192_RESTORE;
  case 28:
    return AMDGPU::SI_SPILL_S224_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_S256_RESTORE;
  case 36:
    return AMDGPU::SI_SPILL_S288_RESTORE;
  case 40:
    return AMDGPU::SI_SPILL_S320_RESTORE;
  case 44:
    return AMDGPU::SI_SPILL_S352_RESTORE;
  case 48:
    return AMDGPU::SI_SPILL_S384_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_S512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_S1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}
static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_V64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_V96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_V128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_V160_RESTORE;
  case 24:
    return AMDGPU::SI_SPILL_V192_RESTORE;
  case 28:
    return AMDGPU::SI_SPILL_V224_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_V256_RESTORE;
  case 36:
    return AMDGPU::SI_SPILL_V288_RESTORE;
  case 40:
    return AMDGPU::SI_SPILL_V320_RESTORE;
  case 44:
    return AMDGPU::SI_SPILL_V352_RESTORE;
  case 48:
    return AMDGPU::SI_SPILL_V384_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_V512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_V1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}
static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_A32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_A64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_A96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_A128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_A160_RESTORE;
  case 24:
    return AMDGPU::SI_SPILL_A192_RESTORE;
  case 28:
    return AMDGPU::SI_SPILL_A224_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_A256_RESTORE;
  case 36:
    return AMDGPU::SI_SPILL_A288_RESTORE;
  case 40:
    return AMDGPU::SI_SPILL_A320_RESTORE;
  case 44:
    return AMDGPU::SI_SPILL_A352_RESTORE;
  case 48:
    return AMDGPU::SI_SPILL_A384_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_A512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_A1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}
static unsigned getAVSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_AV32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_AV64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_AV96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_AV128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_AV160_RESTORE;
  case 24:
    return AMDGPU::SI_SPILL_AV192_RESTORE;
  case 28:
    return AMDGPU::SI_SPILL_AV224_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_AV256_RESTORE;
  case 36:
    return AMDGPU::SI_SPILL_AV288_RESTORE;
  case 40:
    return AMDGPU::SI_SPILL_AV320_RESTORE;
  case 44:
    return AMDGPU::SI_SPILL_AV352_RESTORE;
  case 48:
    return AMDGPU::SI_SPILL_AV384_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_AV512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_AV1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
                                            bool IsVectorSuperClass) {
  // Currently, only 32-bit WWM register spills are needed.
  if (Size != 4)
    llvm_unreachable("unknown wwm register spill size");

  if (IsVectorSuperClass)
    return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;

  return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
}
static unsigned
getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
                               unsigned Size, const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &MFI) {
  bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);

  // Choose the right opcode if restoring a WWM register.
  if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
    return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);

  if (IsVectorSuperClass)
    return getAVSpillRestoreOpcode(Size);

  return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
                             : getVGPRSpillRestoreOpcode(Size);
}
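// Reload DestReg from the stack slot at FrameIndex. This mirrors
// storeRegToStackSlot: SGPRs use a restore pseudo, everything else picks a
// sized SI_SPILL_*_RESTORE pseudo via getVectorRegSpillRestoreOpcode.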
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       Register DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI,
                                       Register VReg) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned SpillSize = TRI->getSpillSize(*RC);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
    assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
           DestReg != AMDGPU::EXEC && "exec should not be spilled");

    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
    if (DestReg.isVirtual() && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
    }

    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    BuildMI(MBB, MI, DL, OpDesc, DestReg)
        .addFrameIndex(FrameIndex) // addr
        .addMemOperand(MMO)
        .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
    return;
  }

  unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
                                                   SpillSize, RI, *MFI);
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex)           // vaddr
      .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
      .addImm(0)                           // offset
      .addMemOperand(MMO);
}
void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertNoops(MBB, MI, 1);
}
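// Note that s_nop's immediate encodes the number of wait states minus one
// (0 means a single nop, 7 means eight), which is why the loop below clamps
// each chunk to 8 and emits addImm(Arg - 1).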
void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              unsigned Quantity) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
  }
}
void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
  auto *MF = MBB.getParent();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  assert(Info->isEntryFunction());

  if (MBB.succ_empty()) {
    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    if (HasNoTerminator) {
      if (Info->returnsVoid()) {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
      } else {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
      }
    }
  }
}
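// Expand a simulated trap: raise s_trap, write the masked doorbell ID with
// the queue wave-abort bit set to m0, notify the host with an interrupt
// message, then park the wave in a self-branching s_sethalt loop.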
MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
                                                    MachineBasicBlock &MBB,
                                                    MachineInstr &MI,
                                                    const DebugLoc &DL) const {
  MachineFunction *MF = MBB.getParent();
  constexpr unsigned DoorbellIDMask = 0x3ff;
  constexpr unsigned ECQueueWaveAbort = 0x400;

  MachineBasicBlock *TrapBB = &MBB;
  MachineBasicBlock *ContBB = &MBB;
  MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();

  if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
    ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
    TrapBB = MF->CreateMachineBasicBlock();
    BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
    MF->push_back(TrapBB);
    MBB.addSuccessor(TrapBB);
  }

  // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround
  // this will be a nop.
  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
          DoorbellReg)
      .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
      .addUse(AMDGPU::M0);
  Register DoorbellRegMasked =
      MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
      .addUse(DoorbellReg)
      .addImm(DoorbellIDMask);
  Register SetWaveAbortBit =
      MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
      .addUse(DoorbellRegMasked)
      .addImm(ECQueueWaveAbort);
  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addUse(SetWaveAbortBit);
  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
      .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addUse(AMDGPU::TTMP2);
  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
  TrapBB->addSuccessor(HaltLoopBB);

  BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
  BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
      .addMBB(HaltLoopBB);
  MF->push_back(HaltLoopBB);
  HaltLoopBB->addSuccessor(HaltLoopBB);

  return ContBB;
}
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    if (MI.isMetaInstruction())
      return 0;
    return 1; // FIXME: Do wait states equal cycles?
  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;
  // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
  // hazard, even if one exists, won't really be visible. Should we handle it?
  }
}
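// Post-RA pseudo expansion: lower the *_term terminator copies of scalar ALU
// ops back to their real opcodes, split 64-bit move pseudos when the
// subtarget lacks a 64-bit mov, and expand the indirect-addressing and
// PC-relative addressing pseudos.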
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return TargetInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;

  case AMDGPU::S_MOV_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B32));
    break;

  case AMDGPU::S_XOR_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B64));
    break;

  case AMDGPU::S_XOR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B32));
    break;
  case AMDGPU::S_OR_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_OR_B64));
    break;
  case AMDGPU::S_OR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_OR_B32));
    break;

  case AMDGPU::S_ANDN2_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
    break;

  case AMDGPU::S_ANDN2_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B32));
    break;

  case AMDGPU::S_AND_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_AND_B64));
    break;

  case AMDGPU::S_AND_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_AND_B32));
    break;

  case AMDGPU::S_AND_SAVEEXEC_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
    break;

  case AMDGPU::S_AND_SAVEEXEC_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
    break;

  case AMDGPU::SI_SPILL_S32_TO_VGPR:
    MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
    break;

  case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
    MI.setDesc(get(AMDGPU::V_READLANE_B32));
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    Register Dst = MI.getOperand(0).getReg();
    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI.getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (ST.hasMovB64()) {
      MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
      if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
          isUInt<32>(SrcOp.getImm()))
        break;
    }
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      APInt Lo(32, Imm.getLoBits(32).getZExtValue());
      APInt Hi(32, Imm.getHiBits(32).getZExtValue());
      if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
            .addImm(SISrcMods::OP_SEL_1)
            .addImm(Lo.getSExtValue())
            .addImm(SISrcMods::OP_SEL_1)
            .addImm(Lo.getSExtValue())
            .addImm(0)  // op_sel_lo
            .addImm(0)  // op_sel_hi
            .addImm(0)  // neg_lo
            .addImm(0)  // neg_hi
            .addImm(0); // clamp
      } else {
        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
            .addImm(Lo.getSExtValue())
            .addReg(Dst, RegState::Implicit | RegState::Define);
        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
            .addImm(Hi.getSExtValue())
            .addReg(Dst, RegState::Implicit | RegState::Define);
      }
    } else {
      assert(SrcOp.isReg());
      if (ST.hasPkMovB32() &&
          !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
        BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
            .addImm(SISrcMods::OP_SEL_1) // src0_mod
            .addReg(SrcOp.getReg())
            .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
            .addReg(SrcOp.getReg())
            .addImm(0)  // op_sel_lo
            .addImm(0)  // op_sel_hi
            .addImm(0)  // neg_lo
            .addImm(0)  // neg_hi
            .addImm(0); // clamp
      } else {
        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
            .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
            .addReg(Dst, RegState::Implicit | RegState::Define);
        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
            .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
            .addReg(Dst, RegState::Implicit | RegState::Define);
      }
    }
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
    expandMovDPP64(MI);
    break;
  }
  case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
    const MachineOperand &SrcOp = MI.getOperand(1);
    assert(!SrcOp.isFPImm());
    APInt Imm(64, SrcOp.getImm());
    if (Imm.isIntN(32) || isInlineConstant(Imm)) {
      MI.setDesc(get(AMDGPU::S_MOV_B64));
      break;
    }

    Register Dst = MI.getOperand(0).getReg();
    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    APInt Lo(32, Imm.getLoBits(32).getZExtValue());
    APInt Hi(32, Imm.getHiBits(32).getZExtValue());
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
        .addImm(Lo.getSExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
        .addImm(Hi.getSExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B32: {
    // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
    Register DstReg = MI.getOperand(0).getReg();
    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .add(MI.getOperand(3))
        .add(MI.getOperand(4))
        .add(MI.getOperand(1))
        .add(MI.getOperand(2))
        .add(MI.getOperand(5));
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
    const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);

    unsigned Opc;
    if (RI.hasVGPRs(EltRC)) {
      Opc = AMDGPU::V_MOVRELD_B32_e32;
    } else {
      Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
                                              : AMDGPU::S_MOVRELD_B32;
    }

    const MCInstrDesc &OpDesc = get(Opc);
    Register VecReg = MI.getOperand(0).getReg();
    bool IsUndef = MI.getOperand(1).isUndef();
    unsigned SubReg = MI.getOperand(3).getImm();
    assert(VecReg == MI.getOperand(1).getReg());

    MachineInstrBuilder MIB =
      BuildMI(MBB, MI, DL, OpDesc)
        .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
        .add(MI.getOperand(2))
        .addReg(VecReg, RegState::ImplicitDefine)
        .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));

    const int ImpDefIdx =
        OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
    const int ImpUseIdx = ImpDefIdx + 1;
    MIB->tieOperands(ImpDefIdx, ImpUseIdx);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
    assert(ST.useVGPRIndexMode());
    Register VecReg = MI.getOperand(0).getReg();
    bool IsUndef = MI.getOperand(1).isUndef();
    Register Idx = MI.getOperand(3).getReg();
    Register SubReg = MI.getOperand(4).getImm();

    MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
                              .addReg(Idx)
                              .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
    SetOn->getOperand(3).setIsUndef();

    const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
    MachineInstrBuilder MIB =
        BuildMI(MBB, MI, DL, OpDesc)
            .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
            .add(MI.getOperand(2))
            .addReg(VecReg, RegState::ImplicitDefine)
            .addReg(VecReg,
                    RegState::Implicit | (IsUndef ? RegState::Undef : 0));

    const int ImpDefIdx =
        OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
    const int ImpUseIdx = ImpDefIdx + 1;
    MIB->tieOperands(ImpDefIdx, ImpUseIdx);

    MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));

    finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
    assert(ST.useVGPRIndexMode());
    Register Dst = MI.getOperand(0).getReg();
    Register VecReg = MI.getOperand(1).getReg();
    bool IsUndef = MI.getOperand(1).isUndef();
    Register Idx = MI.getOperand(2).getReg();
    Register SubReg = MI.getOperand(3).getImm();

    MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
                              .addReg(Idx)
                              .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
    SetOn->getOperand(3).setIsUndef();

    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
        .addDef(Dst)
        .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
        .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));

    MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));

    finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    MachineFunction &MF = *MBB.getParent();
    Register Reg = MI.getOperand(0).getReg();
    Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
    MachineOperand OpLo = MI.getOperand(1);
    MachineOperand OpHi = MI.getOperand(2);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // What we want here is an offset from the value returned by s_getpc (which
    // is the address of the s_add_u32 instruction) to the global variable, but
    // since the encoding of $symbol starts 4 bytes after the start of the
    // s_add_u32 instruction, we end up with an offset that is 4 bytes too
    // small. This requires us to add 4 to the global variable offset in order
    // to compute the correct address. Similarly for the s_addc_u32 instruction,
    // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
    // instruction.

    int64_t Adjust = 0;
    if (ST.hasGetPCZeroExtension()) {
      // Fix up hardware that does not sign-extend the 48-bit PC value by
      // inserting: s_sext_i32_i16 reghi, reghi
      Bundler.append(
          BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
      Adjust += 4;
    }

    if (OpLo.isGlobal())
      OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
    Bundler.append(
        BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));

    if (OpHi.isGlobal())
      OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                       .addReg(RegHi)
                       .add(OpHi));

    finalizeBundle(MBB, Bundler.begin());

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::ENTER_STRICT_WWM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // Whole Wave Mode is entered.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                 : AMDGPU::S_OR_SAVEEXEC_B64));
    break;
  }
  case AMDGPU::ENTER_STRICT_WQM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // STRICT_WQM is entered.
    const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
    const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
    BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::EXIT_STRICT_WWM:
  case AMDGPU::EXIT_STRICT_WQM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM/STRICT_WQM is exited.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
    break;
  }
  case AMDGPU::SI_RETURN: {
    const MachineFunction *MF = MBB.getParent();
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    // Hiding the return address use with SI_RETURN may lead to extra kills in
    // the function and missing live-ins. We are fine in practice because callee
    // saved register handling ensures the register value is restored before
    // RET, but we need the undef flag here to appease the MachineVerifier
    // liveness checks.
    MachineInstrBuilder MIB =
        BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
            .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);

    MIB.copyImplicitOps(MI);
    MI.eraseFromParent();
    break;
  }

  case AMDGPU::S_MUL_U64_U32_PSEUDO:
  case AMDGPU::S_MUL_I64_I32_PSEUDO:
    MI.setDesc(get(AMDGPU::S_MUL_U64));
    break;

  case AMDGPU::S_GETPC_B64_pseudo:
    MI.setDesc(get(AMDGPU::S_GETPC_B64));
    if (ST.hasGetPCZeroExtension()) {
      Register Dst = MI.getOperand(0).getReg();
      Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
      // Fix up hardware that does not sign-extend the 48-bit PC value by
      // inserting: s_sext_i32_i16 dsthi, dsthi
      BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
              DstHi)
          .addReg(DstHi);
    }
    break;
  }
  return true;
}
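// Rematerialization hook. For wide scalar loads (S_LOAD_DWORDX8/X16_IMM)
// whose only use reads a single subregister, shrink the rematerialized load
// to a narrower S_LOAD_DWORDX4/X8_IMM with an adjusted offset before
// deferring to the generic implementation.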
void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I, Register DestReg,
                                unsigned SubIdx, const MachineInstr &Orig,
                                const TargetRegisterInfo &RI) const {

  // Try shrinking the instruction to remat only the part needed for current
  // context.
  // TODO: Handle more cases.
  unsigned Opcode = Orig.getOpcode();
  switch (Opcode) {
  case AMDGPU::S_LOAD_DWORDX16_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM: {
    if (SubIdx != 0)
      break;

    if (I == MBB.end())
      break;

    if (I->isBundled())
      break;

    // Look for a single use of the register that is also a subreg.
    Register RegToFind = Orig.getOperand(0).getReg();
    MachineOperand *UseMO = nullptr;
    for (auto &CandMO : I->operands()) {
      if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
        continue;
      if (UseMO) {
        UseMO = nullptr;
        break;
      }
      UseMO = &CandMO;
    }
    if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
      break;

    unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
    unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());

    MachineFunction *MF = MBB.getParent();
    MachineRegisterInfo &MRI = MF->getRegInfo();
    assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");

    unsigned NewOpcode = -1;
    if (SubregSize == 256)
      NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
    else if (SubregSize == 128)
      NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
    else
      break;

    const MCInstrDesc &TID = get(NewOpcode);
    const TargetRegisterClass *NewRC =
        RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
    MRI.setRegClass(DestReg, NewRC);

    UseMO->setReg(DestReg);
    UseMO->setSubReg(AMDGPU::NoSubRegister);

    // Use a smaller load with the desired size, possibly with updated offset.
    MachineInstr *MI = MF->CloneMachineInstr(&Orig);
    MI->setDesc(TID);
    MI->getOperand(0).setReg(DestReg);
    MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);

    MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
    int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
    OffsetMO->setImm(FinalOffset);

    SmallVector<MachineMemOperand *> NewMMOs;
    for (const MachineMemOperand *MemOp : Orig.memoperands())
      NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
                                                 SubregSize / 8));
    MI->setMemRefs(*MF, NewMMOs);

    MBB.insert(I, MI);
    return;
  }

  default:
    break;
  }

  TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
}
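// Split V_MOV_B64_DPP_PSEUDO into two 32-bit DPP moves, one per half of the
// destination, unless the subtarget has a 64-bit DPP mov and the dpp_ctrl
// value is legal for DP ALU DPP, in which case the pseudo is rewritten in
// place.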
std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
  assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);

  if (ST.hasMovB64() &&
      AMDGPU::isLegalDPALU_DPPControl(
        getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
    MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
    return std::pair(&MI, nullptr);
  }

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register Dst = MI.getOperand(0).getReg();
  unsigned Part = 0;
  MachineInstr *Split[2];

  for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
    auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
    if (Dst.isPhysical()) {
      MovDPP.addDef(RI.getSubReg(Dst, Sub));
    } else {
      assert(MRI.isSSA());
      auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      MovDPP.addDef(Tmp);
    }

    for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
      const MachineOperand &SrcOp = MI.getOperand(I);
      assert(!SrcOp.isFPImm());
      if (SrcOp.isImm()) {
        APInt Imm(64, SrcOp.getImm());
        Imm.ashrInPlace(Part * 32);
        MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
      } else {
        assert(SrcOp.isReg());
        Register Src = SrcOp.getReg();
        if (Src.isPhysical())
          MovDPP.addReg(RI.getSubReg(Src, Sub));
        else
          MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
      }
    }

    for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
      MovDPP.addImm(MO.getImm());

    Split[Part] = MovDPP;
    ++Part;
  }

  if (Dst.isVirtual())
    BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
      .addReg(Split[0]->getOperand(0).getReg())
      .addImm(AMDGPU::sub0)
      .addReg(Split[1]->getOperand(0).getReg())
      .addImm(AMDGPU::sub1);

  MI.eraseFromParent();
  return std::pair(Split[0], Split[1]);
}
std::optional<DestSourcePair>
SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
  if (MI.getOpcode() == AMDGPU::WWM_COPY)
    return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};

  return std::nullopt;
}
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
                                      MachineOperand &Src0,
                                      unsigned Src0OpName,
                                      MachineOperand &Src1,
                                      unsigned Src1OpName) const {
  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
  if (!Src0Mods)
    return false;

  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
  assert(Src1Mods &&
         "All commutable instructions have both src0 and src1 modifiers");

  int Src0ModsVal = Src0Mods->getImm();
  int Src1ModsVal = Src1Mods->getImm();

  Src1Mods->setImm(Src0ModsVal);
  Src0Mods->setImm(Src1ModsVal);
  return true;
}
static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
                                             MachineOperand &RegOp,
                                             MachineOperand &NonRegOp) {
  Register Reg = RegOp.getReg();
  unsigned SubReg = RegOp.getSubReg();
  bool IsKill = RegOp.isKill();
  bool IsDead = RegOp.isDead();
  bool IsUndef = RegOp.isUndef();
  bool IsDebug = RegOp.isDebug();

  if (NonRegOp.isImm())
    RegOp.ChangeToImmediate(NonRegOp.getImm());
  else if (NonRegOp.isFI())
    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
  else if (NonRegOp.isGlobal()) {
    RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
                     NonRegOp.getTargetFlags());
  } else
    return nullptr;

  // Make sure we don't reinterpret a subreg index in the target flags.
  RegOp.setTargetFlags(NonRegOp.getTargetFlags());

  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
  NonRegOp.setSubReg(SubReg);

  return &MI;
}
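// Commute the two source operands, including the register/immediate mixes
// the generic implementation cannot handle, and swap the corresponding
// source-modifier and sel operands so they stay attached to the right
// source.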
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                  unsigned Src0Idx,
                                                  unsigned Src1Idx) const {
  assert(!NewMI && "this should never be used");

  unsigned Opc = MI.getOpcode();
  int CommutedOpcode = commuteOpcode(Opc);
  if (CommutedOpcode == -1)
    return nullptr;

  if (Src0Idx > Src1Idx)
    std::swap(Src0Idx, Src1Idx);

  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
           static_cast<int>(Src0Idx) &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
           static_cast<int>(Src1Idx) &&
         "inconsistency with findCommutedOpIndices");

  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  MachineInstr *CommutedMI = nullptr;
  if (Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0)) {
      // Be sure to copy the source modifiers to the right place.
      CommutedMI
        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
    }

  } else if (Src0.isReg() && !Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0))
      CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
  } else if (!Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0))
      CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
  } else {
    // FIXME: Found two non registers to commute. This does happen.
    return nullptr;
  }

  if (CommutedMI) {
    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
                        Src1, AMDGPU::OpName::src1_modifiers);

    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
                        AMDGPU::OpName::src1_sel);

    CommutedMI->setDesc(get(CommutedOpcode));
  }

  return CommutedMI;
}
// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
                                        unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
}

bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
                                        unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  if (!Desc.isCommutable())
    return false;

  unsigned Opc = Desc.getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}
bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                        int64_t BrOffset) const {
  // BranchRelaxation should never have to check s_setpc_b64 because its dest
  // block is unanalyzable.
  assert(BranchOp != AMDGPU::S_SETPC_B64);

  // Convert to dwords.
  BrOffset /= 4;

  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
  // from the next instruction.
  BrOffset -= 1;

  return isIntN(BranchOffsetBits, BrOffset);
}

MachineBasicBlock *
SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  return MI.getOperand(0).getMBB();
}

bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
  for (const MachineInstr &MI : MBB->terminators()) {
    if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
        MI.getOpcode() == AMDGPU::SI_LOOP)
      return true;
  }
  return false;
}
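// Expand an out-of-range unconditional branch: materialize the destination
// address with s_getpc_b64 plus a (DestBB - post_getpc) symbol-difference
// split into offset_lo/offset_hi adds, then jump with s_setpc_b64. If no
// SGPR pair can be scavenged for this, SGPR0_SGPR1 is spilled and restored
// in RestoreBB.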
2856 void SIInstrInfo::insertIndirectBranch(MachineBasicBlock
&MBB
,
2857 MachineBasicBlock
&DestBB
,
2858 MachineBasicBlock
&RestoreBB
,
2859 const DebugLoc
&DL
, int64_t BrOffset
,
2860 RegScavenger
*RS
) const {
2861 assert(RS
&& "RegScavenger required for long branching");
2862 assert(MBB
.empty() &&
2863 "new block should be inserted for expanding unconditional branch");
2864 assert(MBB
.pred_size() == 1);
2865 assert(RestoreBB
.empty() &&
2866 "restore block should be inserted for restoring clobbered registers");
2868 MachineFunction
*MF
= MBB
.getParent();
2869 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
2870 const SIMachineFunctionInfo
*MFI
= MF
->getInfo
<SIMachineFunctionInfo
>();
2872 // FIXME: Virtual register workaround for RegScavenger not working with empty
2874 Register PCReg
= MRI
.createVirtualRegister(&AMDGPU::SReg_64RegClass
);
2878 // Note: as this is used after hazard recognizer we need to apply some hazard
2879 // workarounds directly.
2880 const bool FlushSGPRWrites
= (ST
.isWave64() && ST
.hasVALUMaskWriteHazard()) ||
2881 ST
.hasVALUReadSGPRHazard();
2882 auto ApplyHazardWorkarounds
= [this, &MBB
, &I
, &DL
, FlushSGPRWrites
]() {
2883 if (FlushSGPRWrites
)
2884 BuildMI(MBB
, I
, DL
, get(AMDGPU::S_WAITCNT_DEPCTR
))
2885 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2888 // We need to compute the offset relative to the instruction immediately after
2889 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2890 MachineInstr
*GetPC
= BuildMI(MBB
, I
, DL
, get(AMDGPU::S_GETPC_B64
), PCReg
);
2891 ApplyHazardWorkarounds();
2893 auto &MCCtx
= MF
->getContext();
2894 MCSymbol
*PostGetPCLabel
=
2895 MCCtx
.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2896 GetPC
->setPostInstrSymbol(*MF
, PostGetPCLabel
);
2898 MCSymbol
*OffsetLo
=
2899 MCCtx
.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2900 MCSymbol
*OffsetHi
=
2901 MCCtx
.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2902 BuildMI(MBB
, I
, DL
, get(AMDGPU::S_ADD_U32
))
2903 .addReg(PCReg
, RegState::Define
, AMDGPU::sub0
)
2904 .addReg(PCReg
, 0, AMDGPU::sub0
)
2905 .addSym(OffsetLo
, MO_FAR_BRANCH_OFFSET
);
2906 BuildMI(MBB
, I
, DL
, get(AMDGPU::S_ADDC_U32
))
2907 .addReg(PCReg
, RegState::Define
, AMDGPU::sub1
)
2908 .addReg(PCReg
, 0, AMDGPU::sub1
)
2909 .addSym(OffsetHi
, MO_FAR_BRANCH_OFFSET
);
2910 ApplyHazardWorkarounds();
2912 // Insert the indirect branch after the other terminator.
2913 BuildMI(&MBB
, DL
, get(AMDGPU::S_SETPC_B64
))
2916 // If a spill is needed for the pc register pair, we need to insert a spill
2917 // restore block right before the destination block, and insert a short branch
2918 // into the old destination block's fallthrough predecessor.
2921 // s_cbranch_scc0 skip_long_branch:
2925 // s_getpc_b64 s[8:9]
2926 // s_add_u32 s8, s8, restore_bb
2927 // s_addc_u32 s9, s9, 0
2928 // s_setpc_b64 s[8:9]
2930 // skip_long_branch:
2935 // dest_bb_fallthrough_predecessor:
2941 // fallthrough dest_bb
2946 Register LongBranchReservedReg
= MFI
->getLongBranchReservedReg();
2949 // If we've previously reserved a register for long branches
2950 // avoid running the scavenger and just use those registers
2951 if (LongBranchReservedReg
) {
2952 RS
->enterBasicBlock(MBB
);
2953 Scav
= LongBranchReservedReg
;
2955 RS
->enterBasicBlockEnd(MBB
);
2956 Scav
= RS
->scavengeRegisterBackwards(
2957 AMDGPU::SReg_64RegClass
, MachineBasicBlock::iterator(GetPC
),
2958 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2961 RS
->setRegUsed(Scav
);
2962 MRI
.replaceRegWith(PCReg
, Scav
);
2963 MRI
.clearVirtRegs();
2965 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
2967 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
2968 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
2969 TRI
->spillEmergencySGPR(GetPC
, RestoreBB
, AMDGPU::SGPR0_SGPR1
, RS
);
2970 MRI
.replaceRegWith(PCReg
, AMDGPU::SGPR0_SGPR1
);
2971 MRI
.clearVirtRegs();
2974 MCSymbol
*DestLabel
= Scav
? DestBB
.getSymbol() : RestoreBB
.getSymbol();
2975 // Now, the distance could be defined.
2976 auto *Offset
= MCBinaryExpr::createSub(
2977 MCSymbolRefExpr::create(DestLabel
, MCCtx
),
2978 MCSymbolRefExpr::create(PostGetPCLabel
, MCCtx
), MCCtx
);
2979 // Add offset assignments.
2980 auto *Mask
= MCConstantExpr::create(0xFFFFFFFFULL
, MCCtx
);
2981 OffsetLo
->setVariableValue(MCBinaryExpr::createAnd(Offset
, Mask
, MCCtx
));
2982 auto *ShAmt
= MCConstantExpr::create(32, MCCtx
);
2983 OffsetHi
->setVariableValue(MCBinaryExpr::createAShr(Offset
, ShAmt
, MCCtx
));
unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
  switch (Cond) {
  case SIInstrInfo::SCC_TRUE:
    return AMDGPU::S_CBRANCH_SCC1;
  case SIInstrInfo::SCC_FALSE:
    return AMDGPU::S_CBRANCH_SCC0;
  case SIInstrInfo::VCCNZ:
    return AMDGPU::S_CBRANCH_VCCNZ;
  case SIInstrInfo::VCCZ:
    return AMDGPU::S_CBRANCH_VCCZ;
  case SIInstrInfo::EXECNZ:
    return AMDGPU::S_CBRANCH_EXECNZ;
  case SIInstrInfo::EXECZ:
    return AMDGPU::S_CBRANCH_EXECZ;
  default:
    llvm_unreachable("invalid branch predicate");
  }
}

SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_CBRANCH_SCC0:
    return SCC_FALSE;
  case AMDGPU::S_CBRANCH_SCC1:
    return SCC_TRUE;
  case AMDGPU::S_CBRANCH_VCCNZ:
    return VCCNZ;
  case AMDGPU::S_CBRANCH_VCCZ:
    return VCCZ;
  case AMDGPU::S_CBRANCH_EXECNZ:
    return EXECNZ;
  case AMDGPU::S_CBRANCH_EXECZ:
    return EXECZ;
  default:
    return INVALID_BR;
  }
}
3024 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock
&MBB
,
3025 MachineBasicBlock::iterator I
,
3026 MachineBasicBlock
*&TBB
,
3027 MachineBasicBlock
*&FBB
,
3028 SmallVectorImpl
<MachineOperand
> &Cond
,
3029 bool AllowModify
) const {
3030 if (I
->getOpcode() == AMDGPU::S_BRANCH
) {
3031 // Unconditional Branch
3032 TBB
= I
->getOperand(0).getMBB();
3036 BranchPredicate Pred
= getBranchPredicate(I
->getOpcode());
3037 if (Pred
== INVALID_BR
)
3040 MachineBasicBlock
*CondBB
= I
->getOperand(0).getMBB();
3041 Cond
.push_back(MachineOperand::CreateImm(Pred
));
3042 Cond
.push_back(I
->getOperand(1)); // Save the branch register.
3046 if (I
== MBB
.end()) {
3047 // Conditional branch followed by fall-through.
3052 if (I
->getOpcode() == AMDGPU::S_BRANCH
) {
3054 FBB
= I
->getOperand(0).getMBB();
3061 bool SIInstrInfo::analyzeBranch(MachineBasicBlock
&MBB
, MachineBasicBlock
*&TBB
,
3062 MachineBasicBlock
*&FBB
,
3063 SmallVectorImpl
<MachineOperand
> &Cond
,
3064 bool AllowModify
) const {
3065 MachineBasicBlock::iterator I
= MBB
.getFirstTerminator();
3070 // Skip over the instructions that are artificially terminators for special
3072 while (I
!= E
&& !I
->isBranch() && !I
->isReturn()) {
3073 switch (I
->getOpcode()) {
3074 case AMDGPU::S_MOV_B64_term
:
3075 case AMDGPU::S_XOR_B64_term
:
3076 case AMDGPU::S_OR_B64_term
:
3077 case AMDGPU::S_ANDN2_B64_term
:
3078 case AMDGPU::S_AND_B64_term
:
3079 case AMDGPU::S_AND_SAVEEXEC_B64_term
:
3080 case AMDGPU::S_MOV_B32_term
:
3081 case AMDGPU::S_XOR_B32_term
:
3082 case AMDGPU::S_OR_B32_term
:
3083 case AMDGPU::S_ANDN2_B32_term
:
3084 case AMDGPU::S_AND_B32_term
:
3085 case AMDGPU::S_AND_SAVEEXEC_B32_term
:
3088 case AMDGPU::SI_ELSE
:
3089 case AMDGPU::SI_KILL_I1_TERMINATOR
:
3090 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR
:
3091 // FIXME: It's messy that these need to be considered here at all.
3094 llvm_unreachable("unexpected non-branch terminator inst");
3103 return analyzeBranchImpl(MBB
, I
, TBB
, FBB
, Cond
, AllowModify
);
3106 unsigned SIInstrInfo::removeBranch(MachineBasicBlock
&MBB
,
3107 int *BytesRemoved
) const {
3109 unsigned RemovedSize
= 0;
3110 for (MachineInstr
&MI
: llvm::make_early_inc_range(MBB
.terminators())) {
3111 // Skip over artificial terminators when removing instructions.
3112 if (MI
.isBranch() || MI
.isReturn()) {
3113 RemovedSize
+= getInstSizeInBytes(MI
);
3114 MI
.eraseFromParent();
3120 *BytesRemoved
= RemovedSize
;
// Copy the flags onto the implicit condition register operand.
static void preserveCondRegFlags(MachineOperand &CondReg,
                                 const MachineOperand &OrigCond) {
  CondReg.setIsUndef(OrigCond.isUndef());
  CondReg.setIsKill(OrigCond.isKill());
}
3132 unsigned SIInstrInfo::insertBranch(MachineBasicBlock
&MBB
,
3133 MachineBasicBlock
*TBB
,
3134 MachineBasicBlock
*FBB
,
3135 ArrayRef
<MachineOperand
> Cond
,
3137 int *BytesAdded
) const {
3138 if (!FBB
&& Cond
.empty()) {
3139 BuildMI(&MBB
, DL
, get(AMDGPU::S_BRANCH
))
3142 *BytesAdded
= ST
.hasOffset3fBug() ? 8 : 4;
3146 assert(TBB
&& Cond
[0].isImm());
3149 = getBranchOpcode(static_cast<BranchPredicate
>(Cond
[0].getImm()));
3152 MachineInstr
*CondBr
=
3153 BuildMI(&MBB
, DL
, get(Opcode
))
3156 // Copy the flags onto the implicit condition register operand.
3157 preserveCondRegFlags(CondBr
->getOperand(1), Cond
[1]);
3158 fixImplicitOperands(*CondBr
);
3161 *BytesAdded
= ST
.hasOffset3fBug() ? 8 : 4;
3167 MachineInstr
*CondBr
=
3168 BuildMI(&MBB
, DL
, get(Opcode
))
3170 fixImplicitOperands(*CondBr
);
3171 BuildMI(&MBB
, DL
, get(AMDGPU::S_BRANCH
))
3174 MachineOperand
&CondReg
= CondBr
->getOperand(1);
3175 CondReg
.setIsUndef(Cond
[1].isUndef());
3176 CondReg
.setIsKill(Cond
[1].isKill());
3179 *BytesAdded
= ST
.hasOffset3fBug() ? 16 : 8;
bool SIInstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond.size() != 2) {
    return true;
  }

  if (Cond[0].isImm()) {
    Cond[0].setImm(-Cond[0].getImm());
    return false;
  }

  return true;
}
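// If-conversion cost model: a select is counted as one v_cndmask_b32 (or
// s_cselect) per 32 bits of the register class; VCC-based selects are only
// allowed on VGPR classes and SCC-based selects only on SGPR classes.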
3198 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock
&MBB
,
3199 ArrayRef
<MachineOperand
> Cond
,
3200 Register DstReg
, Register TrueReg
,
3201 Register FalseReg
, int &CondCycles
,
3202 int &TrueCycles
, int &FalseCycles
) const {
3203 switch (Cond
[0].getImm()) {
3206 const MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
3207 const TargetRegisterClass
*RC
= MRI
.getRegClass(TrueReg
);
3208 if (MRI
.getRegClass(FalseReg
) != RC
)
3211 int NumInsts
= AMDGPU::getRegBitWidth(*RC
) / 32;
3212 CondCycles
= TrueCycles
= FalseCycles
= NumInsts
; // ???
3214 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3215 return RI
.hasVGPRs(RC
) && NumInsts
<= 6;
3219 // FIXME: We could insert for VGPRs if we could replace the original compare
3220 // with a vector one.
3221 const MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
3222 const TargetRegisterClass
*RC
= MRI
.getRegClass(TrueReg
);
3223 if (MRI
.getRegClass(FalseReg
) != RC
)
3226 int NumInsts
= AMDGPU::getRegBitWidth(*RC
) / 32;
3228 // Multiples of 8 can do s_cselect_b64
3229 if (NumInsts
% 2 == 0)
3232 CondCycles
= TrueCycles
= FalseCycles
= NumInsts
; // ???
3233 return RI
.isSGPRClass(RC
);
3240 void SIInstrInfo::insertSelect(MachineBasicBlock
&MBB
,
3241 MachineBasicBlock::iterator I
, const DebugLoc
&DL
,
3242 Register DstReg
, ArrayRef
<MachineOperand
> Cond
,
3243 Register TrueReg
, Register FalseReg
) const {
3244 BranchPredicate Pred
= static_cast<BranchPredicate
>(Cond
[0].getImm());
3245 if (Pred
== VCCZ
|| Pred
== SCC_FALSE
) {
3246 Pred
= static_cast<BranchPredicate
>(-Pred
);
3247 std::swap(TrueReg
, FalseReg
);
3250 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
3251 const TargetRegisterClass
*DstRC
= MRI
.getRegClass(DstReg
);
3252 unsigned DstSize
= RI
.getRegSizeInBits(*DstRC
);
3254 if (DstSize
== 32) {
3255 MachineInstr
*Select
;
3256 if (Pred
== SCC_TRUE
) {
3257 Select
= BuildMI(MBB
, I
, DL
, get(AMDGPU::S_CSELECT_B32
), DstReg
)
3261 // Instruction's operands are backwards from what is expected.
3262 Select
= BuildMI(MBB
, I
, DL
, get(AMDGPU::V_CNDMASK_B32_e32
), DstReg
)
3267 preserveCondRegFlags(Select
->getOperand(3), Cond
[1]);
3271 if (DstSize
== 64 && Pred
== SCC_TRUE
) {
3272 MachineInstr
*Select
=
3273 BuildMI(MBB
, I
, DL
, get(AMDGPU::S_CSELECT_B64
), DstReg
)
3277 preserveCondRegFlags(Select
->getOperand(3), Cond
[1]);
3281 static const int16_t Sub0_15
[] = {
3282 AMDGPU::sub0
, AMDGPU::sub1
, AMDGPU::sub2
, AMDGPU::sub3
,
3283 AMDGPU::sub4
, AMDGPU::sub5
, AMDGPU::sub6
, AMDGPU::sub7
,
3284 AMDGPU::sub8
, AMDGPU::sub9
, AMDGPU::sub10
, AMDGPU::sub11
,
3285 AMDGPU::sub12
, AMDGPU::sub13
, AMDGPU::sub14
, AMDGPU::sub15
,
3288 static const int16_t Sub0_15_64
[] = {
3289 AMDGPU::sub0_sub1
, AMDGPU::sub2_sub3
,
3290 AMDGPU::sub4_sub5
, AMDGPU::sub6_sub7
,
3291 AMDGPU::sub8_sub9
, AMDGPU::sub10_sub11
,
3292 AMDGPU::sub12_sub13
, AMDGPU::sub14_sub15
,
3295 unsigned SelOp
= AMDGPU::V_CNDMASK_B32_e32
;
3296 const TargetRegisterClass
*EltRC
= &AMDGPU::VGPR_32RegClass
;
3297 const int16_t *SubIndices
= Sub0_15
;
3298 int NElts
= DstSize
/ 32;
3300 // 64-bit select is only available for SALU.
3301 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3302 if (Pred
== SCC_TRUE
) {
3304 SelOp
= AMDGPU::S_CSELECT_B32
;
3305 EltRC
= &AMDGPU::SGPR_32RegClass
;
3307 SelOp
= AMDGPU::S_CSELECT_B64
;
3308 EltRC
= &AMDGPU::SGPR_64RegClass
;
3309 SubIndices
= Sub0_15_64
;
3314 MachineInstrBuilder MIB
= BuildMI(
3315 MBB
, I
, DL
, get(AMDGPU::REG_SEQUENCE
), DstReg
);
3317 I
= MIB
->getIterator();
3319 SmallVector
<Register
, 8> Regs
;
3320 for (int Idx
= 0; Idx
!= NElts
; ++Idx
) {
3321 Register DstElt
= MRI
.createVirtualRegister(EltRC
);
3322 Regs
.push_back(DstElt
);
3324 unsigned SubIdx
= SubIndices
[Idx
];
3326 MachineInstr
*Select
;
3327 if (SelOp
== AMDGPU::V_CNDMASK_B32_e32
) {
3329 BuildMI(MBB
, I
, DL
, get(SelOp
), DstElt
)
3330 .addReg(FalseReg
, 0, SubIdx
)
3331 .addReg(TrueReg
, 0, SubIdx
);
3334 BuildMI(MBB
, I
, DL
, get(SelOp
), DstElt
)
3335 .addReg(TrueReg
, 0, SubIdx
)
3336 .addReg(FalseReg
, 0, SubIdx
);
3339 preserveCondRegFlags(Select
->getOperand(3), Cond
[1]);
3340 fixImplicitOperands(*Select
);
bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B16_t16_e32:
  case AMDGPU::V_MOV_B16_t16_e64:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_MOV_B64_e64:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_IMM_PSEUDO:
  case AMDGPU::COPY:
  case AMDGPU::WWM_COPY:
  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
  case AMDGPU::V_ACCVGPR_READ_B32_e64:
  case AMDGPU::V_ACCVGPR_MOV_B32:
    return true;
  default:
    return false;
  }
}

static constexpr unsigned ModifierOpNames[] = {
    AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
    AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
    AMDGPU::OpName::omod,           AMDGPU::OpName::op_sel};

void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  for (unsigned Name : reverse(ModifierOpNames)) {
    int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
    if (Idx >= 0)
      MI.removeOperand(Idx);
  }
}
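// Fold the immediate defined by DefMI into its single non-debug user: either
// turn a plain COPY into the corresponding move-immediate, or rewrite a
// MAD/MAC/FMA/FMAC user into the madmk/madak (fmamk/fmaak) form that takes a
// literal operand.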
3384 bool SIInstrInfo::foldImmediate(MachineInstr
&UseMI
, MachineInstr
&DefMI
,
3385 Register Reg
, MachineRegisterInfo
*MRI
) const {
3386 if (!MRI
->hasOneNonDBGUse(Reg
))
3389 switch (DefMI
.getOpcode()) {
3392 case AMDGPU::V_MOV_B64_e32
:
3393 case AMDGPU::S_MOV_B64
:
3394 case AMDGPU::V_MOV_B64_PSEUDO
:
3395 case AMDGPU::S_MOV_B64_IMM_PSEUDO
:
3396 case AMDGPU::V_MOV_B32_e32
:
3397 case AMDGPU::S_MOV_B32
:
3398 case AMDGPU::V_ACCVGPR_WRITE_B32_e64
:
3402 const MachineOperand
*ImmOp
= getNamedOperand(DefMI
, AMDGPU::OpName::src0
);
3404 // FIXME: We could handle FrameIndex values here.
3405 if (!ImmOp
->isImm())
3408 auto getImmFor
= [ImmOp
](const MachineOperand
&UseOp
) -> int64_t {
3409 int64_t Imm
= ImmOp
->getImm();
3410 switch (UseOp
.getSubReg()) {
3418 return SignExtend64
<16>(Imm
);
3420 return SignExtend64
<16>(Imm
>> 16);
3421 case AMDGPU::sub1_lo16
:
3422 return SignExtend64
<16>(Imm
>> 32);
3423 case AMDGPU::sub1_hi16
:
3424 return SignExtend64
<16>(Imm
>> 48);
3428 assert(!DefMI
.getOperand(0).getSubReg() && "Expected SSA form");
3430 unsigned Opc
= UseMI
.getOpcode();
3431 if (Opc
== AMDGPU::COPY
) {
3432 assert(!UseMI
.getOperand(0).getSubReg() && "Expected SSA form");
3434 Register DstReg
= UseMI
.getOperand(0).getReg();
3435 unsigned OpSize
= getOpSize(UseMI
, 0);
3436 bool Is16Bit
= OpSize
== 2;
3437 bool Is64Bit
= OpSize
== 8;
3438 bool isVGPRCopy
= RI
.isVGPR(*MRI
, DstReg
);
3439 unsigned NewOpc
= isVGPRCopy
? Is64Bit
? AMDGPU::V_MOV_B64_PSEUDO
3440 : AMDGPU::V_MOV_B32_e32
3441 : Is64Bit
? AMDGPU::S_MOV_B64_IMM_PSEUDO
3442 : AMDGPU::S_MOV_B32
;
3443 APInt
Imm(Is64Bit
? 64 : 32, getImmFor(UseMI
.getOperand(1)),
3444 /*isSigned=*/true, /*implicitTrunc=*/true);
3446 if (RI
.isAGPR(*MRI
, DstReg
)) {
3447 if (Is64Bit
|| !isInlineConstant(Imm
))
3449 NewOpc
= AMDGPU::V_ACCVGPR_WRITE_B32_e64
;
3454 return false; // Do not clobber vgpr_hi16
3456 if (DstReg
.isVirtual() && UseMI
.getOperand(0).getSubReg() != AMDGPU::lo16
)
3459 UseMI
.getOperand(0).setSubReg(0);
3460 if (DstReg
.isPhysical()) {
3461 DstReg
= RI
.get32BitRegister(DstReg
);
3462 UseMI
.getOperand(0).setReg(DstReg
);
3464 assert(UseMI
.getOperand(1).getReg().isVirtual());
3467 const MCInstrDesc
&NewMCID
= get(NewOpc
);
3468 if (DstReg
.isPhysical() &&
3469 !RI
.getRegClass(NewMCID
.operands()[0].RegClass
)->contains(DstReg
))
3472 UseMI
.setDesc(NewMCID
);
3473 UseMI
.getOperand(1).ChangeToImmediate(Imm
.getSExtValue());
3474 UseMI
.addImplicitDefUseOperands(*UseMI
.getParent()->getParent());
3478 if (Opc
== AMDGPU::V_MAD_F32_e64
|| Opc
== AMDGPU::V_MAC_F32_e64
||
3479 Opc
== AMDGPU::V_MAD_F16_e64
|| Opc
== AMDGPU::V_MAC_F16_e64
||
3480 Opc
== AMDGPU::V_FMA_F32_e64
|| Opc
== AMDGPU::V_FMAC_F32_e64
||
3481 Opc
== AMDGPU::V_FMA_F16_e64
|| Opc
== AMDGPU::V_FMAC_F16_e64
||
3482 Opc
== AMDGPU::V_FMAC_F16_fake16_e64
) {
3483 // Don't fold if we are using source or output modifiers. The new VOP2
3484 // instructions don't have them.
3485 if (hasAnyModifiersSet(UseMI
))
3488 // If this is a free constant, there's no reason to do this.
3489 // TODO: We could fold this here instead of letting SIFoldOperands do it
3491 MachineOperand
*Src0
= getNamedOperand(UseMI
, AMDGPU::OpName::src0
);
3493 // Any src operand can be used for the legality check.
3494 if (isInlineConstant(UseMI
, *Src0
, *ImmOp
))
3497 bool IsF32
= Opc
== AMDGPU::V_MAD_F32_e64
|| Opc
== AMDGPU::V_MAC_F32_e64
||
3498 Opc
== AMDGPU::V_FMA_F32_e64
|| Opc
== AMDGPU::V_FMAC_F32_e64
;
3500 Opc
== AMDGPU::V_FMA_F32_e64
|| Opc
== AMDGPU::V_FMAC_F32_e64
||
3501 Opc
== AMDGPU::V_FMA_F16_e64
|| Opc
== AMDGPU::V_FMAC_F16_e64
||
3502 Opc
== AMDGPU::V_FMAC_F16_fake16_e64
;
3503 MachineOperand
*Src1
        = getNamedOperand(UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_{f16, f32}.
    if ((Src0->isReg() && Src0->getReg() == Reg) ||
        (Src1->isReg() && Src1->getReg() == Reg)) {
      MachineOperand *RegSrc =
          Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
      if (!RegSrc->isReg())
        return false;
      if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
          ST.getConstantBusLimit(Opc) < 2)
        return false;

      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
        return false;

      // If src2 is also a literal constant then we have to choose which one to
      // fold. In general it is better to choose madak so that the other literal
      // can be materialized in an sgpr instead of a vgpr:
      //   s_mov_b32 s0, literal
      //   v_madak_f32 v0, s0, v0, literal
      //
      //   v_mov_b32 v1, literal
      //   v_madmk_f32 v0, v0, literal, v1
      MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
      if (Def && Def->isMoveImmediate() &&
          !isInlineConstant(Def->getOperand(1)))
        return false;

      unsigned NewOpc =
          IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
                   : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
                                            : AMDGPU::V_FMAMK_F16)
                : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
      if (pseudoToMCOpcode(NewOpc) == -1)
        return false;

      // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
      // would also require restricting their register classes. For now
      // just bail out.
      if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
        return false;

      const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      Register SrcReg = RegSrc->getReg();
      unsigned SrcSubReg = RegSrc->getSubReg();
      Src0->setReg(SrcReg);
      Src0->setSubReg(SrcSubReg);
      Src0->setIsKill(RegSrc->isKill());

      if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
          Opc == AMDGPU::V_FMAC_F32_e64 ||
          Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
        UseMI.untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      Src1->ChangeToImmediate(Imm);

      removeModOperands(UseMI);
      UseMI.setDesc(get(NewOpc));

      bool DeleteDef = MRI->use_nodbg_empty(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_{f16, f32}.
    if (Src2->isReg() && Src2->getReg() == Reg) {
      if (ST.getConstantBusLimit(Opc) < 2) {
        // Not allowed to use constant bus for another operand.
        // We can however allow an inline immediate as src0.
        bool Src0Inlined = false;
        if (Src0->isReg()) {
          // Try to inline constant if possible.
          // If the Def moves immediate and the use is single
          // We are saving VGPR here.
          MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
          if (Def && Def->isMoveImmediate() &&
              isInlineConstant(Def->getOperand(1)) &&
              MRI->hasOneUse(Src0->getReg())) {
            Src0->ChangeToImmediate(Def->getOperand(1).getImm());
            Src0Inlined = true;
          } else if (ST.getConstantBusLimit(Opc) <= 1 &&
                     RI.isSGPRReg(*MRI, Src0->getReg())) {
            return false;
          }
          // VGPR is okay as Src0 - fallthrough
        }

        if (Src1->isReg() && !Src0Inlined) {
          // We have one slot for inlinable constant so far - try to fill it
          MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
          if (Def && Def->isMoveImmediate() &&
              isInlineConstant(Def->getOperand(1)) &&
              MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
            Src0->ChangeToImmediate(Def->getOperand(1).getImm());
          else if (RI.isSGPRReg(*MRI, Src1->getReg()))
            return false;
          // VGPR is okay as Src1 - fallthrough
        }
      }

      unsigned NewOpc =
          IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
                   : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
                                            : AMDGPU::V_FMAAK_F16)
                : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
      if (pseudoToMCOpcode(NewOpc) == -1)
        return false;

      // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
      // would also require restricting their register classes. For now
      // just bail out.
      if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
        return false;

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
          Opc == AMDGPU::V_FMAC_F32_e64 ||
          Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
        UseMI.untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      // ChangingToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(getImmFor(*Src2));

      // These come before src2.
      removeModOperands(UseMI);
      UseMI.setDesc(get(NewOpc));
      // It might happen that UseMI was commuted
      // and we now have SGPR as SRC1. If so 2 inlined
      // constant and SGPR are illegal.
      legalizeOperands(UseMI);

      bool DeleteDef = MRI->use_nodbg_empty(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }
  }

  return false;
}
bool SIInstrInfo::memOpsHaveSameBaseOperands(
    ArrayRef<const MachineOperand *> BaseOps1,
    ArrayRef<const MachineOperand *> BaseOps2) {
  if (BaseOps1.size() != BaseOps2.size())
    return false;
  for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
    if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
      return false;
  }
  return true;
}
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
                                LocationSize WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowWidth.hasValue() &&
         LowOffset + (int)LowWidth.getValue() <= HighOffset;
}
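
// Descriptive note (added): the function below is a best-effort disjointness
// check. It compares the base operands of both memory instructions and, when
// each has a single memory operand of known size, verifies that the accessed
// byte ranges cannot overlap; any case it cannot analyze conservatively
// returns false.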
bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
                                               const MachineInstr &MIb) const {
  SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
  int64_t Offset0, Offset1;
  LocationSize Dummy0 = 0, Dummy1 = 0;
  bool Offset0IsScalable, Offset1IsScalable;
  if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
                                     Dummy0, &RI) ||
      !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
                                     Dummy1, &RI))
    return false;

  if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
    return false;

  if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
    // FIXME: Handle ds_read2 / ds_write2.
    return false;
  }
  LocationSize Width0 = MIa.memoperands().front()->getSize();
  LocationSize Width1 = MIb.memoperands().front()->getSize();
  return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
}
bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
                                                  const MachineInstr &MIb) const {
  assert(MIa.mayLoadOrStore() &&
         "MIa must load from or modify a memory location");
  assert(MIb.mayLoadOrStore() &&
         "MIb must load from or modify a memory location");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
    return false;

  // XXX - Can we relax this between address spaces?
  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  if (isLDSDMA(MIa) || isLDSDMA(MIb))
    return false;

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
  if (isDS(MIa)) {
    if (isDS(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
  }

  if (isMUBUF(MIa) || isMTBUF(MIa)) {
    if (isMUBUF(MIb) || isMTBUF(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    if (isFLAT(MIb))
      return isFLATScratch(MIb);

    return !isSMRD(MIb);
  }

  if (isSMRD(MIa)) {
    if (isSMRD(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    if (isFLAT(MIb))
      return isFLATScratch(MIb);

    return !isMUBUF(MIb) && !isMTBUF(MIb);
  }

  if (isFLAT(MIa)) {
    if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
        (isFLATGlobal(MIa) && isFLATScratch(MIb)))
      return false;

    if (isFLAT(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);
  }

  return false;
}
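
// Descriptive note (added): the getFoldableImm helpers below report whether a
// value (given as a virtual register or as a machine operand) is produced by
// a foldable immediate-copy; on success they return the immediate and,
// optionally, the defining instruction so the caller can later kill it.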
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
                           int64_t &Imm, MachineInstr **DefMI = nullptr) {
  if (Reg.isPhysical())
    return false;
  auto *Def = MRI.getUniqueVRegDef(Reg);
  if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
    Imm = Def->getOperand(1).getImm();
    if (DefMI)
      *DefMI = Def;
    return true;
  }
  return false;
}

static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
                           MachineInstr **DefMI = nullptr) {
  if (!MO->isReg())
    return false;
  const MachineFunction *MF = MO->getParent()->getParent()->getParent();
  const MachineRegisterInfo &MRI = MF->getRegInfo();
  return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
}
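
// Descriptive note (added): when LiveVariables is available, the helper below
// transfers kill flags recorded on MI's register operands so they become
// associated with the replacement instruction NewMI instead.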
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
                                MachineInstr &NewMI) {
  if (LV) {
    unsigned NumOps = MI.getNumOperands();
    for (unsigned I = 1; I < NumOps; ++I) {
      MachineOperand &Op = MI.getOperand(I);
      if (Op.isReg() && Op.isKill())
        LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
    }
  }
}
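
// Descriptive note (added): convertToThreeAddress below rewrites two-address
// MAC/FMAC forms (and MFMA / WMMA pseudos) into three-address MAD/FMA or
// MADAK/MADMK/FMAAK/FMAMK equivalents, keeping LiveVariables and
// LiveIntervals consistent when they are provided; it returns the new
// instruction or nullptr if no conversion applies.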
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
                                                 LiveVariables *LV,
                                                 LiveIntervals *LIS) const {
  MachineBasicBlock &MBB = *MI.getParent();
  unsigned Opc = MI.getOpcode();

  int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
  if (NewMFMAOpc != -1) {
    MachineInstrBuilder MIB =
        BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
      MIB.add(MI.getOperand(I));
    updateLiveVariables(LV, MI, *MIB);
    if (LIS) {
      LIS->ReplaceMachineInstrInMaps(MI, *MIB);
      // SlotIndex of defs needs to be updated when converting to early-clobber
      MachineOperand &Def = MIB->getOperand(0);
      if (Def.isEarlyClobber() && Def.isReg() &&
          LIS->hasInterval(Def.getReg())) {
        SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
        SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
        auto &LI = LIS->getInterval(Def.getReg());
        auto UpdateDefIndex = [&](LiveRange &LR) {
          auto *S = LR.find(OldIndex);
          if (S != LR.end() && S->start == OldIndex) {
            assert(S->valno && S->valno->def == OldIndex);
            S->start = NewIndex;
            S->valno->def = NewIndex;
          }
        };
        UpdateDefIndex(LI);
        for (auto &SR : LI.subranges())
          UpdateDefIndex(SR);
      }
    }

    return MIB;
  }

  if (SIInstrInfo::isWMMA(MI)) {
    unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
    MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                                  .setMIFlags(MI.getFlags());
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
      MIB->addOperand(MI.getOperand(I));

    updateLiveVariables(LV, MI, *MIB);
    if (LIS)
      LIS->ReplaceMachineInstrInMaps(MI, *MIB);

    return MIB;
  }

  assert(
      Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
      "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
      "pre-RA");

  bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
               Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
               Opc == AMDGPU::V_FMAC_F16_fake16_e64;
  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
               Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
               Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
               Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
               Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
               Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
  bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
  bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
                  Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
                  Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
                  Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
  bool Src0Literal = false;

  switch (Opc) {
  default:
    return nullptr;
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_fake16_e64:
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAC_LEGACY_F32_e64:
  case AMDGPU::V_FMAC_F32_e64:
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
  case AMDGPU::V_FMAC_F64_e64:
    break;
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_MAC_F32_e32:
  case AMDGPU::V_MAC_LEGACY_F32_e32:
  case AMDGPU::V_FMAC_F32_e32:
  case AMDGPU::V_FMAC_LEGACY_F32_e32:
  case AMDGPU::V_FMAC_F64_e32: {
    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                             AMDGPU::OpName::src0);
    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
    if (!Src0->isReg() && !Src0->isImm())
      return nullptr;

    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
      Src0Literal = true;

    break;
  }
  }

  MachineInstrBuilder MIB;
  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
  const MachineOperand *Src0Mods =
      getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src1Mods =
      getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
  const MachineOperand *Src2Mods =
      getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
  const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);

  if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
      !IsLegacy &&
      // If we have an SGPR input, we will violate the constant bus restriction.
      (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
       !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
    MachineInstr *DefMI;
    const auto killDef = [&]() -> void {
      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
      // The only user is the instruction which will be killed.
      Register DefReg = DefMI->getOperand(0).getReg();

      if (MRI.hasOneNonDBGUse(DefReg)) {
        // We cannot just remove the DefMI here, calling pass will crash.
        DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
        DefMI->getOperand(0).setIsDead(true);
        for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
          DefMI->removeOperand(I);
        if (LV)
          LV->getVarInfo(DefReg).AliveBlocks.clear();
      }

      if (LIS) {
        LiveInterval &DefLI = LIS->getInterval(DefReg);

        // We cannot delete the original instruction here, so hack out the use
        // in the original instruction with a dummy register so we can use
        // shrinkToUses to deal with any multi-use edge cases. Other targets do
        // not have the complexity of deleting a use to consider here.
        Register DummyReg = MRI.cloneVirtualRegister(DefReg);
        for (MachineOperand &MIOp : MI.uses()) {
          if (MIOp.isReg() && MIOp.getReg() == DefReg) {
            MIOp.setIsUndef(true);
            MIOp.setReg(DummyReg);
          }
        }

        LIS->shrinkToUses(&DefLI);
      }
    };

    int64_t Imm;
    if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
      unsigned NewOpc =
          IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
                                                   : AMDGPU::V_FMAAK_F16)
                         : AMDGPU::V_FMAAK_F32)
                : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
      if (pseudoToMCOpcode(NewOpc) != -1) {
        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src0)
                  .add(*Src1)
                  .addImm(Imm)
                  .setMIFlags(MI.getFlags());
        updateLiveVariables(LV, MI, *MIB);
        if (LIS)
          LIS->ReplaceMachineInstrInMaps(MI, *MIB);
        killDef();
        return MIB;
      }
    }
    unsigned NewOpc =
        IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
                                                 : AMDGPU::V_FMAMK_F16)
                       : AMDGPU::V_FMAMK_F32)
              : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
    if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
      if (pseudoToMCOpcode(NewOpc) != -1) {
        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src0)
                  .addImm(Imm)
                  .add(*Src2)
                  .setMIFlags(MI.getFlags());
        updateLiveVariables(LV, MI, *MIB);
        if (LIS)
          LIS->ReplaceMachineInstrInMaps(MI, *MIB);
        killDef();
        return MIB;
      }
    }
    if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
      if (Src0Literal) {
        Imm = Src0->getImm();
        DefMI = nullptr;
      }
      if (pseudoToMCOpcode(NewOpc) != -1 &&
          isOperandLegal(
              MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
              Src1)) {
        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src1)
                  .addImm(Imm)
                  .add(*Src2)
                  .setMIFlags(MI.getFlags());
        updateLiveVariables(LV, MI, *MIB);
        if (LIS)
          LIS->ReplaceMachineInstrInMaps(MI, *MIB);
        if (DefMI)
          killDef();
        return MIB;
      }
    }
  }

  // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
  // if VOP3 does not allow a literal operand.
  if (Src0Literal && !ST.hasVOP3Literal())
    return nullptr;

  unsigned NewOpc =
      IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
              : IsF64 ? AMDGPU::V_FMA_F64_e64
              : IsLegacy ? AMDGPU::V_FMA_LEGACY_F32_e64
                         : AMDGPU::V_FMA_F32_e64
            : IsF16 ? AMDGPU::V_MAD_F16_e64
              : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
                         : AMDGPU::V_MAD_F32_e64;
  if (pseudoToMCOpcode(NewOpc) == -1)
    return nullptr;

  MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
            .add(*Dst)
            .addImm(Src0Mods ? Src0Mods->getImm() : 0)
            .add(*Src0)
            .addImm(Src1Mods ? Src1Mods->getImm() : 0)
            .add(*Src1)
            .addImm(Src2Mods ? Src2Mods->getImm() : 0)
            .add(*Src2)
            .addImm(Clamp ? Clamp->getImm() : 0)
            .addImm(Omod ? Omod->getImm() : 0)
            .setMIFlags(MI.getFlags());
  if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
    MIB.addImm(OpSel ? OpSel->getImm() : 0);
  updateLiveVariables(LV, MI, *MIB);
  if (LIS)
    LIS->ReplaceMachineInstrInMaps(MI, *MIB);

  return MIB;
}
// It's not generally safe to move VALU instructions across these since it will
// start using the register as a base index rather than directly.
// XXX - Why isn't hasSideEffects sufficient for these?
static bool changesVGPRIndexingMode(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_SET_GPR_IDX_ON:
  case AMDGPU::S_SET_GPR_IDX_MODE:
  case AMDGPU::S_SET_GPR_IDX_OFF:
    return true;
  default:
    return false;
  }
}
bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                       const MachineBasicBlock *MBB,
                                       const MachineFunction &MF) const {
  // Skipping the check for SP writes in the base implementation. The reason it
  // was added was apparently due to compile time concerns.
  //
  // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
  // but is probably avoidable.

  // Copied from base implementation.
  // Terminators and labels can't be scheduled around.
  if (MI.isTerminator() || MI.isPosition())
    return true;

  // INLINEASM_BR can jump to another block
  if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
    return true;

  if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
    return true;

  // Target-independent instructions do not have an implicit-use of EXEC, even
  // when they operate on VGPRs. Treating EXEC modifications as scheduling
  // boundaries prevents incorrect movements of such instructions.
  return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
         MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
         MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
         MI.getOpcode() == AMDGPU::S_SETPRIO ||
         changesVGPRIndexingMode(MI);
}
bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
}

bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
  // Skip the full operand and register alias search modifiesRegister
  // does. There's only a handful of instructions that touch this, it's only an
  // implicit def, and doesn't alias any other registers.
  return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
}
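
// Descriptive note (added): the hook below conservatively reports whether
// executing MI with an all-zero EXEC mask could still have observable or
// hazardous effects (scalar stores, message sends, traps, barriers, mode
// changes, lane moves, calls and inline asm).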
bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
  unsigned Opcode = MI.getOpcode();

  if (MI.mayStore() && isSMRD(MI))
    return true; // scalar store or atomic

  // This will terminate the function when other lanes may need to continue.
  if (MI.isReturn())
    return true;

  // These instructions cause shader I/O that may cause hardware lockups
  // when executed with an empty EXEC mask.
  //
  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
  // EXEC = 0, but checking for that case here seems not worth it
  // given the typical code patterns.
  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
      isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
      Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
    return true;

  if (MI.isCall() || MI.isInlineAsm())
    return true; // conservative assumption

  // Assume that barrier interactions are only intended with active lanes.
  if (isBarrier(Opcode))
    return true;

  // A mode change is a scalar operation that influences vector instructions.
  if (modifiesModeRegister(MI))
    return true;

  // These are like SALU instructions in terms of effects, so it's questionable
  // whether we should return true for those.
  //
  // However, executing them with EXEC = 0 causes them to operate on undefined
  // data, which we avoid by returning true here.
  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
      Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
      Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
      Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
    return true;

  return false;
}
bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
                              const MachineInstr &MI) const {
  if (MI.isMetaInstruction())
    return false;

  // This won't read exec if this is an SGPR->SGPR copy.
  if (MI.isCopyLike()) {
    if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
      return true;

    // Make sure this isn't copying exec as a normal operand
    return MI.readsRegister(AMDGPU::EXEC, &RI);
  }

  // Make a conservative assumption about the callee.
  if (MI.isCall())
    return true;

  // Be conservative with any unhandled generic opcodes.
  if (!isTargetSpecificOpcode(MI.getOpcode()))
    return true;

  return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
}
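
// Descriptive note (added): the isInlineConstant overloads below answer
// whether an immediate can be encoded directly in the instruction (no literal
// slot and no constant-bus use), keyed on integer bit width, floating-point
// semantics, or the operand type of the consuming instruction.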
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
  switch (Imm.getBitWidth()) {
  case 1: // This likely will be a condition code mask.
    return true;

  case 32:
    return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
                                        ST.hasInv2PiInlineImm());
  case 64:
    return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
                                        ST.hasInv2PiInlineImm());
  case 16:
    return ST.has16BitInsts() &&
           AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
                                         ST.hasInv2PiInlineImm());
  default:
    llvm_unreachable("invalid bitwidth");
  }
}

bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
  APInt IntImm = Imm.bitcastToAPInt();
  int64_t IntImmVal = IntImm.getSExtValue();
  bool HasInv2Pi = ST.hasInv2PiInlineImm();
  switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
  default:
    llvm_unreachable("invalid fltSemantics");
  case APFloatBase::S_IEEEsingle:
  case APFloatBase::S_IEEEdouble:
    return isInlineConstant(IntImm);
  case APFloatBase::S_BFloat:
    return ST.has16BitInsts() &&
           AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
  case APFloatBase::S_IEEEhalf:
    return ST.has16BitInsts() &&
           AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
  }
}
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   uint8_t OperandType) const {
  assert(!MO.isReg() && "isInlineConstant called on register operand!");
  if (!MO.isImm())
    return false;

  // MachineOperand provides no way to tell the true operand size, since it only
  // records a 64-bit value. We need to know the size to determine if a 32-bit
  // floating point immediate bit pattern is legal for an integer immediate. It
  // would be for any 32-bit integer operand, but would not be for a 64-bit one.

  int64_t Imm = MO.getImm();
  switch (OperandType) {
  case AMDGPU::OPERAND_REG_IMM_INT32:
  case AMDGPU::OPERAND_REG_IMM_FP32:
  case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
  case AMDGPU::OPERAND_REG_IMM_V2FP32:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
  case AMDGPU::OPERAND_REG_IMM_V2INT32:
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
  case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
  case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
  case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
    int32_t Trunc = static_cast<int32_t>(Imm);
    return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
  }
  case AMDGPU::OPERAND_REG_IMM_INT64:
  case AMDGPU::OPERAND_REG_IMM_FP64:
  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
  case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
    return AMDGPU::isInlinableLiteral64(MO.getImm(),
                                        ST.hasInv2PiInlineImm());
  case AMDGPU::OPERAND_REG_IMM_INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
  case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
    // We would expect inline immediates to not be concerned with an integer/fp
    // distinction. However, in the case of 16-bit integer operations, the
    // "floating point" values appear to not work. It seems read the low 16-bits
    // of 32-bit immediates, which happens to always work for the integer
    // values.
    //
    // See llvm bugzilla 46302.
    //
    // TODO: Theoretically we could use op-sel to use the high bits of the
    // 32-bit FP values.
    return AMDGPU::isInlinableIntLiteral(Imm);
  case AMDGPU::OPERAND_REG_IMM_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
    return AMDGPU::isInlinableLiteralV2I16(Imm);
  case AMDGPU::OPERAND_REG_IMM_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
    return AMDGPU::isInlinableLiteralV2F16(Imm);
  case AMDGPU::OPERAND_REG_IMM_V2BF16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
  case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16:
    return AMDGPU::isInlinableLiteralV2BF16(Imm);
  case AMDGPU::OPERAND_REG_IMM_FP16:
  case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
  case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      // A few special case instructions have 16-bit operands on subtargets
      // where 16-bit instructions are not legal.
      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
      // constants in these cases
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
    }

    return false;
  }
  case AMDGPU::OPERAND_REG_IMM_BF16:
  case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED:
  case AMDGPU::OPERAND_REG_INLINE_C_BF16:
  case AMDGPU::OPERAND_REG_INLINE_AC_BF16: {
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
    }

    return false;
  }
  case AMDGPU::OPERAND_KIMM32:
  case AMDGPU::OPERAND_KIMM16:
    return false;
  case AMDGPU::OPERAND_INPUT_MODS:
  case MCOI::OPERAND_IMMEDIATE:
    // Always embedded in the instruction for free.
    return true;
  case MCOI::OPERAND_UNKNOWN:
  case MCOI::OPERAND_REGISTER:
  case MCOI::OPERAND_PCREL:
  case MCOI::OPERAND_GENERIC_0:
  case MCOI::OPERAND_GENERIC_1:
  case MCOI::OPERAND_GENERIC_2:
  case MCOI::OPERAND_GENERIC_3:
  case MCOI::OPERAND_GENERIC_4:
  case MCOI::OPERAND_GENERIC_5:
    // Just ignore anything else.
    return true;
  default:
    llvm_unreachable("invalid operand type");
  }
}
static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
  case MachineOperand::MO_Register:
    return Op0.getReg() == Op1.getReg();
  case MachineOperand::MO_Immediate:
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}
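
// Descriptive note (added): isImmOperandLegal below decides whether immediate
// operand MO may be placed in operand slot OpNo of MI, honoring
// inline-constant rules, the MFMA inline-literal hardware bug workaround, and
// VOP3 literal support on the current subtarget.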
bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
                                    const MachineOperand &MO) const {
  const MCInstrDesc &InstDesc = MI.getDesc();
  const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];

  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());

  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    return true;

  if (OpInfo.RegClass < 0)
    return false;

  if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
    if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
        OpNo == (unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                     AMDGPU::OpName::src2))
      return false;
    return RI.opCanUseInlineConstant(OpInfo.OperandType);
  }

  if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
    return false;

  if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
    return true;

  return ST.hasVOP3Literal();
}
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
  // GFX90A does not have V_MUL_LEGACY_F32_e32.
  if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
    return false;

  int Op32 = AMDGPU::getVOPe32(Opcode);
  if (Op32 == -1)
    return false;

  return pseudoToMCOpcode(Op32) != -1;
}

bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifier operand is present on all instructions
  // that have modifiers.

  return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
}

bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
                                  unsigned OpName) const {
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
  return Mods && Mods->getImm();
}

bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
  return any_of(ModifierOpNames,
                [&](unsigned Name) { return hasModifiersSet(MI, Name); });
}
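
// Descriptive note (added): canShrink below reports whether the VOP3 form of
// MI can be rewritten to its 32-bit VOP encoding: the third operand (if any)
// must be an unmodified VGPR or VCC-style carry, src1 must be a plain VGPR,
// and no source or output modifiers may be set.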
bool SIInstrInfo::canShrink(const MachineInstr &MI,
                            const MachineRegisterInfo &MRI) const {
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
  // Can't shrink instruction with three operands.
  if (Src2) {
    switch (MI.getOpcode()) {
    default: return false;

    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64: {
      const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
      if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
        return false;
      // Additional verification is needed for sdst/src2.
      return true;
    }
    case AMDGPU::V_MAC_F16_e64:
    case AMDGPU::V_MAC_F32_e64:
    case AMDGPU::V_MAC_LEGACY_F32_e64:
    case AMDGPU::V_FMAC_F16_e64:
    case AMDGPU::V_FMAC_F16_fake16_e64:
    case AMDGPU::V_FMAC_F32_e64:
    case AMDGPU::V_FMAC_F64_e64:
    case AMDGPU::V_FMAC_LEGACY_F32_e64:
      if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
          hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
        return false;
      break;

    case AMDGPU::V_CNDMASK_B32_e64:
      break;
    }
  }

  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
               hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
    return false;

  // We don't need to check src0, all input types are legal, so just make sure
  // src0 isn't using any modifiers.
  if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
    return false;

  // Can it be shrunk to a valid 32 bit opcode?
  if (!hasVALU32BitEncoding(MI.getOpcode()))
    return false;

  // Check output modifiers
  return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
         !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
         !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
         // TODO: Can we avoid checking bound_ctrl/fi here?
         // They are only used by permlane*_swap special case.
         !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
         !hasModifiersSet(MI, AMDGPU::OpName::fi);
}
// Set VCC operand with all flags from \p Orig, except for setting it as
// implicit.
static void copyFlagsToImplicitVCC(MachineInstr &MI,
                                   const MachineOperand &Orig) {
  for (MachineOperand &Use : MI.implicit_operands()) {
    if (Use.isUse() &&
        (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
      Use.setIsUndef(Orig.isUndef());
      Use.setIsKill(Orig.isKill());
      return;
    }
  }
}
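
// Descriptive note (added): buildShrunkInst below materializes the 32-bit
// encoded form Op32 of MI right before it, copying over the operands that
// survive the shrink; erasing the original VOP3 instruction is left to the
// caller.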
MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
                                           unsigned Op32) const {
  MachineBasicBlock *MBB = MI.getParent();

  const MCInstrDesc &Op32Desc = get(Op32);
  MachineInstrBuilder Inst32 =
    BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
      .setMIFlags(MI.getFlags());

  // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
  // For VOPC instructions, this is replaced by an implicit def of vcc.
  //
  // We assume the defs of the shrunk opcode are in the same order, and the
  // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
  for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
    Inst32.add(MI.getOperand(I));

  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);

  int Idx = MI.getNumExplicitDefs();
  for (const MachineOperand &Use : MI.explicit_uses()) {
    int OpTy = MI.getDesc().operands()[Idx++].OperandType;
    if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
      continue;

    if (&Use == Src2) {
      if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
        // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
        // replaced with an implicit read of vcc or vcc_lo. The implicit read
        // of vcc was already added during the initial BuildMI, but we
        // 1) may need to change vcc to vcc_lo to preserve the original register
        // 2) have to preserve the original flags.
        copyFlagsToImplicitVCC(*Inst32, *Src2);
        continue;
      }
    }

    Inst32.add(Use);
  }

  // FIXME: Losing implicit operands
  fixImplicitOperands(*Inst32);
  return Inst32;
}
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                  const MachineOperand &MO,
                                  const MCOperandInfo &OpInfo) const {
  // Literal constants use the constant bus.
  if (!MO.isReg())
    return !isInlineConstant(MO, OpInfo);

  if (!MO.isUse())
    return false;

  if (MO.getReg().isVirtual())
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

  if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
    return false;

  // SGPRs use the constant bus
  if (MO.isImplicit()) {
    return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
           MO.getReg() == AMDGPU::VCC_LO;
  }
  return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
         AMDGPU::SReg_64RegClass.contains(MO.getReg());
}
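
// Descriptive note (added): findImplicitSGPRRead returns the first implicitly
// read scalar register of interest (VCC, VCC_LO/HI, M0 or FLAT_SCR), or an
// invalid Register if the instruction reads none of them.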
static Register findImplicitSGPRRead(const MachineInstr &MI) {
  for (const MachineOperand &MO : MI.implicit_operands()) {
    // We only care about reads.
    if (MO.isDef())
      continue;

    switch (MO.getReg()) {
    case AMDGPU::VCC:
    case AMDGPU::VCC_LO:
    case AMDGPU::VCC_HI:
    case AMDGPU::M0:
    case AMDGPU::FLAT_SCR:
      return MO.getReg();
    default:
      break;
    }
  }

  return Register();
}

static bool shouldReadExec(const MachineInstr &MI) {
  if (SIInstrInfo::isVALU(MI)) {
    switch (MI.getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
    case AMDGPU::V_WRITELANE_B32:
    case AMDGPU::SI_SPILL_S32_TO_VGPR:
      return false;
    default:
      return true;
    }
  }

  if (MI.isPreISelOpcode() ||
      SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
      SIInstrInfo::isSALU(MI) ||
      SIInstrInfo::isSMRD(MI))
    return false;

  return true;
}
static bool isRegOrFI(const MachineOperand &MO) {
  return MO.isReg() || MO.isFI();
}

static bool isSubRegOf(const SIRegisterInfo &TRI,
                       const MachineOperand &SuperVec,
                       const MachineOperand &SubReg) {
  if (SubReg.getReg().isPhysical())
    return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());

  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
         SubReg.getReg() == SuperVec.getReg();
}
// Verify the illegal copy from vector register to SGPR for generic opcode COPY
bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
                             const MachineRegisterInfo &MRI,
                             StringRef &ErrInfo) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  // This is a check for copy from vector register to SGPR
  if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
    ErrInfo = "illegal copy from vector register to SGPR";
    return false;
  }
  return true;
}
bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI.getOpcode();
  const MachineFunction *MF = MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF->getRegInfo();

  // FIXME: At this point the COPY verify is done only for non-ssa forms.
  // Find a better property to recognize the point where instruction selection
  // is just done.
  // We can only enforce this check after SIFixSGPRCopies pass so that the
  // illegal copies are legalized and thereafter we don't expect a pass
  // inserting similar copies.
  if (!MRI.isSSA() && MI.isCopy())
    return verifyCopy(MI, MRI, ErrInfo);

  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
    return true;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
  int Src3Idx = -1;
  if (Src0Idx == -1) {
    // VOPD V_DUAL_* instructions use different operand names.
    Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
    Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
    Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
    Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
  }

  // Make sure the number of operands is correct.
  const MCInstrDesc &Desc = get(Opcode);
  if (!Desc.isVariadic() &&
      Desc.getNumOperands() != MI.getNumExplicitOperands()) {
    ErrInfo = "Instruction has wrong number of operands.";
    return false;
  }

  if (MI.isInlineAsm()) {
    // Verify register classes for inlineasm constraints.
    for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
         I != E; ++I) {
      const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
      if (!RC)
        continue;

      const MachineOperand &Op = MI.getOperand(I);
      if (!Op.isReg())
        continue;

      Register Reg = Op.getReg();
      if (!Reg.isVirtual() && !RC->contains(Reg)) {
        ErrInfo = "inlineasm operand has incorrect register class.";
        return false;
      }
    }

    return true;
  }

  if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
    ErrInfo = "missing memory operand from image instruction.";
    return false;
  }

  // Make sure the register classes are correct.
  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if (MO.isFPImm()) {
      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
                "all fp values to integers.";
      return false;
    }

    int RegClass = Desc.operands()[i].RegClass;

    switch (Desc.operands()[i].OperandType) {
    case MCOI::OPERAND_REGISTER:
      if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case AMDGPU::OPERAND_REG_IMM_INT32:
    case AMDGPU::OPERAND_REG_IMM_FP32:
    case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
    case AMDGPU::OPERAND_REG_IMM_V2FP32:
      break;
    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
    case AMDGPU::OPERAND_REG_INLINE_C_FP16:
    case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
    case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
    case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
    case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
    case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
      if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    }
    case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
      if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
        ErrInfo = "Expected inline constant for operand.";
        return false;
      }
      break;
    case MCOI::OPERAND_IMMEDIATE:
    case AMDGPU::OPERAND_KIMM32:
      // Check if this operand is an immediate.
      // FrameIndex operands will be replaced by immediates, so they are
      // allowed.
      if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
        ErrInfo = "Expected immediate, but got non-immediate";
        return false;
      }
      [[fallthrough]];
    default:
      continue;
    }

    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (!Reg)
      continue;

    // FIXME: Ideally we would have separate instruction definitions with the
    // aligned register constraint.
    // FIXME: We do not verify inline asm operands, but custom inline asm
    // verification is broken anyway
    if (ST.needsAlignedVGPRs()) {
      const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
      if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
        const TargetRegisterClass *SubRC =
            RI.getSubRegisterClass(RC, MO.getSubReg());
        RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
        if (RC)
          RC = SubRC;
      }

      // Check that this is the aligned version of the class.
      if (!RC || !RI.isProperlyAlignedRC(*RC)) {
        ErrInfo = "Subtarget requires even aligned vector registers";
        return false;
      }
    }

    if (RegClass != -1) {
      if (Reg.isVirtual())
        continue;

      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
      if (!RC->contains(Reg)) {
        ErrInfo = "Operand has incorrect register class.";
        return false;
      }
    }
  }

  if (isSDWA(MI)) {
    if (!ST.hasSDWA()) {
      ErrInfo = "SDWA is not supported on this target";
      return false;
    }

    int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);

    for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
      if (OpIdx == -1)
        continue;
      const MachineOperand &MO = MI.getOperand(OpIdx);

      if (!ST.hasSDWAScalar()) {
        // Only VGPRs on VI
        if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
          ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
          return false;
        }
      } else {
        // No immediates on GFX9
        if (!MO.isReg()) {
          ErrInfo =
            "Only reg allowed as operands in SDWA instructions on GFX9+";
          return false;
        }
      }
    }

    if (!ST.hasSDWAOmod()) {
      // No omod allowed on VI
      const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
      if (OMod != nullptr &&
          (!OMod->isImm() || OMod->getImm() != 0)) {
        ErrInfo = "OMod not allowed in SDWA instructions on VI";
        return false;
      }
    }

    if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
        Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
        Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
        Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
      const MachineOperand *Src0ModsMO =
          getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
      unsigned Mods = Src0ModsMO->getImm();
      if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
          Mods & SISrcMods::SEXT) {
        ErrInfo = "sext, abs and neg are not allowed on this instruction";
        return false;
      }
    }

    uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
    if (isVOPC(BasicOpcode)) {
      if (!ST.hasSDWASdst() && DstIdx != -1) {
        // Only vcc allowed as dst on VI for VOPC
        const MachineOperand &Dst = MI.getOperand(DstIdx);
        if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
          ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
          return false;
        }
      } else if (!ST.hasSDWAOutModsVOPC()) {
        // No clamp allowed on GFX9 for VOPC
        const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
        if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
          ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
          return false;
        }

        // No omod allowed on GFX9 for VOPC
        const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
        if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
          ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
          return false;
        }
      }
    }

    const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused && DstUnused->isImm() &&
        DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
      const MachineOperand &Dst = MI.getOperand(DstIdx);
      if (!Dst.isReg() || !Dst.isTied()) {
        ErrInfo = "Dst register should have tied register";
        return false;
      }

      const MachineOperand &TiedMO =
          MI.getOperand(MI.findTiedOperandIdx(DstIdx));
      if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
        ErrInfo =
          "Dst register should be tied to implicit use of preserved register";
        return false;
      }
      if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
        ErrInfo = "Dst register should use same physical register as preserved";
        return false;
      }
    }
  }

  // Verify MIMG / VIMAGE / VSAMPLE
  if (isImage(MI.getOpcode()) && !MI.mayStore()) {
    // Ensure that the return type used is large enough for all the options
    // being used TFE/LWE require an extra result register.
    const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
    if (DMask) {
      uint64_t DMaskImm = DMask->getImm();
      uint32_t RegCount =
          isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
      const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
      const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
      const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);

      // Adjust for packed 16 bit values
      if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
        RegCount = divideCeil(RegCount, 2);

      // Adjust if using LWE or TFE
      if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
        RegCount += 1;

      const uint32_t DstIdx =
          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
      const MachineOperand &Dst = MI.getOperand(DstIdx);
      if (Dst.isReg()) {
        const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
        uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
        if (RegCount > DstSize) {
          ErrInfo = "Image instruction returns too many registers for dst "
                    "register";
          return false;
        }
      }
    }
  }

  // Verify VOP*. Ignore multiple sgpr operands on writelane.
  if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
    unsigned ConstantBusCount = 0;
    bool UsesLiteral = false;
    const MachineOperand *LiteralVal = nullptr;

    int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
    if (ImmIdx != -1) {
      ++ConstantBusCount;
      UsesLiteral = true;
      LiteralVal = &MI.getOperand(ImmIdx);
    }

    SmallVector<Register, 2> SGPRsUsed;
    Register SGPRUsed;

    // Only look at the true operands. Only a real operand can use the constant
    // bus, and we don't want to check pseudo-operands like the source modifier
    // flags.
    for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
      if (OpIdx == -1)
        continue;
      const MachineOperand &MO = MI.getOperand(OpIdx);
      if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
        if (MO.isReg()) {
          SGPRUsed = MO.getReg();
          if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
            ++ConstantBusCount;
            SGPRsUsed.push_back(SGPRUsed);
          }
        } else if (!MO.isFI()) { // Treat FI like a register.
          if (!UsesLiteral) {
            ++ConstantBusCount;
            UsesLiteral = true;
            LiteralVal = &MO;
          } else if (!MO.isIdenticalTo(*LiteralVal)) {
            assert(isVOP2(MI) || isVOP3(MI));
            ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
            return false;
          }
        }
      }
    }

    SGPRUsed = findImplicitSGPRRead(MI);
    if (SGPRUsed) {
      // Implicit uses may safely overlap true operands
      if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
            return !RI.regsOverlap(SGPRUsed, SGPR);
          })) {
        ++ConstantBusCount;
        SGPRsUsed.push_back(SGPRUsed);
      }
    }

    // v_writelane_b32 is an exception from constant bus restriction:
    // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
    if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
        Opcode != AMDGPU::V_WRITELANE_B32) {
      ErrInfo = "VOP* instruction violates constant bus restriction";
      return false;
    }

    if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
      ErrInfo = "VOP3 instruction uses literal";
      return false;
    }
  }

  // Special case for writelane - this can break the multiple constant bus rule,
  // but still can't use more than one SGPR register
  if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
    unsigned SGPRCount = 0;
    Register SGPRUsed;

    for (int OpIdx : {Src0Idx, Src1Idx}) {
      if (OpIdx == -1)
        break;

      const MachineOperand &MO = MI.getOperand(OpIdx);

      if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
        if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
          if (MO.getReg() != SGPRUsed)
            ++SGPRCount;
          SGPRUsed = MO.getReg();
        }
      }
      if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
        ErrInfo = "WRITELANE instruction violates constant bus restriction";
        return false;
      }
    }
  }

  // Verify misc. restrictions on specific instructions.
  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    const MachineOperand &Src1 = MI.getOperand(Src1Idx);
    const MachineOperand &Src2 = MI.getOperand(Src2Idx);
    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
      if (!compareMachineOp(Src0, Src1) &&
          !compareMachineOp(Src0, Src2)) {
        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
        return false;
      }
    }
    if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
         SISrcMods::ABS) ||
        (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
         SISrcMods::ABS) ||
        (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
         SISrcMods::ABS)) {
      ErrInfo = "ABS not allowed in VOP3B instructions";
      return false;
    }
  }

  if (isSOP2(MI) || isSOPC(MI)) {
    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    const MachineOperand &Src1 = MI.getOperand(Src1Idx);

    if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
        !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
        !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
        !Src0.isIdenticalTo(Src1)) {
      ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
      return false;
    }
  }

  if (isSOPK(MI)) {
    const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
    if (Desc.isBranch()) {
      if (!Op->isMBB()) {
        ErrInfo = "invalid branch target for SOPK instruction";
        return false;
      }
    } else {
      uint64_t Imm = Op->getImm();
      if (sopkIsZext(Opcode)) {
        if (!isUInt<16>(Imm)) {
          ErrInfo = "invalid immediate for SOPK instruction";
          return false;
        }
      } else {
        if (!isInt<16>(Imm)) {
          ErrInfo = "invalid immediate for SOPK instruction";
          return false;
        }
      }
    }
  }

  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
    const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
                       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;

    const unsigned StaticNumOps =
        Desc.getNumOperands() + Desc.implicit_uses().size();
    const unsigned NumImplicitOps = IsDst ? 2 : 1;

    // Allow additional implicit operands. This allows a fixup done by the post
    // RA scheduler where the main implicit operand is killed and implicit-defs
    // are added for sub-registers that remain live after this instruction.
    if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
      ErrInfo = "missing implicit register operands";
      return false;
    }

    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (IsDst) {
      if (!Dst->isUse()) {
        ErrInfo = "v_movreld_b32 vdst should be a use operand";
        return false;
      }

      unsigned UseOpIdx;
      if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
          UseOpIdx != StaticNumOps + 1) {
        ErrInfo = "movrel implicit operands should be tied";
        return false;
      }
    }

    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    const MachineOperand &ImpUse
      = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
    if (!ImpUse.isReg() || !ImpUse.isUse() ||
        !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
      ErrInfo = "src0 should be subreg of implicit vector use";
      return false;
    }
  }

  // Make sure we aren't losing exec uses in the td files. This mostly requires
  // being careful when using let Uses to try to add other use registers.
  if (shouldReadExec(MI)) {
    if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
      ErrInfo = "VALU instruction does not implicitly read exec mask";
      return false;
    }
  }

  if (isSMRD(MI)) {
    if (MI.mayStore() &&
        ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
      // The register offset form of scalar stores may only use m0 as the
      // soffset register.
      const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
      if (Soff && Soff->getReg() != AMDGPU::M0) {
        ErrInfo = "scalar stores must use m0 as offset register";
        return false;
      }
    }
  }

  if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
    const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
    if (Offset->getImm() != 0) {
      ErrInfo = "subtarget does not support offsets in flat instructions";
      return false;
    }
  }

  if (isDS(MI) && !ST.hasGDS()) {
    const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
    if (GDSOp && GDSOp->getImm() != 0) {
      ErrInfo = "GDS is not supported on this subtarget";
      return false;
    }
  }

  if (isImage(MI)) {
    const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
    if (DimOp) {
      int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
                                                 AMDGPU::OpName::vaddr0);
      int RSrcOpName =
          isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
      const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
          AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
      const AMDGPU::MIMGDimInfo *Dim =
          AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());

      if (!Dim) {
        ErrInfo = "dim is out of range";
        return false;
      }

      bool IsA16 = false;
      if (ST.hasR128A16()) {
        const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
        IsA16 = R128A16->getImm() != 0;
      } else if (ST.hasA16()) {
        const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
        IsA16 = A16->getImm() != 0;
      }

      bool IsNSA = RsrcIdx - VAddr0Idx > 1;

      unsigned AddrWords =
          AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());

      unsigned VAddrWords;
      if (IsNSA) {
        VAddrWords = RsrcIdx - VAddr0Idx;
        if (ST.hasPartialNSAEncoding() &&
            AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
          unsigned LastVAddrIdx = RsrcIdx - 1;
          VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
        }
      } else {
        VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
        if (AddrWords > 12)
          AddrWords = 16;
      }

      if (VAddrWords != AddrWords) {
        LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
                          << " but got " << VAddrWords << "\n");
        ErrInfo = "bad vaddr size";
        return false;
      }
    }
  }

  const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
  if (DppCt) {
    using namespace AMDGPU::DPP;

    unsigned DC = DppCt->getImm();
    if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
        DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
        (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
        (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
        (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
        (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
        (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
      ErrInfo = "Invalid dpp_ctrl value";
      return false;
    }
    if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
        ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      ErrInfo = "Invalid dpp_ctrl value: "
                "wavefront shifts are not supported on GFX10+";
      return false;
    }
    if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
        ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      ErrInfo = "Invalid dpp_ctrl value: "
                "broadcasts are not supported on GFX10+";
      return false;
    }
    if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
        ST.getGeneration() < AMDGPUSubtarget::GFX10) {
      if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
          DC <= DppCtrl::ROW_NEWBCAST_LAST &&
          !ST.hasGFX90AInsts()) {
        ErrInfo = "Invalid dpp_ctrl value: "
                  "row_newbroadcast/row_share is not supported before "
                  "GFX90A/GFX10";
        return false;
      }
      if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
        ErrInfo = "Invalid dpp_ctrl value: "
                  "row_share and row_xmask are not supported before GFX10";
        return false;
      }
    }
    if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
        !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) {
      ErrInfo = "Invalid dpp_ctrl value: "
                "DP ALU dpp only support row_newbcast";
      return false;
    }
  }

  if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
    uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
                                        : AMDGPU::OpName::vdata;
    const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
    const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
    if (Data && !Data->isReg())
      Data = nullptr;

    if (ST.hasGFX90AInsts()) {
      if (Dst && Data &&
          (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
        ErrInfo = "Invalid register class: "
                  "vdata and vdst should be both VGPR or AGPR";
        return false;
      }
      if (Data && Data2 &&
          (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
        ErrInfo = "Invalid register class: "
                  "both data operands should be VGPR or AGPR";
        return false;
      }
    } else {
      if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
          (Data && RI.isAGPR(MRI, Data->getReg())) ||
          (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
        ErrInfo = "Invalid register class: "
                  "agpr loads and stores not supported on this GPU";
        return false;
      }
    }
  }

  if (ST.needsAlignedVGPRs()) {
    const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
      const MachineOperand *Op = getNamedOperand(MI, OpName);
      if (!Op)
        return true;
      Register Reg = Op->getReg();
      if (Reg.isPhysical())
        return !(RI.getHWRegIndex(Reg) & 1);
      const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
      return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
             !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
    };

    if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
        MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
        MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {

      if (!isAlignedReg(AMDGPU::OpName::data0)) {
        ErrInfo = "Subtarget requires even aligned vector registers "
                  "for DS_GWS instructions";
        return false;
      }
    }

    if (isMIMG(MI)) {
      if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
        ErrInfo = "Subtarget requires even aligned vector registers "
                  "for vaddr operand of image instructions";
        return false;
      }
    }
  }

  if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
      !ST.hasGFX90AInsts()) {
    const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
    if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
      ErrInfo = "Invalid register class: "
                "v_accvgpr_write with an SGPR is not supported on this GPU";
      return false;
    }
  }

  if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
    const MachineOperand &SrcOp = MI.getOperand(1);
    if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
      ErrInfo = "pseudo expects only physical SGPRs";
      return false;
    }
  }

  return true;
}
// It is more readable to list mapped opcodes on the same line.

unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::WQM: return AMDGPU::WQM;
  case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
  case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
  case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
  case AMDGPU::S_MOV_B32: {
    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    return MI.getOperand(1).isReg() ||
           RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  }
  case AMDGPU::S_ADD_I32:
    return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
  case AMDGPU::S_ADDC_U32:
    return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
    return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
  // FIXME: These are not consistently handled, and selected when the carry is
  // used.
  case AMDGPU::S_ADD_U32:
    return AMDGPU::V_ADD_CO_U32_e32;
  case AMDGPU::S_SUB_U32:
    return AMDGPU::V_SUB_CO_U32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
  case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
  case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
  case AMDGPU::S_XNOR_B32:
    return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
  case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
  case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
  case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
  case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
  case AMDGPU::S_CVT_F32_F16:
  case AMDGPU::S_CVT_HI_F32_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
                                   : AMDGPU::V_CVT_F32_F16_fake16_e64;
  case AMDGPU::S_CVT_F16_F32:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
                                   : AMDGPU::V_CVT_F16_F32_fake16_e64;
  case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
  case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
  case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
  case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
  case AMDGPU::S_CEIL_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
                                   : AMDGPU::V_CEIL_F16_fake16_e64;
  case AMDGPU::S_FLOOR_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
                                   : AMDGPU::V_FLOOR_F16_fake16_e64;
  case AMDGPU::S_TRUNC_F16:
    return AMDGPU::V_TRUNC_F16_fake16_e64;
  case AMDGPU::S_RNDNE_F16:
    return AMDGPU::V_RNDNE_F16_fake16_e64;
  case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
  case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
  case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
  case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
  case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
  case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
  case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
  case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
  case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
  case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
  case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
  case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
  case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
  case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
  case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
  case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
  case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
  case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
  case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
  case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
  case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
  case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
  case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
  case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
  case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
  case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
  case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
  case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
  case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
  case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
  case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
  case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
  case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
  case AMDGPU::S_CMP_LT_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
                                   : AMDGPU::V_CMP_LT_F16_fake16_e64;
  case AMDGPU::S_CMP_EQ_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
                                   : AMDGPU::V_CMP_EQ_F16_fake16_e64;
  case AMDGPU::S_CMP_LE_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
                                   : AMDGPU::V_CMP_LE_F16_fake16_e64;
  case AMDGPU::S_CMP_GT_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
                                   : AMDGPU::V_CMP_GT_F16_fake16_e64;
  case AMDGPU::S_CMP_LG_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
                                   : AMDGPU::V_CMP_LG_F16_fake16_e64;
  case AMDGPU::S_CMP_GE_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
                                   : AMDGPU::V_CMP_GE_F16_fake16_e64;
  case AMDGPU::S_CMP_O_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
                                   : AMDGPU::V_CMP_O_F16_fake16_e64;
  case AMDGPU::S_CMP_U_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
                                   : AMDGPU::V_CMP_U_F16_fake16_e64;
  case AMDGPU::S_CMP_NGE_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
                                   : AMDGPU::V_CMP_NGE_F16_fake16_e64;
  case AMDGPU::S_CMP_NLG_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
                                   : AMDGPU::V_CMP_NLG_F16_fake16_e64;
  case AMDGPU::S_CMP_NGT_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
                                   : AMDGPU::V_CMP_NGT_F16_fake16_e64;
  case AMDGPU::S_CMP_NLE_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
                                   : AMDGPU::V_CMP_NLE_F16_fake16_e64;
  case AMDGPU::S_CMP_NEQ_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
                                   : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
  case AMDGPU::S_CMP_NLT_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
                                   : AMDGPU::V_CMP_NLT_F16_fake16_e64;
  case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
  case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
  case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
  case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
  case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
  case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
  case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
  case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
  case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
  case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
  }
  llvm_unreachable(
      "Unexpected scalar opcode without corresponding vector one!");
}

void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
                                        MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI,
                                        const DebugLoc &DL, Register Reg,
                                        bool IsSCCLive,
                                        SlotIndexes *Indexes) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool IsWave32 = ST.isWave32();
  if (IsSCCLive) {
    // Insert two move instructions, one to save the original value of EXEC and
    // the other to turn on all bits in EXEC. This is required as we can't use
    // the single instruction S_OR_SAVEEXEC that clobbers SCC.
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
                           .addReg(Exec, RegState::Kill);
    auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
    if (Indexes) {
      Indexes->insertMachineInstrInMaps(*StoreExecMI);
      Indexes->insertMachineInstrInMaps(*FlipExecMI);
    }
  } else {
    const unsigned OrSaveExec =
        IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
    auto SaveExec =
        BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
    SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
    if (Indexes)
      Indexes->insertMachineInstrInMaps(*SaveExec);
  }
}

void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MBBI,
                              const DebugLoc &DL, Register Reg,
                              SlotIndexes *Indexes) const {
  unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  auto ExecRestoreMI =
      BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
  if (Indexes)
    Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
}
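
// Illustrative only (not generated verbatim; the exact opcodes depend on the
// wave size and on whether SCC is live). With a 64-bit wave and SCC dead, the
// pair of helpers above expands to roughly (register names are hypothetical):
//   s_or_saveexec_b64 s[34:35], -1   ; insertScratchExecCopy: save EXEC and
//                                    ; enable all lanes
//   ...code that must run with all lanes enabled...
//   s_mov_b64 exec, s[34:35]         ; restoreExec: restore the saved mask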

static const TargetRegisterClass *
adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
                          const MachineRegisterInfo &MRI,
                          const MCInstrDesc &TID, unsigned RCID,
                          bool IsAllocatable) {
  if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
      (((TID.mayLoad() || TID.mayStore()) &&
        !(TID.TSFlags & SIInstrFlags::Spill)) ||
       (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
    switch (RCID) {
    case AMDGPU::AV_32RegClassID:
      RCID = AMDGPU::VGPR_32RegClassID;
      break;
    case AMDGPU::AV_64RegClassID:
      RCID = AMDGPU::VReg_64RegClassID;
      break;
    case AMDGPU::AV_96RegClassID:
      RCID = AMDGPU::VReg_96RegClassID;
      break;
    case AMDGPU::AV_128RegClassID:
      RCID = AMDGPU::VReg_128RegClassID;
      break;
    case AMDGPU::AV_160RegClassID:
      RCID = AMDGPU::VReg_160RegClassID;
      break;
    case AMDGPU::AV_512RegClassID:
      RCID = AMDGPU::VReg_512RegClassID;
      break;
    default:
      break;
    }
  }

  return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
}

const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
    unsigned OpNum, const TargetRegisterInfo *TRI,
    const MachineFunction &MF)
  const {
  if (OpNum >= TID.getNumOperands())
    return nullptr;
  auto RegClass = TID.operands()[OpNum].RegClass;
  bool IsAllocatable = false;
  if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
    // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
    // with two data operands. Request a register class constrained to VGPR
    // only if both operands are present, as Machine Copy Propagation cannot
    // check this constraint, and possibly other passes too.
    //
    // The check is limited to FLAT and DS because atomics in non-flat encoding
    // have their vdst and vdata tied to be the same register.
    const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
                                                   AMDGPU::OpName::vdst);
    const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
        (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
                                         : AMDGPU::OpName::vdata);
    if (DataIdx != -1) {
      IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
                                           TID.Opcode, AMDGPU::OpName::data1);
    }
  }
  return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
                                   IsAllocatable);
}

const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.operands()[OpNo].RegClass == -1) {
    Register Reg = MI.getOperand(OpNo).getReg();

    if (Reg.isVirtual())
      return MRI.getRegClass(Reg);
    return RI.getPhysRegBaseClass(Reg);
  }

  unsigned RCID = Desc.operands()[OpNo].RegClass;
  return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
}

void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
  MachineBasicBlock::iterator I = MI;
  MachineBasicBlock *MBB = MI.getParent();
  MachineOperand &MO = MI.getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  unsigned Size = RI.getRegSizeInBits(*RC);
  unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
                    : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
                                 : AMDGPU::V_MOV_B32_e32;
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;

  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
  Register Reg = MRI.createVirtualRegister(VRC);
  DebugLoc DL = MBB->findDebugLoc(I);
  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
  MO.ChangeToRegister(Reg, false);
}

unsigned SIInstrInfo::buildExtractSubReg(
    MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
    const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
    unsigned SubIdx, const TargetRegisterClass *SubRC) const {
  if (!SuperReg.getReg().isVirtual())
    return RI.getSubReg(SuperReg.getReg(), SubIdx);

  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  Register SubReg = MRI.createVirtualRegister(SubRC);

  unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
      .addReg(SuperReg.getReg(), 0, NewSubIdx);
  return SubReg;
}

MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
    MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
    const MachineOperand &Op, const TargetRegisterClass *SuperRC,
    unsigned SubIdx, const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}
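
// For illustration (made-up value): splitting the 64-bit immediate
// 0x00000001FFFFFFFF with buildExtractSubRegOrImm yields
//   sub0 -> CreateImm(0xFFFFFFFF)   // low 32 bits
//   sub1 -> CreateImm(0x00000001)   // high 32 bits
// while register operands are instead split with a COPY of the requested
// subregister.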

// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
  assert(Inst.getNumExplicitOperands() == 3);
  MachineOperand Op1 = Inst.getOperand(1);
  Inst.removeOperand(1);
  Inst.addOperand(Op1);
}

bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
                                    const MCOperandInfo &OpInfo,
                                    const MachineOperand &MO) const {
  if (!MO.isReg())
    return false;

  Register Reg = MO.getReg();

  const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
  if (Reg.isPhysical())
    return DRC->contains(Reg);

  const TargetRegisterClass *RC = MRI.getRegClass(Reg);

  if (MO.getSubReg()) {
    const MachineFunction *MF = MO.getParent()->getParent()->getParent();
    const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
    if (!SuperRC)
      return false;

    DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
    if (!DRC)
      return false;
  }
  return RC->hasSuperClassEq(DRC);
}

bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
                                     const MCOperandInfo &OpInfo,
                                     const MachineOperand &MO) const {
  if (MO.isReg())
    return isLegalRegOperand(MRI, OpInfo, MO);

  // Handle non-register types that are treated like immediates.
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
  return true;
}
5799 bool SIInstrInfo::isOperandLegal(const MachineInstr
&MI
, unsigned OpIdx
,
5800 const MachineOperand
*MO
) const {
5801 const MachineFunction
&MF
= *MI
.getParent()->getParent();
5802 const MachineRegisterInfo
&MRI
= MF
.getRegInfo();
5803 const MCInstrDesc
&InstDesc
= MI
.getDesc();
5804 const MCOperandInfo
&OpInfo
= InstDesc
.operands()[OpIdx
];
5805 const TargetRegisterClass
*DefinedRC
=
5806 OpInfo
.RegClass
!= -1 ? RI
.getRegClass(OpInfo
.RegClass
) : nullptr;
5808 MO
= &MI
.getOperand(OpIdx
);
5810 int ConstantBusLimit
= ST
.getConstantBusLimit(MI
.getOpcode());
5811 int LiteralLimit
= !isVOP3(MI
) || ST
.hasVOP3Literal() ? 1 : 0;
5812 if (isVALU(MI
) && usesConstantBus(MRI
, *MO
, OpInfo
)) {
5813 if (!MO
->isReg() && !isInlineConstant(*MO
, OpInfo
) && !LiteralLimit
--)
5816 SmallDenseSet
<RegSubRegPair
> SGPRsUsed
;
5818 SGPRsUsed
.insert(RegSubRegPair(MO
->getReg(), MO
->getSubReg()));
5820 for (unsigned i
= 0, e
= MI
.getNumOperands(); i
!= e
; ++i
) {
5823 const MachineOperand
&Op
= MI
.getOperand(i
);
5825 RegSubRegPair
SGPR(Op
.getReg(), Op
.getSubReg());
5826 if (!SGPRsUsed
.count(SGPR
) &&
5827 // FIXME: This can access off the end of the operands() array.
5828 usesConstantBus(MRI
, Op
, InstDesc
.operands().begin()[i
])) {
5829 if (--ConstantBusLimit
<= 0)
5831 SGPRsUsed
.insert(SGPR
);
5833 } else if (AMDGPU::isSISrcOperand(InstDesc
, i
) &&
5834 !isInlineConstant(Op
, InstDesc
.operands()[i
])) {
5835 if (!LiteralLimit
--)
5837 if (--ConstantBusLimit
<= 0)
5841 } else if (ST
.hasNoF16PseudoScalarTransInlineConstants() && !MO
->isReg() &&
5842 isF16PseudoScalarTrans(MI
.getOpcode()) &&
5843 isInlineConstant(*MO
, OpInfo
)) {
5849 return OpInfo
.OperandType
== MCOI::OPERAND_UNKNOWN
;
5850 if (!isLegalRegOperand(MRI
, OpInfo
, *MO
))
5852 bool IsAGPR
= RI
.isAGPR(MRI
, MO
->getReg());
5853 if (IsAGPR
&& !ST
.hasMAIInsts())
5855 unsigned Opc
= MI
.getOpcode();
5857 (!ST
.hasGFX90AInsts() || !MRI
.reservedRegsFrozen()) &&
5858 (MI
.mayLoad() || MI
.mayStore() || isDS(Opc
) || isMIMG(Opc
)))
5860 // Atomics should have both vdst and vdata either vgpr or agpr.
5861 const int VDstIdx
= AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::vdst
);
5862 const int DataIdx
= AMDGPU::getNamedOperandIdx(Opc
,
5863 isDS(Opc
) ? AMDGPU::OpName::data0
: AMDGPU::OpName::vdata
);
5864 if ((int)OpIdx
== VDstIdx
&& DataIdx
!= -1 &&
5865 MI
.getOperand(DataIdx
).isReg() &&
5866 RI
.isAGPR(MRI
, MI
.getOperand(DataIdx
).getReg()) != IsAGPR
)
5868 if ((int)OpIdx
== DataIdx
) {
5869 if (VDstIdx
!= -1 &&
5870 RI
.isAGPR(MRI
, MI
.getOperand(VDstIdx
).getReg()) != IsAGPR
)
5872 // DS instructions with 2 src operands also must have tied RC.
5873 const int Data1Idx
= AMDGPU::getNamedOperandIdx(Opc
,
5874 AMDGPU::OpName::data1
);
5875 if (Data1Idx
!= -1 && MI
.getOperand(Data1Idx
).isReg() &&
5876 RI
.isAGPR(MRI
, MI
.getOperand(Data1Idx
).getReg()) != IsAGPR
)
5879 if (Opc
== AMDGPU::V_ACCVGPR_WRITE_B32_e64
&& !ST
.hasGFX90AInsts() &&
5880 (int)OpIdx
== AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::src0
) &&
5881 RI
.isSGPRReg(MRI
, MO
->getReg()))
5887 uint64_t Imm
= MO
->getImm();
5888 bool Is64BitFPOp
= OpInfo
.OperandType
== AMDGPU::OPERAND_REG_IMM_FP64
;
5889 bool Is64BitOp
= Is64BitFPOp
||
5890 OpInfo
.OperandType
== AMDGPU::OPERAND_REG_IMM_INT64
||
5891 OpInfo
.OperandType
== AMDGPU::OPERAND_REG_IMM_V2INT32
||
5892 OpInfo
.OperandType
== AMDGPU::OPERAND_REG_IMM_V2FP32
;
5894 !AMDGPU::isInlinableLiteral64(Imm
, ST
.hasInv2PiInlineImm())) {
5895 if (!AMDGPU::isValid32BitLiteral(Imm
, Is64BitFPOp
))
5898 // FIXME: We can use sign extended 64-bit literals, but only for signed
5899 // operands. At the moment we do not know if an operand is signed.
5900 // Such operand will be encoded as its low 32 bits and then either
5901 // correctly sign extended or incorrectly zero extended by HW.
5902 if (!Is64BitFPOp
&& (int32_t)Imm
< 0)
5907 // Handle non-register types that are treated like immediates.
5908 assert(MO
->isImm() || MO
->isTargetIndex() || MO
->isFI() || MO
->isGlobal());
5911 // This operand expects an immediate.
5915 return isImmOperandLegal(MI
, OpIdx
, *MO
);
}

void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &InstrDesc = get(Opc);

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
  // we need to only have one constant bus use before GFX10.
  bool HasImplicitSGPR = findImplicitSGPRRead(MI);
  if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
      RI.isSGPRReg(MRI, Src0.getReg()))
    legalizeOpWithMove(MI, Src0Idx);

  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
  // src0/src1 with V_READFIRSTLANE.
  if (Opc == AMDGPU::V_WRITELANE_B32) {
    const DebugLoc &DL = MI.getDebugLoc();
    if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
          .add(Src0);
      Src0.ChangeToRegister(Reg, false);
    }
    if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      const DebugLoc &DL = MI.getDebugLoc();
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
          .add(Src1);
      Src1.ChangeToRegister(Reg, false);
    }
    return;
  }

  // No VOP2 instructions support AGPRs.
  if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
    legalizeOpWithMove(MI, Src0Idx);

  if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
    legalizeOpWithMove(MI, Src1Idx);

  // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
  if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
      legalizeOpWithMove(MI, Src2Idx);
  }

  // VOP2 src0 instructions support all operand types, so we don't need to
  // check their legality. If src1 is already legal, we don't need to do
  // anything.
  if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
    return;

  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
  // select is uniform.
  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
      RI.isVGPR(MRI, Src1.getReg())) {
    Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    const DebugLoc &DL = MI.getDebugLoc();
    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src1);
    Src1.ChangeToRegister(Reg, false);
    return;
  }

  // We do not use commuteInstruction here because it is too aggressive and
  // will commute if it is possible. We only want to commute here if it
  // improves legality. This can be called a fairly large number of times so
  // don't waste compile time pointlessly swapping and checking legality again.
  if (HasImplicitSGPR || !MI.isCommutable()) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  // If src0 can be used as src1, commuting will make the operands legal.
  // Otherwise we have to give up and insert a move.
  //
  // TODO: Other immediate-like operand kinds could be commuted if there was a
  // MachineOperand::ChangeTo* for them.
  if ((!Src1.isImm() && !Src1.isReg()) ||
      !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int CommutedOpc = commuteOpcode(MI);
  if (CommutedOpc == -1) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  MI.setDesc(get(CommutedOpc));

  Register Src0Reg = Src0.getReg();
  unsigned Src0SubReg = Src0.getSubReg();
  bool Src0Kill = Src0.isKill();

  if (Src1.isImm())
    Src0.ChangeToImmediate(Src1.getImm());
  else if (Src1.isReg()) {
    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    Src0.setSubReg(Src1.getSubReg());
  } else
    llvm_unreachable("Should only have register or immediate operands");

  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
  Src1.setSubReg(Src0SubReg);
  fixImplicitOperands(MI);
}
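
// Illustrative example of the commute path above (virtual register numbers
// are hypothetical): for
//   %2:vgpr_32 = V_SUB_CO_U32_e32 %0:vgpr_32, %1:sgpr_32, implicit-def $vcc
// src1 may not read the constant bus, but instead of inserting a copy the
// operands are commuted into the reversed opcode, where the SGPR sits in
// src0 and is legal:
//   %2:vgpr_32 = V_SUBREV_CO_U32_e32 %1:sgpr_32, %0:vgpr_32, implicit-def $vcc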
6034 // Legalize VOP3 operands. All operand types are supported for any operand
6035 // but only one literal constant and only starting from GFX10.
6036 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo
&MRI
,
6037 MachineInstr
&MI
) const {
6038 unsigned Opc
= MI
.getOpcode();
6041 AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::src0
),
6042 AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::src1
),
6043 AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::src2
)
6046 if (Opc
== AMDGPU::V_PERMLANE16_B32_e64
||
6047 Opc
== AMDGPU::V_PERMLANEX16_B32_e64
) {
6048 // src1 and src2 must be scalar
6049 MachineOperand
&Src1
= MI
.getOperand(VOP3Idx
[1]);
6050 MachineOperand
&Src2
= MI
.getOperand(VOP3Idx
[2]);
6051 const DebugLoc
&DL
= MI
.getDebugLoc();
6052 if (Src1
.isReg() && !RI
.isSGPRClass(MRI
.getRegClass(Src1
.getReg()))) {
6053 Register Reg
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
6054 BuildMI(*MI
.getParent(), MI
, DL
, get(AMDGPU::V_READFIRSTLANE_B32
), Reg
)
6056 Src1
.ChangeToRegister(Reg
, false);
6058 if (Src2
.isReg() && !RI
.isSGPRClass(MRI
.getRegClass(Src2
.getReg()))) {
6059 Register Reg
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
6060 BuildMI(*MI
.getParent(), MI
, DL
, get(AMDGPU::V_READFIRSTLANE_B32
), Reg
)
6062 Src2
.ChangeToRegister(Reg
, false);
6066 // Find the one SGPR operand we are allowed to use.
6067 int ConstantBusLimit
= ST
.getConstantBusLimit(Opc
);
6068 int LiteralLimit
= ST
.hasVOP3Literal() ? 1 : 0;
6069 SmallDenseSet
<unsigned> SGPRsUsed
;
6070 Register SGPRReg
= findUsedSGPR(MI
, VOP3Idx
);
6072 SGPRsUsed
.insert(SGPRReg
);
6076 for (int Idx
: VOP3Idx
) {
6079 MachineOperand
&MO
= MI
.getOperand(Idx
);
6082 if (isInlineConstant(MO
, get(Opc
).operands()[Idx
]))
6085 if (LiteralLimit
> 0 && ConstantBusLimit
> 0) {
6093 legalizeOpWithMove(MI
, Idx
);
6097 if (RI
.hasAGPRs(RI
.getRegClassForReg(MRI
, MO
.getReg())) &&
6098 !isOperandLegal(MI
, Idx
, &MO
)) {
6099 legalizeOpWithMove(MI
, Idx
);
6103 if (!RI
.isSGPRClass(RI
.getRegClassForReg(MRI
, MO
.getReg())))
6104 continue; // VGPRs are legal
6106 // We can use one SGPR in each VOP3 instruction prior to GFX10
6107 // and two starting from GFX10.
6108 if (SGPRsUsed
.count(MO
.getReg()))
6110 if (ConstantBusLimit
> 0) {
6111 SGPRsUsed
.insert(MO
.getReg());
6116 // If we make it this far, then the operand is not legal and we must
6118 legalizeOpWithMove(MI
, Idx
);
6121 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6122 if ((Opc
== AMDGPU::V_FMAC_F32_e64
|| Opc
== AMDGPU::V_FMAC_F16_e64
) &&
6123 !RI
.isVGPR(MRI
, MI
.getOperand(VOP3Idx
[2]).getReg()))
6124 legalizeOpWithMove(MI
, VOP3Idx
[2]);
6127 Register
SIInstrInfo::readlaneVGPRToSGPR(
6128 Register SrcReg
, MachineInstr
&UseMI
, MachineRegisterInfo
&MRI
,
6129 const TargetRegisterClass
*DstRC
/*=nullptr*/) const {
6130 const TargetRegisterClass
*VRC
= MRI
.getRegClass(SrcReg
);
6131 const TargetRegisterClass
*SRC
= RI
.getEquivalentSGPRClass(VRC
);
6133 SRC
= RI
.getCommonSubClass(SRC
, DstRC
);
6135 Register DstReg
= MRI
.createVirtualRegister(SRC
);
6136 unsigned SubRegs
= RI
.getRegSizeInBits(*VRC
) / 32;
6138 if (RI
.hasAGPRs(VRC
)) {
6139 VRC
= RI
.getEquivalentVGPRClass(VRC
);
6140 Register NewSrcReg
= MRI
.createVirtualRegister(VRC
);
6141 BuildMI(*UseMI
.getParent(), UseMI
, UseMI
.getDebugLoc(),
6142 get(TargetOpcode::COPY
), NewSrcReg
)
6148 BuildMI(*UseMI
.getParent(), UseMI
, UseMI
.getDebugLoc(),
6149 get(AMDGPU::V_READFIRSTLANE_B32
), DstReg
)
6154 SmallVector
<Register
, 8> SRegs
;
6155 for (unsigned i
= 0; i
< SubRegs
; ++i
) {
6156 Register SGPR
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
6157 BuildMI(*UseMI
.getParent(), UseMI
, UseMI
.getDebugLoc(),
6158 get(AMDGPU::V_READFIRSTLANE_B32
), SGPR
)
6159 .addReg(SrcReg
, 0, RI
.getSubRegFromChannel(i
));
6160 SRegs
.push_back(SGPR
);
6163 MachineInstrBuilder MIB
=
6164 BuildMI(*UseMI
.getParent(), UseMI
, UseMI
.getDebugLoc(),
6165 get(AMDGPU::REG_SEQUENCE
), DstReg
);
6166 for (unsigned i
= 0; i
< SubRegs
; ++i
) {
6167 MIB
.addReg(SRegs
[i
]);
6168 MIB
.addImm(RI
.getSubRegFromChannel(i
));
  }
  return DstReg;
}

void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {

  // If the pointer is stored in VGPRs, then we need to move it to
  // SGPRs using v_readfirstlane. This is safe because we only select
  // loads with uniform pointers to SMRD instructions, so we know the
  // pointer value is uniform.
  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
    Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
    SBase->setReg(SGPR);
  }
  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
  if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
    Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
    SOff->setReg(SGPR);
  }
}
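
// Rough sketch of the rewrite performed above (hypothetical MIR; details vary
// by subtarget): a uniform pointer that was nevertheless selected into VGPRs,
//   %d:sreg_32 = S_LOAD_DWORD_IMM %p:vreg_64, 0, 0
// is fed through one V_READFIRSTLANE_B32 per 32-bit half plus a REG_SEQUENCE,
// so that sbase is an SGPR pair before the SMRD executes.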
6192 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr
&Inst
) const {
6193 unsigned Opc
= Inst
.getOpcode();
6194 int OldSAddrIdx
= AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::saddr
);
6195 if (OldSAddrIdx
< 0)
6198 assert(isSegmentSpecificFLAT(Inst
));
6200 int NewOpc
= AMDGPU::getGlobalVaddrOp(Opc
);
6202 NewOpc
= AMDGPU::getFlatScratchInstSVfromSS(Opc
);
6206 MachineRegisterInfo
&MRI
= Inst
.getMF()->getRegInfo();
6207 MachineOperand
&SAddr
= Inst
.getOperand(OldSAddrIdx
);
6208 if (RI
.isSGPRReg(MRI
, SAddr
.getReg()))
6211 int NewVAddrIdx
= AMDGPU::getNamedOperandIdx(NewOpc
, AMDGPU::OpName::vaddr
);
6212 if (NewVAddrIdx
< 0)
6215 int OldVAddrIdx
= AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::vaddr
);
6217 // Check vaddr, it shall be zero or absent.
6218 MachineInstr
*VAddrDef
= nullptr;
6219 if (OldVAddrIdx
>= 0) {
6220 MachineOperand
&VAddr
= Inst
.getOperand(OldVAddrIdx
);
6221 VAddrDef
= MRI
.getUniqueVRegDef(VAddr
.getReg());
6222 if (!VAddrDef
|| VAddrDef
->getOpcode() != AMDGPU::V_MOV_B32_e32
||
6223 !VAddrDef
->getOperand(1).isImm() ||
6224 VAddrDef
->getOperand(1).getImm() != 0)
6228 const MCInstrDesc
&NewDesc
= get(NewOpc
);
6229 Inst
.setDesc(NewDesc
);
6231 // Callers expect iterator to be valid after this call, so modify the
6232 // instruction in place.
6233 if (OldVAddrIdx
== NewVAddrIdx
) {
6234 MachineOperand
&NewVAddr
= Inst
.getOperand(NewVAddrIdx
);
6235 // Clear use list from the old vaddr holding a zero register.
6236 MRI
.removeRegOperandFromUseList(&NewVAddr
);
6237 MRI
.moveOperands(&NewVAddr
, &SAddr
, 1);
6238 Inst
.removeOperand(OldSAddrIdx
);
6239 // Update the use list with the pointer we have just moved from vaddr to
6240 // saddr position. Otherwise new vaddr will be missing from the use list.
6241 MRI
.removeRegOperandFromUseList(&NewVAddr
);
6242 MRI
.addRegOperandToUseList(&NewVAddr
);
6244 assert(OldSAddrIdx
== NewVAddrIdx
);
6246 if (OldVAddrIdx
>= 0) {
6247 int NewVDstIn
= AMDGPU::getNamedOperandIdx(NewOpc
,
6248 AMDGPU::OpName::vdst_in
);
6250 // removeOperand doesn't try to fixup tied operand indexes at it goes, so
6251 // it asserts. Untie the operands for now and retie them afterwards.
6252 if (NewVDstIn
!= -1) {
6253 int OldVDstIn
= AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::vdst_in
);
6254 Inst
.untieRegOperand(OldVDstIn
);
6257 Inst
.removeOperand(OldVAddrIdx
);
6259 if (NewVDstIn
!= -1) {
6260 int NewVDst
= AMDGPU::getNamedOperandIdx(NewOpc
, AMDGPU::OpName::vdst
);
6261 Inst
.tieOperands(NewVDst
, NewVDstIn
);
6266 if (VAddrDef
&& MRI
.use_nodbg_empty(VAddrDef
->getOperand(0).getReg()))
6267 VAddrDef
->eraseFromParent();
6272 // FIXME: Remove this when SelectionDAG is obsoleted.
6273 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo
&MRI
,
6274 MachineInstr
&MI
) const {
6275 if (!isSegmentSpecificFLAT(MI
))
6278 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6279 // thinks they are uniform, so a readfirstlane should be valid.
6280 MachineOperand
*SAddr
= getNamedOperand(MI
, AMDGPU::OpName::saddr
);
6281 if (!SAddr
|| RI
.isSGPRClass(MRI
.getRegClass(SAddr
->getReg())))
6284 if (moveFlatAddrToVGPR(MI
))
6287 const TargetRegisterClass
*DeclaredRC
= getRegClass(
6288 MI
.getDesc(), SAddr
->getOperandNo(), &RI
, *MI
.getParent()->getParent());
6290 Register ToSGPR
= readlaneVGPRToSGPR(SAddr
->getReg(), MI
, MRI
, DeclaredRC
);
6291 SAddr
->setReg(ToSGPR
);
6294 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock
&InsertMBB
,
6295 MachineBasicBlock::iterator I
,
6296 const TargetRegisterClass
*DstRC
,
6298 MachineRegisterInfo
&MRI
,
6299 const DebugLoc
&DL
) const {
6300 Register OpReg
= Op
.getReg();
6301 unsigned OpSubReg
= Op
.getSubReg();
6303 const TargetRegisterClass
*OpRC
= RI
.getSubClassWithSubReg(
6304 RI
.getRegClassForReg(MRI
, OpReg
), OpSubReg
);
6306 // Check if operand is already the correct register class.
6310 Register DstReg
= MRI
.createVirtualRegister(DstRC
);
6312 BuildMI(InsertMBB
, I
, DL
, get(AMDGPU::COPY
), DstReg
).addReg(OpReg
);
6315 MachineInstr
*Def
= MRI
.getVRegDef(OpReg
);
6319 // Try to eliminate the copy if it is copying an immediate value.
6320 if (Def
->isMoveImmediate() && DstRC
!= &AMDGPU::VReg_1RegClass
)
6321 foldImmediate(*Copy
, *Def
, OpReg
, &MRI
);
6323 bool ImpDef
= Def
->isImplicitDef();
6324 while (!ImpDef
&& Def
&& Def
->isCopy()) {
6325 if (Def
->getOperand(1).getReg().isPhysical())
6327 Def
= MRI
.getUniqueVRegDef(Def
->getOperand(1).getReg());
6328 ImpDef
= Def
&& Def
->isImplicitDef();
6330 if (!RI
.isSGPRClass(DstRC
) && !Copy
->readsRegister(AMDGPU::EXEC
, &RI
) &&
6332 Copy
.addReg(AMDGPU::EXEC
, RegState::Implicit
);
// Emit the actual waterfall loop, executing the wrapped instruction for each
// unique value of \p ScalarOps across all lanes. In the best case we execute 1
// iteration, in the worst case we execute 64 (once per lane).
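//
// Roughly, the emitted structure looks like this (illustrative only, wave64
// shown; register names are hypothetical):
//   LoopBB:
//     %s    = V_READFIRSTLANE_B32 %vgpr_op     ; pick one lane's value
//     %cond = V_CMP_EQ_U32_e64 %s, %vgpr_op    ; lanes holding that value
//     %save = S_AND_SAVEEXEC_B64 %cond         ; exec &= cond, old exec saved
//   BodyBB:
//     <the wrapped instruction, now reading %s>
//     $exec = S_XOR_B64_term $exec, %save      ; drop the lanes just handled
//     SI_WATERFALL_LOOP %LoopBB                ; repeat while any lane remains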
6339 emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo
&TII
,
6340 MachineRegisterInfo
&MRI
,
6341 MachineBasicBlock
&LoopBB
,
6342 MachineBasicBlock
&BodyBB
,
6344 ArrayRef
<MachineOperand
*> ScalarOps
) {
6345 MachineFunction
&MF
= *LoopBB
.getParent();
6346 const GCNSubtarget
&ST
= MF
.getSubtarget
<GCNSubtarget
>();
6347 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
6348 unsigned Exec
= ST
.isWave32() ? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
6349 unsigned SaveExecOpc
=
6350 ST
.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
: AMDGPU::S_AND_SAVEEXEC_B64
;
6351 unsigned XorTermOpc
=
6352 ST
.isWave32() ? AMDGPU::S_XOR_B32_term
: AMDGPU::S_XOR_B64_term
;
6354 ST
.isWave32() ? AMDGPU::S_AND_B32
: AMDGPU::S_AND_B64
;
6355 const auto *BoolXExecRC
= TRI
->getWaveMaskRegClass();
6357 MachineBasicBlock::iterator I
= LoopBB
.begin();
6360 for (MachineOperand
*ScalarOp
: ScalarOps
) {
6361 unsigned RegSize
= TRI
->getRegSizeInBits(ScalarOp
->getReg(), MRI
);
6362 unsigned NumSubRegs
= RegSize
/ 32;
6363 Register VScalarOp
= ScalarOp
->getReg();
6365 if (NumSubRegs
== 1) {
6366 Register CurReg
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
6368 BuildMI(LoopBB
, I
, DL
, TII
.get(AMDGPU::V_READFIRSTLANE_B32
), CurReg
)
6371 Register NewCondReg
= MRI
.createVirtualRegister(BoolXExecRC
);
6373 BuildMI(LoopBB
, I
, DL
, TII
.get(AMDGPU::V_CMP_EQ_U32_e64
), NewCondReg
)
6377 // Combine the comparison results with AND.
6378 if (!CondReg
) // First.
6379 CondReg
= NewCondReg
;
6380 else { // If not the first, we create an AND.
6381 Register AndReg
= MRI
.createVirtualRegister(BoolXExecRC
);
6382 BuildMI(LoopBB
, I
, DL
, TII
.get(AndOpc
), AndReg
)
6384 .addReg(NewCondReg
);
6388 // Update ScalarOp operand to use the SGPR ScalarOp.
6389 ScalarOp
->setReg(CurReg
);
6390 ScalarOp
->setIsKill();
6392 SmallVector
<Register
, 8> ReadlanePieces
;
6393 unsigned VScalarOpUndef
= getUndefRegState(ScalarOp
->isUndef());
6394 assert(NumSubRegs
% 2 == 0 && NumSubRegs
<= 32 &&
6395 "Unhandled register size");
6397 for (unsigned Idx
= 0; Idx
< NumSubRegs
; Idx
+= 2) {
6398 Register CurRegLo
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
6399 Register CurRegHi
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
6401 // Read the next variant <- also loop target.
6402 BuildMI(LoopBB
, I
, DL
, TII
.get(AMDGPU::V_READFIRSTLANE_B32
), CurRegLo
)
6403 .addReg(VScalarOp
, VScalarOpUndef
, TRI
->getSubRegFromChannel(Idx
));
6405 // Read the next variant <- also loop target.
6406 BuildMI(LoopBB
, I
, DL
, TII
.get(AMDGPU::V_READFIRSTLANE_B32
), CurRegHi
)
6407 .addReg(VScalarOp
, VScalarOpUndef
,
6408 TRI
->getSubRegFromChannel(Idx
+ 1));
6410 ReadlanePieces
.push_back(CurRegLo
);
6411 ReadlanePieces
.push_back(CurRegHi
);
6413 // Comparison is to be done as 64-bit.
6414 Register CurReg
= MRI
.createVirtualRegister(&AMDGPU::SGPR_64RegClass
);
6415 BuildMI(LoopBB
, I
, DL
, TII
.get(AMDGPU::REG_SEQUENCE
), CurReg
)
6417 .addImm(AMDGPU::sub0
)
6419 .addImm(AMDGPU::sub1
);
6421 Register NewCondReg
= MRI
.createVirtualRegister(BoolXExecRC
);
6422 auto Cmp
= BuildMI(LoopBB
, I
, DL
, TII
.get(AMDGPU::V_CMP_EQ_U64_e64
),
6425 if (NumSubRegs
<= 2)
6426 Cmp
.addReg(VScalarOp
);
6428 Cmp
.addReg(VScalarOp
, VScalarOpUndef
,
6429 TRI
->getSubRegFromChannel(Idx
, 2));
6431 // Combine the comparison results with AND.
6432 if (!CondReg
) // First.
6433 CondReg
= NewCondReg
;
6434 else { // If not the first, we create an AND.
6435 Register AndReg
= MRI
.createVirtualRegister(BoolXExecRC
);
6436 BuildMI(LoopBB
, I
, DL
, TII
.get(AndOpc
), AndReg
)
6438 .addReg(NewCondReg
);
6443 const auto *SScalarOpRC
=
6444 TRI
->getEquivalentSGPRClass(MRI
.getRegClass(VScalarOp
));
6445 Register SScalarOp
= MRI
.createVirtualRegister(SScalarOpRC
);
6447 // Build scalar ScalarOp.
6449 BuildMI(LoopBB
, I
, DL
, TII
.get(AMDGPU::REG_SEQUENCE
), SScalarOp
);
6450 unsigned Channel
= 0;
6451 for (Register Piece
: ReadlanePieces
) {
6452 Merge
.addReg(Piece
).addImm(TRI
->getSubRegFromChannel(Channel
++));
6455 // Update ScalarOp operand to use the SGPR ScalarOp.
6456 ScalarOp
->setReg(SScalarOp
);
6457 ScalarOp
->setIsKill();
6461 Register SaveExec
= MRI
.createVirtualRegister(BoolXExecRC
);
6462 MRI
.setSimpleHint(SaveExec
, CondReg
);
6464 // Update EXEC to matching lanes, saving original to SaveExec.
6465 BuildMI(LoopBB
, I
, DL
, TII
.get(SaveExecOpc
), SaveExec
)
6466 .addReg(CondReg
, RegState::Kill
);
6468 // The original instruction is here; we insert the terminators after it.
6471 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6472 BuildMI(BodyBB
, I
, DL
, TII
.get(XorTermOpc
), Exec
)
6476 BuildMI(BodyBB
, I
, DL
, TII
.get(AMDGPU::SI_WATERFALL_LOOP
)).addMBB(&LoopBB
);
6479 // Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6480 // with SGPRs by iterating over all unique values across all lanes.
6481 // Returns the loop basic block that now contains \p MI.
6482 static MachineBasicBlock
*
6483 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo
&TII
, MachineInstr
&MI
,
6484 ArrayRef
<MachineOperand
*> ScalarOps
,
6485 MachineDominatorTree
*MDT
,
6486 MachineBasicBlock::iterator Begin
= nullptr,
6487 MachineBasicBlock::iterator End
= nullptr) {
6488 MachineBasicBlock
&MBB
= *MI
.getParent();
6489 MachineFunction
&MF
= *MBB
.getParent();
6490 const GCNSubtarget
&ST
= MF
.getSubtarget
<GCNSubtarget
>();
6491 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
6492 MachineRegisterInfo
&MRI
= MF
.getRegInfo();
6493 if (!Begin
.isValid())
6495 if (!End
.isValid()) {
6499 const DebugLoc
&DL
= MI
.getDebugLoc();
6500 unsigned Exec
= ST
.isWave32() ? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
6501 unsigned MovExecOpc
= ST
.isWave32() ? AMDGPU::S_MOV_B32
: AMDGPU::S_MOV_B64
;
6502 const auto *BoolXExecRC
= TRI
->getWaveMaskRegClass();
6504 // Save SCC. Waterfall Loop may overwrite SCC.
6505 Register SaveSCCReg
;
6507 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6508 // rather than unlimited scan everywhere
6510 MBB
.computeRegisterLiveness(TRI
, AMDGPU::SCC
, MI
,
6511 std::numeric_limits
<unsigned>::max()) !=
6512 MachineBasicBlock::LQR_Dead
;
6514 SaveSCCReg
= MRI
.createVirtualRegister(&AMDGPU::SReg_32RegClass
);
6515 BuildMI(MBB
, Begin
, DL
, TII
.get(AMDGPU::S_CSELECT_B32
), SaveSCCReg
)
6520 Register SaveExec
= MRI
.createVirtualRegister(BoolXExecRC
);
6522 // Save the EXEC mask
6523 BuildMI(MBB
, Begin
, DL
, TII
.get(MovExecOpc
), SaveExec
).addReg(Exec
);
6525 // Killed uses in the instruction we are waterfalling around will be
6526 // incorrect due to the added control-flow.
6527 MachineBasicBlock::iterator AfterMI
= MI
;
6529 for (auto I
= Begin
; I
!= AfterMI
; I
++) {
6530 for (auto &MO
: I
->all_uses())
6531 MRI
.clearKillFlags(MO
.getReg());
6534 // To insert the loop we need to split the block. Move everything after this
6535 // point to a new block, and insert a new empty block between the two.
6536 MachineBasicBlock
*LoopBB
= MF
.CreateMachineBasicBlock();
6537 MachineBasicBlock
*BodyBB
= MF
.CreateMachineBasicBlock();
6538 MachineBasicBlock
*RemainderBB
= MF
.CreateMachineBasicBlock();
6539 MachineFunction::iterator
MBBI(MBB
);
6542 MF
.insert(MBBI
, LoopBB
);
6543 MF
.insert(MBBI
, BodyBB
);
6544 MF
.insert(MBBI
, RemainderBB
);
6546 LoopBB
->addSuccessor(BodyBB
);
6547 BodyBB
->addSuccessor(LoopBB
);
6548 BodyBB
->addSuccessor(RemainderBB
);
6550 // Move Begin to MI to the BodyBB, and the remainder of the block to
6552 RemainderBB
->transferSuccessorsAndUpdatePHIs(&MBB
);
6553 RemainderBB
->splice(RemainderBB
->begin(), &MBB
, End
, MBB
.end());
6554 BodyBB
->splice(BodyBB
->begin(), &MBB
, Begin
, MBB
.end());
6556 MBB
.addSuccessor(LoopBB
);
6558 // Update dominators. We know that MBB immediately dominates LoopBB, that
6559 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6560 // RemainderBB. RemainderBB immediately dominates all of the successors
6561 // transferred to it from MBB that MBB used to properly dominate.
6563 MDT
->addNewBlock(LoopBB
, &MBB
);
6564 MDT
->addNewBlock(BodyBB
, LoopBB
);
6565 MDT
->addNewBlock(RemainderBB
, BodyBB
);
6566 for (auto &Succ
: RemainderBB
->successors()) {
6567 if (MDT
->properlyDominates(&MBB
, Succ
)) {
6568 MDT
->changeImmediateDominator(Succ
, RemainderBB
);
6573 emitLoadScalarOpsFromVGPRLoop(TII
, MRI
, *LoopBB
, *BodyBB
, DL
, ScalarOps
);
6575 MachineBasicBlock::iterator First
= RemainderBB
->begin();
6578 BuildMI(*RemainderBB
, First
, DL
, TII
.get(AMDGPU::S_CMP_LG_U32
))
6579 .addReg(SaveSCCReg
, RegState::Kill
)
6583 // Restore the EXEC mask
6584 BuildMI(*RemainderBB
, First
, DL
, TII
.get(MovExecOpc
), Exec
).addReg(SaveExec
);
6588 // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
6589 static std::tuple
<unsigned, unsigned>
6590 extractRsrcPtr(const SIInstrInfo
&TII
, MachineInstr
&MI
, MachineOperand
&Rsrc
) {
6591 MachineBasicBlock
&MBB
= *MI
.getParent();
6592 MachineFunction
&MF
= *MBB
.getParent();
6593 MachineRegisterInfo
&MRI
= MF
.getRegInfo();
6595 // Extract the ptr from the resource descriptor.
6597 TII
.buildExtractSubReg(MI
, MRI
, Rsrc
, &AMDGPU::VReg_128RegClass
,
6598 AMDGPU::sub0_sub1
, &AMDGPU::VReg_64RegClass
);
6600 // Create an empty resource descriptor
6601 Register Zero64
= MRI
.createVirtualRegister(&AMDGPU::SReg_64RegClass
);
6602 Register SRsrcFormatLo
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
6603 Register SRsrcFormatHi
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
6604 Register NewSRsrc
= MRI
.createVirtualRegister(&AMDGPU::SGPR_128RegClass
);
6605 uint64_t RsrcDataFormat
= TII
.getDefaultRsrcDataFormat();
6608 BuildMI(MBB
, MI
, MI
.getDebugLoc(), TII
.get(AMDGPU::S_MOV_B64
), Zero64
)
6611 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6612 BuildMI(MBB
, MI
, MI
.getDebugLoc(), TII
.get(AMDGPU::S_MOV_B32
), SRsrcFormatLo
)
6613 .addImm(Lo_32(RsrcDataFormat
));
6615 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6616 BuildMI(MBB
, MI
, MI
.getDebugLoc(), TII
.get(AMDGPU::S_MOV_B32
), SRsrcFormatHi
)
6617 .addImm(Hi_32(RsrcDataFormat
));
6619 // NewSRsrc = {Zero64, SRsrcFormat}
6620 BuildMI(MBB
, MI
, MI
.getDebugLoc(), TII
.get(AMDGPU::REG_SEQUENCE
), NewSRsrc
)
6622 .addImm(AMDGPU::sub0_sub1
)
6623 .addReg(SRsrcFormatLo
)
6624 .addImm(AMDGPU::sub2
)
6625 .addReg(SRsrcFormatHi
)
6626 .addImm(AMDGPU::sub3
);
6628 return std::tuple(RsrcPtr
, NewSRsrc
);
6632 SIInstrInfo::legalizeOperands(MachineInstr
&MI
,
6633 MachineDominatorTree
*MDT
) const {
6634 MachineFunction
&MF
= *MI
.getParent()->getParent();
6635 MachineRegisterInfo
&MRI
= MF
.getRegInfo();
6636 MachineBasicBlock
*CreatedBB
= nullptr;
6639 if (isVOP2(MI
) || isVOPC(MI
)) {
6640 legalizeOperandsVOP2(MRI
, MI
);
6646 legalizeOperandsVOP3(MRI
, MI
);
6652 legalizeOperandsSMRD(MRI
, MI
);
6658 legalizeOperandsFLAT(MRI
, MI
);
6662 // Legalize REG_SEQUENCE and PHI
6663 // The register class of the operands much be the same type as the register
6664 // class of the output.
6665 if (MI
.getOpcode() == AMDGPU::PHI
) {
6666 const TargetRegisterClass
*RC
= nullptr, *SRC
= nullptr, *VRC
= nullptr;
6667 for (unsigned i
= 1, e
= MI
.getNumOperands(); i
!= e
; i
+= 2) {
6668 if (!MI
.getOperand(i
).isReg() || !MI
.getOperand(i
).getReg().isVirtual())
6670 const TargetRegisterClass
*OpRC
=
6671 MRI
.getRegClass(MI
.getOperand(i
).getReg());
6672 if (RI
.hasVectorRegisters(OpRC
)) {
6679 // If any of the operands are VGPR registers, then they all most be
6680 // otherwise we will create illegal VGPR->SGPR copies when legalizing
6682 if (VRC
|| !RI
.isSGPRClass(getOpRegClass(MI
, 0))) {
6685 if (getOpRegClass(MI
, 0) == &AMDGPU::VReg_1RegClass
) {
6686 VRC
= &AMDGPU::VReg_1RegClass
;
6688 VRC
= RI
.isAGPRClass(getOpRegClass(MI
, 0))
6689 ? RI
.getEquivalentAGPRClass(SRC
)
6690 : RI
.getEquivalentVGPRClass(SRC
);
6692 VRC
= RI
.isAGPRClass(getOpRegClass(MI
, 0))
6693 ? RI
.getEquivalentAGPRClass(VRC
)
6694 : RI
.getEquivalentVGPRClass(VRC
);
6701 // Update all the operands so they have the same type.
6702 for (unsigned I
= 1, E
= MI
.getNumOperands(); I
!= E
; I
+= 2) {
6703 MachineOperand
&Op
= MI
.getOperand(I
);
6704 if (!Op
.isReg() || !Op
.getReg().isVirtual())
6707 // MI is a PHI instruction.
6708 MachineBasicBlock
*InsertBB
= MI
.getOperand(I
+ 1).getMBB();
6709 MachineBasicBlock::iterator Insert
= InsertBB
->getFirstTerminator();
6711 // Avoid creating no-op copies with the same src and dst reg class. These
6712 // confuse some of the machine passes.
6713 legalizeGenericOperand(*InsertBB
, Insert
, RC
, Op
, MRI
, MI
.getDebugLoc());
6717 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6718 // VGPR dest type and SGPR sources, insert copies so all operands are
6719 // VGPRs. This seems to help operand folding / the register coalescer.
6720 if (MI
.getOpcode() == AMDGPU::REG_SEQUENCE
) {
6721 MachineBasicBlock
*MBB
= MI
.getParent();
6722 const TargetRegisterClass
*DstRC
= getOpRegClass(MI
, 0);
6723 if (RI
.hasVGPRs(DstRC
)) {
6724 // Update all the operands so they are VGPR register classes. These may
6725 // not be the same register class because REG_SEQUENCE supports mixing
6726 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6727 for (unsigned I
= 1, E
= MI
.getNumOperands(); I
!= E
; I
+= 2) {
6728 MachineOperand
&Op
= MI
.getOperand(I
);
6729 if (!Op
.isReg() || !Op
.getReg().isVirtual())
6732 const TargetRegisterClass
*OpRC
= MRI
.getRegClass(Op
.getReg());
6733 const TargetRegisterClass
*VRC
= RI
.getEquivalentVGPRClass(OpRC
);
6737 legalizeGenericOperand(*MBB
, MI
, VRC
, Op
, MRI
, MI
.getDebugLoc());
6745 // Legalize INSERT_SUBREG
6746 // src0 must have the same register class as dst
6747 if (MI
.getOpcode() == AMDGPU::INSERT_SUBREG
) {
6748 Register Dst
= MI
.getOperand(0).getReg();
6749 Register Src0
= MI
.getOperand(1).getReg();
6750 const TargetRegisterClass
*DstRC
= MRI
.getRegClass(Dst
);
6751 const TargetRegisterClass
*Src0RC
= MRI
.getRegClass(Src0
);
6752 if (DstRC
!= Src0RC
) {
6753 MachineBasicBlock
*MBB
= MI
.getParent();
6754 MachineOperand
&Op
= MI
.getOperand(1);
6755 legalizeGenericOperand(*MBB
, MI
, DstRC
, Op
, MRI
, MI
.getDebugLoc());
6760 // Legalize SI_INIT_M0
6761 if (MI
.getOpcode() == AMDGPU::SI_INIT_M0
) {
6762 MachineOperand
&Src
= MI
.getOperand(0);
6763 if (Src
.isReg() && RI
.hasVectorRegisters(MRI
.getRegClass(Src
.getReg())))
6764 Src
.setReg(readlaneVGPRToSGPR(Src
.getReg(), MI
, MRI
));
6768 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6769 if (MI
.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32
||
6770 MI
.getOpcode() == AMDGPU::S_QUADMASK_B32
||
6771 MI
.getOpcode() == AMDGPU::S_QUADMASK_B64
||
6772 MI
.getOpcode() == AMDGPU::S_WQM_B32
||
6773 MI
.getOpcode() == AMDGPU::S_WQM_B64
||
6774 MI
.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32
||
6775 MI
.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64
) {
6776 MachineOperand
&Src
= MI
.getOperand(1);
6777 if (Src
.isReg() && RI
.hasVectorRegisters(MRI
.getRegClass(Src
.getReg())))
6778 Src
.setReg(readlaneVGPRToSGPR(Src
.getReg(), MI
, MRI
));
6782 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6784 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6785 // scratch memory access. In both cases, the legalization never involves
6786 // conversion to the addr64 form.
6787 if (isImage(MI
) || (AMDGPU::isGraphics(MF
.getFunction().getCallingConv()) &&
6788 (isMUBUF(MI
) || isMTBUF(MI
)))) {
6789 int RSrcOpName
= (isVIMAGE(MI
) || isVSAMPLE(MI
)) ? AMDGPU::OpName::rsrc
6790 : AMDGPU::OpName::srsrc
;
6791 MachineOperand
*SRsrc
= getNamedOperand(MI
, RSrcOpName
);
6792 if (SRsrc
&& !RI
.isSGPRClass(MRI
.getRegClass(SRsrc
->getReg())))
6793 CreatedBB
= loadMBUFScalarOperandsFromVGPR(*this, MI
, {SRsrc
}, MDT
);
6795 int SampOpName
= isMIMG(MI
) ? AMDGPU::OpName::ssamp
: AMDGPU::OpName::samp
;
6796 MachineOperand
*SSamp
= getNamedOperand(MI
, SampOpName
);
6797 if (SSamp
&& !RI
.isSGPRClass(MRI
.getRegClass(SSamp
->getReg())))
6798 CreatedBB
= loadMBUFScalarOperandsFromVGPR(*this, MI
, {SSamp
}, MDT
);
6804 if (MI
.getOpcode() == AMDGPU::SI_CALL_ISEL
) {
6805 MachineOperand
*Dest
= &MI
.getOperand(0);
6806 if (!RI
.isSGPRClass(MRI
.getRegClass(Dest
->getReg()))) {
6807 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
6808 // following copies, we also need to move copies from and to physical
6809 // registers into the loop block.
6810 unsigned FrameSetupOpcode
= getCallFrameSetupOpcode();
6811 unsigned FrameDestroyOpcode
= getCallFrameDestroyOpcode();
6813 // Also move the copies to physical registers into the loop block
6814 MachineBasicBlock
&MBB
= *MI
.getParent();
6815 MachineBasicBlock::iterator
Start(&MI
);
6816 while (Start
->getOpcode() != FrameSetupOpcode
)
6818 MachineBasicBlock::iterator
End(&MI
);
6819 while (End
->getOpcode() != FrameDestroyOpcode
)
6821 // Also include following copies of the return value
6823 while (End
!= MBB
.end() && End
->isCopy() && End
->getOperand(1).isReg() &&
6824 MI
.definesRegister(End
->getOperand(1).getReg(), /*TRI=*/nullptr))
6827 loadMBUFScalarOperandsFromVGPR(*this, MI
, {Dest
}, MDT
, Start
, End
);
6831 // Legalize s_sleep_var.
6832 if (MI
.getOpcode() == AMDGPU::S_SLEEP_VAR
) {
6833 const DebugLoc
&DL
= MI
.getDebugLoc();
6834 Register Reg
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
6836 AMDGPU::getNamedOperandIdx(MI
.getOpcode(), AMDGPU::OpName::src0
);
6837 MachineOperand
&Src0
= MI
.getOperand(Src0Idx
);
6838 BuildMI(*MI
.getParent(), MI
, DL
, get(AMDGPU::V_READFIRSTLANE_B32
), Reg
)
6840 Src0
.ChangeToRegister(Reg
, false);
6844 // Legalize MUBUF instructions.
6845 bool isSoffsetLegal
= true;
6847 AMDGPU::getNamedOperandIdx(MI
.getOpcode(), AMDGPU::OpName::soffset
);
6848 if (SoffsetIdx
!= -1) {
6849 MachineOperand
*Soffset
= &MI
.getOperand(SoffsetIdx
);
6850 if (Soffset
->isReg() && Soffset
->getReg().isVirtual() &&
6851 !RI
.isSGPRClass(MRI
.getRegClass(Soffset
->getReg()))) {
6852 isSoffsetLegal
= false;
6856 bool isRsrcLegal
= true;
6858 AMDGPU::getNamedOperandIdx(MI
.getOpcode(), AMDGPU::OpName::srsrc
);
6859 if (RsrcIdx
!= -1) {
6860 MachineOperand
*Rsrc
= &MI
.getOperand(RsrcIdx
);
6861 if (Rsrc
->isReg() && !RI
.isSGPRClass(MRI
.getRegClass(Rsrc
->getReg()))) {
6862 isRsrcLegal
= false;
6866 // The operands are legal.
6867 if (isRsrcLegal
&& isSoffsetLegal
)
6871 // Legalize a VGPR Rsrc
6873 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6874 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6875 // a zero-value SRsrc.
6877 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6878 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6881 // Otherwise we are on non-ADDR64 hardware, and/or we have
6882 // idxen/offen/bothen and we fall back to a waterfall loop.
6884 MachineOperand
*Rsrc
= &MI
.getOperand(RsrcIdx
);
6885 MachineBasicBlock
&MBB
= *MI
.getParent();
6887 MachineOperand
*VAddr
= getNamedOperand(MI
, AMDGPU::OpName::vaddr
);
6888 if (VAddr
&& AMDGPU::getIfAddr64Inst(MI
.getOpcode()) != -1) {
6889 // This is already an ADDR64 instruction so we need to add the pointer
6890 // extracted from the resource descriptor to the current value of VAddr.
6891 Register NewVAddrLo
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
6892 Register NewVAddrHi
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
6893 Register NewVAddr
= MRI
.createVirtualRegister(&AMDGPU::VReg_64RegClass
);
6895 const auto *BoolXExecRC
= RI
.getWaveMaskRegClass();
6896 Register CondReg0
= MRI
.createVirtualRegister(BoolXExecRC
);
6897 Register CondReg1
= MRI
.createVirtualRegister(BoolXExecRC
);
6899 unsigned RsrcPtr
, NewSRsrc
;
6900 std::tie(RsrcPtr
, NewSRsrc
) = extractRsrcPtr(*this, MI
, *Rsrc
);
6902 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
6903 const DebugLoc
&DL
= MI
.getDebugLoc();
6904 BuildMI(MBB
, MI
, DL
, get(AMDGPU::V_ADD_CO_U32_e64
), NewVAddrLo
)
6906 .addReg(RsrcPtr
, 0, AMDGPU::sub0
)
6907 .addReg(VAddr
->getReg(), 0, AMDGPU::sub0
)
6910 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
6911 BuildMI(MBB
, MI
, DL
, get(AMDGPU::V_ADDC_U32_e64
), NewVAddrHi
)
6912 .addDef(CondReg1
, RegState::Dead
)
6913 .addReg(RsrcPtr
, 0, AMDGPU::sub1
)
6914 .addReg(VAddr
->getReg(), 0, AMDGPU::sub1
)
6915 .addReg(CondReg0
, RegState::Kill
)
6918 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6919 BuildMI(MBB
, MI
, MI
.getDebugLoc(), get(AMDGPU::REG_SEQUENCE
), NewVAddr
)
6921 .addImm(AMDGPU::sub0
)
6923 .addImm(AMDGPU::sub1
);
6925 VAddr
->setReg(NewVAddr
);
6926 Rsrc
->setReg(NewSRsrc
);
6927 } else if (!VAddr
&& ST
.hasAddr64()) {
6928 // This instructions is the _OFFSET variant, so we need to convert it to
6930 assert(ST
.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS
&&
6931 "FIXME: Need to emit flat atomics here");
6933 unsigned RsrcPtr
, NewSRsrc
;
6934 std::tie(RsrcPtr
, NewSRsrc
) = extractRsrcPtr(*this, MI
, *Rsrc
);
6936 Register NewVAddr
= MRI
.createVirtualRegister(&AMDGPU::VReg_64RegClass
);
6937 MachineOperand
*VData
= getNamedOperand(MI
, AMDGPU::OpName::vdata
);
6938 MachineOperand
*Offset
= getNamedOperand(MI
, AMDGPU::OpName::offset
);
6939 MachineOperand
*SOffset
= getNamedOperand(MI
, AMDGPU::OpName::soffset
);
6940 unsigned Addr64Opcode
= AMDGPU::getAddr64Inst(MI
.getOpcode());
6942 // Atomics with return have an additional tied operand and are
6943 // missing some of the special bits.
6944 MachineOperand
*VDataIn
= getNamedOperand(MI
, AMDGPU::OpName::vdata_in
);
6945 MachineInstr
*Addr64
;
6948 // Regular buffer load / store.
6949 MachineInstrBuilder MIB
=
6950 BuildMI(MBB
, MI
, MI
.getDebugLoc(), get(Addr64Opcode
))
6957 if (const MachineOperand
*CPol
=
6958 getNamedOperand(MI
, AMDGPU::OpName::cpol
)) {
6959 MIB
.addImm(CPol
->getImm());
6962 if (const MachineOperand
*TFE
=
6963 getNamedOperand(MI
, AMDGPU::OpName::tfe
)) {
6964 MIB
.addImm(TFE
->getImm());
6967 MIB
.addImm(getNamedImmOperand(MI
, AMDGPU::OpName::swz
));
6969 MIB
.cloneMemRefs(MI
);
6972 // Atomics with return.
6973 Addr64
= BuildMI(MBB
, MI
, MI
.getDebugLoc(), get(Addr64Opcode
))
6980 .addImm(getNamedImmOperand(MI
, AMDGPU::OpName::cpol
))
6984 MI
.removeFromParent();
6986 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6987 BuildMI(MBB
, Addr64
, Addr64
->getDebugLoc(), get(AMDGPU::REG_SEQUENCE
),
6989 .addReg(RsrcPtr
, 0, AMDGPU::sub0
)
6990 .addImm(AMDGPU::sub0
)
6991 .addReg(RsrcPtr
, 0, AMDGPU::sub1
)
6992 .addImm(AMDGPU::sub1
);
6994 // Legalize a VGPR Rsrc and soffset together.
6995 if (!isSoffsetLegal
) {
6996 MachineOperand
*Soffset
= getNamedOperand(MI
, AMDGPU::OpName::soffset
);
6998 loadMBUFScalarOperandsFromVGPR(*this, MI
, {Rsrc
, Soffset
}, MDT
);
7001 CreatedBB
= loadMBUFScalarOperandsFromVGPR(*this, MI
, {Rsrc
}, MDT
);
7006 // Legalize a VGPR soffset.
7007 if (!isSoffsetLegal
) {
7008 MachineOperand
*Soffset
= getNamedOperand(MI
, AMDGPU::OpName::soffset
);
7009 CreatedBB
= loadMBUFScalarOperandsFromVGPR(*this, MI
, {Soffset
}, MDT
);
7015 void SIInstrWorklist::insert(MachineInstr
*MI
) {
7016 InstrList
.insert(MI
);
7017 // Add MBUF instructiosn to deferred list.
7019 AMDGPU::getNamedOperandIdx(MI
->getOpcode(), AMDGPU::OpName::srsrc
);
7020 if (RsrcIdx
!= -1) {
7021 DeferredList
.insert(MI
);
7025 bool SIInstrWorklist::isDeferred(MachineInstr
*MI
) {
7026 return DeferredList
.contains(MI
);
7029 void SIInstrInfo::moveToVALU(SIInstrWorklist
&Worklist
,
7030 MachineDominatorTree
*MDT
) const {
7032 while (!Worklist
.empty()) {
7033 MachineInstr
&Inst
= *Worklist
.top();
7034 Worklist
.erase_top();
7035 // Skip MachineInstr in the deferred list.
7036 if (Worklist
.isDeferred(&Inst
))
7038 moveToVALUImpl(Worklist
, MDT
, Inst
);
7041 // Deferred list of instructions will be processed once
7042 // all the MachineInstr in the worklist are done.
7043 for (MachineInstr
*Inst
: Worklist
.getDeferredList()) {
7044 moveToVALUImpl(Worklist
, MDT
, *Inst
);
7045 assert(Worklist
.empty() &&
7046 "Deferred MachineInstr are not supposed to re-populate worklist");
7050 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist
&Worklist
,
7051 MachineDominatorTree
*MDT
,
7052 MachineInstr
&Inst
) const {
7054 MachineBasicBlock
*MBB
= Inst
.getParent();
7057 MachineRegisterInfo
&MRI
= MBB
->getParent()->getRegInfo();
7058 unsigned Opcode
= Inst
.getOpcode();
7059 unsigned NewOpcode
= getVALUOp(Inst
);
7060 // Handle some special cases
7064 case AMDGPU::S_ADD_U64_PSEUDO
:
7065 NewOpcode
= AMDGPU::V_ADD_U64_PSEUDO
;
7067 case AMDGPU::S_SUB_U64_PSEUDO
:
7068 NewOpcode
= AMDGPU::V_SUB_U64_PSEUDO
;
7070 case AMDGPU::S_ADD_I32
:
7071 case AMDGPU::S_SUB_I32
: {
7072 // FIXME: The u32 versions currently selected use the carry.
7074 MachineBasicBlock
*CreatedBBTmp
= nullptr;
7075 std::tie(Changed
, CreatedBBTmp
) = moveScalarAddSub(Worklist
, Inst
, MDT
);
7083 case AMDGPU::S_MUL_U64
:
7084 // Split s_mul_u64 in 32-bit vector multiplications.
7085 splitScalarSMulU64(Worklist
, Inst
, MDT
);
7086 Inst
.eraseFromParent();
7089 case AMDGPU::S_MUL_U64_U32_PSEUDO
:
7090 case AMDGPU::S_MUL_I64_I32_PSEUDO
:
7091 // This is a special case of s_mul_u64 where all the operands are either
7092 // zero extended or sign extended.
7093 splitScalarSMulPseudo(Worklist
, Inst
, MDT
);
7094 Inst
.eraseFromParent();
7097 case AMDGPU::S_AND_B64
:
7098 splitScalar64BitBinaryOp(Worklist
, Inst
, AMDGPU::S_AND_B32
, MDT
);
7099 Inst
.eraseFromParent();
7102 case AMDGPU::S_OR_B64
:
7103 splitScalar64BitBinaryOp(Worklist
, Inst
, AMDGPU::S_OR_B32
, MDT
);
7104 Inst
.eraseFromParent();
7107 case AMDGPU::S_XOR_B64
:
7108 splitScalar64BitBinaryOp(Worklist
, Inst
, AMDGPU::S_XOR_B32
, MDT
);
7109 Inst
.eraseFromParent();
7112 case AMDGPU::S_NAND_B64
:
7113 splitScalar64BitBinaryOp(Worklist
, Inst
, AMDGPU::S_NAND_B32
, MDT
);
7114 Inst
.eraseFromParent();
7117 case AMDGPU::S_NOR_B64
:
7118 splitScalar64BitBinaryOp(Worklist
, Inst
, AMDGPU::S_NOR_B32
, MDT
);
7119 Inst
.eraseFromParent();
7122 case AMDGPU::S_XNOR_B64
:
7123 if (ST
.hasDLInsts())
7124 splitScalar64BitBinaryOp(Worklist
, Inst
, AMDGPU::S_XNOR_B32
, MDT
);
7126 splitScalar64BitXnor(Worklist
, Inst
, MDT
);
7127 Inst
.eraseFromParent();
7130 case AMDGPU::S_ANDN2_B64
:
7131 splitScalar64BitBinaryOp(Worklist
, Inst
, AMDGPU::S_ANDN2_B32
, MDT
);
7132 Inst
.eraseFromParent();
7135 case AMDGPU::S_ORN2_B64
:
7136 splitScalar64BitBinaryOp(Worklist
, Inst
, AMDGPU::S_ORN2_B32
, MDT
);
7137 Inst
.eraseFromParent();
7140 case AMDGPU::S_BREV_B64
:
7141 splitScalar64BitUnaryOp(Worklist
, Inst
, AMDGPU::S_BREV_B32
, true);
7142 Inst
.eraseFromParent();
7145 case AMDGPU::S_NOT_B64
:
7146 splitScalar64BitUnaryOp(Worklist
, Inst
, AMDGPU::S_NOT_B32
);
7147 Inst
.eraseFromParent();
7150 case AMDGPU::S_BCNT1_I32_B64
:
7151 splitScalar64BitBCNT(Worklist
, Inst
);
7152 Inst
.eraseFromParent();
7155 case AMDGPU::S_BFE_I64
:
7156 splitScalar64BitBFE(Worklist
, Inst
);
7157 Inst
.eraseFromParent();
7160 case AMDGPU::S_FLBIT_I32_B64
:
7161 splitScalar64BitCountOp(Worklist
, Inst
, AMDGPU::V_FFBH_U32_e32
);
7162 Inst
.eraseFromParent();
7164 case AMDGPU::S_FF1_I32_B64
:
7165 splitScalar64BitCountOp(Worklist
, Inst
, AMDGPU::V_FFBL_B32_e32
);
7166 Inst
.eraseFromParent();
7169 case AMDGPU::S_LSHL_B32
:
7170 if (ST
.hasOnlyRevVALUShifts()) {
7171 NewOpcode
= AMDGPU::V_LSHLREV_B32_e64
;
7175 case AMDGPU::S_ASHR_I32
:
7176 if (ST
.hasOnlyRevVALUShifts()) {
7177 NewOpcode
= AMDGPU::V_ASHRREV_I32_e64
;
7181 case AMDGPU::S_LSHR_B32
:
7182 if (ST
.hasOnlyRevVALUShifts()) {
7183 NewOpcode
= AMDGPU::V_LSHRREV_B32_e64
;
7187 case AMDGPU::S_LSHL_B64
:
7188 if (ST
.hasOnlyRevVALUShifts()) {
7189 NewOpcode
= ST
.getGeneration() >= AMDGPUSubtarget::GFX12
7190 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7191 : AMDGPU::V_LSHLREV_B64_e64
;
7195 case AMDGPU::S_ASHR_I64
:
7196 if (ST
.hasOnlyRevVALUShifts()) {
7197 NewOpcode
= AMDGPU::V_ASHRREV_I64_e64
;
7201 case AMDGPU::S_LSHR_B64
:
7202 if (ST
.hasOnlyRevVALUShifts()) {
7203 NewOpcode
= AMDGPU::V_LSHRREV_B64_e64
;
7208 case AMDGPU::S_ABS_I32
:
7209 lowerScalarAbs(Worklist
, Inst
);
7210 Inst
.eraseFromParent();
7213 case AMDGPU::S_CBRANCH_SCC0
:
7214 case AMDGPU::S_CBRANCH_SCC1
: {
7215 // Clear unused bits of vcc
7216 Register CondReg
= Inst
.getOperand(1).getReg();
7217 bool IsSCC
= CondReg
== AMDGPU::SCC
;
7218 Register VCC
= RI
.getVCC();
7219 Register EXEC
= ST
.isWave32() ? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
7220 unsigned Opc
= ST
.isWave32() ? AMDGPU::S_AND_B32
: AMDGPU::S_AND_B64
;
7221 BuildMI(*MBB
, Inst
, Inst
.getDebugLoc(), get(Opc
), VCC
)
7223 .addReg(IsSCC
? VCC
: CondReg
);
7224 Inst
.removeOperand(1);
7227 case AMDGPU::S_BFE_U64
:
7228 case AMDGPU::S_BFM_B64
:
7229 llvm_unreachable("Moving this op to VALU not implemented");
7231 case AMDGPU::S_PACK_LL_B32_B16
:
7232 case AMDGPU::S_PACK_LH_B32_B16
:
7233 case AMDGPU::S_PACK_HL_B32_B16
:
7234 case AMDGPU::S_PACK_HH_B32_B16
:
7235 movePackToVALU(Worklist
, MRI
, Inst
);
7236 Inst
.eraseFromParent();
7239 case AMDGPU::S_XNOR_B32
:
7240 lowerScalarXnor(Worklist
, Inst
);
7241 Inst
.eraseFromParent();
7244 case AMDGPU::S_NAND_B32
:
7245 splitScalarNotBinop(Worklist
, Inst
, AMDGPU::S_AND_B32
);
7246 Inst
.eraseFromParent();
7249 case AMDGPU::S_NOR_B32
:
7250 splitScalarNotBinop(Worklist
, Inst
, AMDGPU::S_OR_B32
);
7251 Inst
.eraseFromParent();
7254 case AMDGPU::S_ANDN2_B32
:
7255 splitScalarBinOpN2(Worklist
, Inst
, AMDGPU::S_AND_B32
);
7256 Inst
.eraseFromParent();
7259 case AMDGPU::S_ORN2_B32
:
7260 splitScalarBinOpN2(Worklist
, Inst
, AMDGPU::S_OR_B32
);
7261 Inst
.eraseFromParent();
7264 // TODO: remove as soon as everything is ready
7265 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7266 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7267 // can only be selected from the uniform SDNode.
7268 case AMDGPU::S_ADD_CO_PSEUDO
:
7269 case AMDGPU::S_SUB_CO_PSEUDO
: {
7270 unsigned Opc
= (Inst
.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
)
7271 ? AMDGPU::V_ADDC_U32_e64
7272 : AMDGPU::V_SUBB_U32_e64
;
7273 const auto *CarryRC
= RI
.getWaveMaskRegClass();
7275 Register CarryInReg
= Inst
.getOperand(4).getReg();
7276 if (!MRI
.constrainRegClass(CarryInReg
, CarryRC
)) {
7277 Register NewCarryReg
= MRI
.createVirtualRegister(CarryRC
);
7278 BuildMI(*MBB
, Inst
, Inst
.getDebugLoc(), get(AMDGPU::COPY
), NewCarryReg
)
7279 .addReg(CarryInReg
);
7282 Register CarryOutReg
= Inst
.getOperand(1).getReg();
7284 Register DestReg
= MRI
.createVirtualRegister(RI
.getEquivalentVGPRClass(
7285 MRI
.getRegClass(Inst
.getOperand(0).getReg())));
7286 MachineInstr
*CarryOp
=
7287 BuildMI(*MBB
, &Inst
, Inst
.getDebugLoc(), get(Opc
), DestReg
)
7288 .addReg(CarryOutReg
, RegState::Define
)
7289 .add(Inst
.getOperand(2))
7290 .add(Inst
.getOperand(3))
7293 legalizeOperands(*CarryOp
);
7294 MRI
.replaceRegWith(Inst
.getOperand(0).getReg(), DestReg
);
7295 addUsersToMoveToVALUWorklist(DestReg
, MRI
, Worklist
);
7296 Inst
.eraseFromParent();
7299 case AMDGPU::S_UADDO_PSEUDO
:
7300 case AMDGPU::S_USUBO_PSEUDO
: {
7301 const DebugLoc
&DL
= Inst
.getDebugLoc();
7302 MachineOperand
&Dest0
= Inst
.getOperand(0);
7303 MachineOperand
&Dest1
= Inst
.getOperand(1);
7304 MachineOperand
&Src0
= Inst
.getOperand(2);
7305 MachineOperand
&Src1
= Inst
.getOperand(3);
7307 unsigned Opc
= (Inst
.getOpcode() == AMDGPU::S_UADDO_PSEUDO
)
7308 ? AMDGPU::V_ADD_CO_U32_e64
7309 : AMDGPU::V_SUB_CO_U32_e64
;
7310 const TargetRegisterClass
*NewRC
=
7311 RI
.getEquivalentVGPRClass(MRI
.getRegClass(Dest0
.getReg()));
7312 Register DestReg
= MRI
.createVirtualRegister(NewRC
);
7313 MachineInstr
*NewInstr
= BuildMI(*MBB
, &Inst
, DL
, get(Opc
), DestReg
)
7314 .addReg(Dest1
.getReg(), RegState::Define
)
7317 .addImm(0); // clamp bit
7319 legalizeOperands(*NewInstr
, MDT
);
7320 MRI
.replaceRegWith(Dest0
.getReg(), DestReg
);
7321 addUsersToMoveToVALUWorklist(NewInstr
->getOperand(0).getReg(), MRI
,
7323 Inst
.eraseFromParent();
7327 case AMDGPU::S_CSELECT_B32
:
7328 case AMDGPU::S_CSELECT_B64
:
7329 lowerSelect(Worklist
, Inst
, MDT
);
7330 Inst
.eraseFromParent();
7332 case AMDGPU::S_CMP_EQ_I32
:
7333 case AMDGPU::S_CMP_LG_I32
:
7334 case AMDGPU::S_CMP_GT_I32
:
7335 case AMDGPU::S_CMP_GE_I32
:
7336 case AMDGPU::S_CMP_LT_I32
:
7337 case AMDGPU::S_CMP_LE_I32
:
7338 case AMDGPU::S_CMP_EQ_U32
:
7339 case AMDGPU::S_CMP_LG_U32
:
7340 case AMDGPU::S_CMP_GT_U32
:
7341 case AMDGPU::S_CMP_GE_U32
:
7342 case AMDGPU::S_CMP_LT_U32
:
7343 case AMDGPU::S_CMP_LE_U32
:
7344 case AMDGPU::S_CMP_EQ_U64
:
7345 case AMDGPU::S_CMP_LG_U64
:
7346 case AMDGPU::S_CMP_LT_F32
:
7347 case AMDGPU::S_CMP_EQ_F32
:
7348 case AMDGPU::S_CMP_LE_F32
:
7349 case AMDGPU::S_CMP_GT_F32
:
7350 case AMDGPU::S_CMP_LG_F32
:
7351 case AMDGPU::S_CMP_GE_F32
:
7352 case AMDGPU::S_CMP_O_F32
:
7353 case AMDGPU::S_CMP_U_F32
:
7354 case AMDGPU::S_CMP_NGE_F32
:
7355 case AMDGPU::S_CMP_NLG_F32
:
7356 case AMDGPU::S_CMP_NGT_F32
:
7357 case AMDGPU::S_CMP_NLE_F32
:
7358 case AMDGPU::S_CMP_NEQ_F32
:
7359 case AMDGPU::S_CMP_NLT_F32
: {
7360 Register CondReg
= MRI
.createVirtualRegister(RI
.getWaveMaskRegClass());
7362 BuildMI(*MBB
, Inst
, Inst
.getDebugLoc(), get(NewOpcode
), CondReg
)
7363 .setMIFlags(Inst
.getFlags());
7364 if (AMDGPU::getNamedOperandIdx(NewOpcode
, AMDGPU::OpName::src0_modifiers
) >=
7367 .addImm(0) // src0_modifiers
7368 .add(Inst
.getOperand(0)) // src0
7369 .addImm(0) // src1_modifiers
7370 .add(Inst
.getOperand(1)) // src1
7371 .addImm(0); // clamp
7373 NewInstr
.add(Inst
.getOperand(0)).add(Inst
.getOperand(1));
7375 legalizeOperands(*NewInstr
, MDT
);
7376 int SCCIdx
= Inst
.findRegisterDefOperandIdx(AMDGPU::SCC
, /*TRI=*/nullptr);
7377 MachineOperand SCCOp
= Inst
.getOperand(SCCIdx
);
7378 addSCCDefUsersToVALUWorklist(SCCOp
, Inst
, Worklist
, CondReg
);
7379 Inst
.eraseFromParent();
7382 case AMDGPU::S_CMP_LT_F16
:
7383 case AMDGPU::S_CMP_EQ_F16
:
7384 case AMDGPU::S_CMP_LE_F16
:
7385 case AMDGPU::S_CMP_GT_F16
:
7386 case AMDGPU::S_CMP_LG_F16
:
7387 case AMDGPU::S_CMP_GE_F16
:
7388 case AMDGPU::S_CMP_O_F16
:
7389 case AMDGPU::S_CMP_U_F16
:
7390 case AMDGPU::S_CMP_NGE_F16
:
7391 case AMDGPU::S_CMP_NLG_F16
:
7392 case AMDGPU::S_CMP_NGT_F16
:
7393 case AMDGPU::S_CMP_NLE_F16
:
7394 case AMDGPU::S_CMP_NEQ_F16
:
7395 case AMDGPU::S_CMP_NLT_F16
: {
7396 Register CondReg
= MRI
.createVirtualRegister(RI
.getWaveMaskRegClass());
7398 BuildMI(*MBB
, Inst
, Inst
.getDebugLoc(), get(NewOpcode
), CondReg
)
7399 .setMIFlags(Inst
.getFlags());
7400 if (AMDGPU::hasNamedOperand(NewOpcode
, AMDGPU::OpName::src0_modifiers
)) {
7402 .addImm(0) // src0_modifiers
7403 .add(Inst
.getOperand(0)) // src0
7404 .addImm(0) // src1_modifiers
7405 .add(Inst
.getOperand(1)) // src1
7406 .addImm(0); // clamp
7407 if (AMDGPU::hasNamedOperand(NewOpcode
, AMDGPU::OpName::op_sel
))
7408 NewInstr
.addImm(0); // op_sel0
7411 .add(Inst
.getOperand(0))
7412 .add(Inst
.getOperand(1));
7414 legalizeOperands(*NewInstr
, MDT
);
7415 int SCCIdx
= Inst
.findRegisterDefOperandIdx(AMDGPU::SCC
, /*TRI=*/nullptr);
7416 MachineOperand SCCOp
= Inst
.getOperand(SCCIdx
);
7417 addSCCDefUsersToVALUWorklist(SCCOp
, Inst
, Worklist
, CondReg
);
7418 Inst
.eraseFromParent();
7421 case AMDGPU::S_CVT_HI_F32_F16
: {
7422 const DebugLoc
&DL
= Inst
.getDebugLoc();
7423 Register TmpReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
7424 Register NewDst
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
7425 if (ST
.useRealTrue16Insts()) {
7426 BuildMI(*MBB
, Inst
, DL
, get(AMDGPU::COPY
), TmpReg
)
7427 .add(Inst
.getOperand(1));
7428 BuildMI(*MBB
, Inst
, DL
, get(NewOpcode
), NewDst
)
7429 .addImm(0) // src0_modifiers
7430 .addReg(TmpReg
, 0, AMDGPU::hi16
)
7433 .addImm(0); // op_sel0
7435 BuildMI(*MBB
, Inst
, DL
, get(AMDGPU::V_LSHRREV_B32_e64
), TmpReg
)
7437 .add(Inst
.getOperand(1));
7438 BuildMI(*MBB
, Inst
, DL
, get(NewOpcode
), NewDst
)
7439 .addImm(0) // src0_modifiers
7445 MRI
.replaceRegWith(Inst
.getOperand(0).getReg(), NewDst
);
7446 addUsersToMoveToVALUWorklist(NewDst
, MRI
, Worklist
);
7447 Inst
.eraseFromParent();
7450 case AMDGPU::S_MINIMUM_F32
:
7451 case AMDGPU::S_MAXIMUM_F32
:
7452 case AMDGPU::S_MINIMUM_F16
:
7453 case AMDGPU::S_MAXIMUM_F16
: {
7454 const DebugLoc
&DL
= Inst
.getDebugLoc();
7455 Register NewDst
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
7456 MachineInstr
*NewInstr
= BuildMI(*MBB
, Inst
, DL
, get(NewOpcode
), NewDst
)
7457 .addImm(0) // src0_modifiers
7458 .add(Inst
.getOperand(1))
7459 .addImm(0) // src1_modifiers
7460 .add(Inst
.getOperand(2))
7463 MRI
.replaceRegWith(Inst
.getOperand(0).getReg(), NewDst
);
7465 legalizeOperands(*NewInstr
, MDT
);
7466 addUsersToMoveToVALUWorklist(NewDst
, MRI
, Worklist
);
7467 Inst
.eraseFromParent();
7472 if (NewOpcode
== AMDGPU::INSTRUCTION_LIST_END
) {
7473 // We cannot move this instruction to the VALU, so we should try to
7474 // legalize its operands instead.
7475 legalizeOperands(Inst
, MDT
);
7478 // Handle converting generic instructions like COPY-to-SGPR into
7480 if (NewOpcode
== Opcode
) {
7481 Register DstReg
= Inst
.getOperand(0).getReg();
7482 const TargetRegisterClass
*NewDstRC
= getDestEquivalentVGPRClass(Inst
);
7484 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7485 // hope for the best.
7486 if (Inst
.isCopy() && DstReg
.isPhysical() &&
7487 RI
.isVGPR(MRI
, Inst
.getOperand(1).getReg())) {
7488 // TODO: Only works for 32 bit registers.
7489 BuildMI(*Inst
.getParent(), &Inst
, Inst
.getDebugLoc(),
7490 get(AMDGPU::V_READFIRSTLANE_B32
), Inst
.getOperand(0).getReg())
7491 .add(Inst
.getOperand(1));
7492 Inst
.eraseFromParent();
7496 if (Inst
.isCopy() && Inst
.getOperand(1).getReg().isVirtual() &&
7497 NewDstRC
== RI
.getRegClassForReg(MRI
, Inst
.getOperand(1).getReg())) {
7498 // Instead of creating a copy where src and dst are the same register
7499 // class, we just replace all uses of dst with src. These kinds of
7500 // copies interfere with the heuristics MachineSink uses to decide
7501 // whether or not to split a critical edge. Since the pass assumes
7502 // that copies will end up as machine instructions and not be
7504 addUsersToMoveToVALUWorklist(DstReg
, MRI
, Worklist
);
7505 MRI
.replaceRegWith(DstReg
, Inst
.getOperand(1).getReg());
7506 MRI
.clearKillFlags(Inst
.getOperand(1).getReg());
7507 Inst
.getOperand(0).setReg(DstReg
);
7508 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7509 // these are deleted later, but at -O0 it would leave a suspicious
7510 // looking illegal copy of an undef register.
7511 for (unsigned I
= Inst
.getNumOperands() - 1; I
!= 0; --I
)
7512 Inst
.removeOperand(I
);
7513 Inst
.setDesc(get(AMDGPU::IMPLICIT_DEF
));
7516 Register NewDstReg
= MRI
.createVirtualRegister(NewDstRC
);
7517 MRI
.replaceRegWith(DstReg
, NewDstReg
);
7518 legalizeOperands(Inst
, MDT
);
7519 addUsersToMoveToVALUWorklist(NewDstReg
, MRI
, Worklist
);
7523 // Use the new VALU Opcode.
7524 auto NewInstr
= BuildMI(*MBB
, Inst
, Inst
.getDebugLoc(), get(NewOpcode
))
7525 .setMIFlags(Inst
.getFlags());
7526 if (isVOP3(NewOpcode
) && !isVOP3(Opcode
)) {
7527 // Intersperse VOP3 modifiers among the SALU operands.
7528 NewInstr
->addOperand(Inst
.getOperand(0));
7529 if (AMDGPU::getNamedOperandIdx(NewOpcode
,
7530 AMDGPU::OpName::src0_modifiers
) >= 0)
7532 if (AMDGPU::hasNamedOperand(NewOpcode
, AMDGPU::OpName::src0
)) {
7533 MachineOperand Src
= Inst
.getOperand(1);
7534 if (AMDGPU::isTrue16Inst(NewOpcode
) && ST
.useRealTrue16Insts() &&
7535 Src
.isReg() && RI
.isVGPR(MRI
, Src
.getReg()))
7536 NewInstr
.addReg(Src
.getReg(), 0, AMDGPU::lo16
);
7538 NewInstr
->addOperand(Src
);
7541 if (Opcode
== AMDGPU::S_SEXT_I32_I8
|| Opcode
== AMDGPU::S_SEXT_I32_I16
) {
7542 // We are converting these to a BFE, so we need to add the missing
7543 // operands for the size and offset.
7544 unsigned Size
= (Opcode
== AMDGPU::S_SEXT_I32_I8
) ? 8 : 16;
7546 NewInstr
.addImm(Size
);
7547 } else if (Opcode
== AMDGPU::S_BCNT1_I32_B32
) {
7548 // The VALU version adds the second operand to the result, so insert an
7551 } else if (Opcode
== AMDGPU::S_BFE_I32
|| Opcode
== AMDGPU::S_BFE_U32
) {
7552 const MachineOperand
&OffsetWidthOp
= Inst
.getOperand(2);
7553 // If we need to move this to VGPRs, we need to unpack the second
7554 // operand back into the 2 separate ones for bit offset and width.
7555 assert(OffsetWidthOp
.isImm() &&
7556 "Scalar BFE is only implemented for constant width and offset");
7557 uint32_t Imm
= OffsetWidthOp
.getImm();
7559 uint32_t Offset
= Imm
& 0x3f; // Extract bits [5:0].
7560 uint32_t BitWidth
= (Imm
& 0x7f0000) >> 16; // Extract bits [22:16].
7561 NewInstr
.addImm(Offset
);
7562 NewInstr
.addImm(BitWidth
);
7564 if (AMDGPU::getNamedOperandIdx(NewOpcode
,
7565 AMDGPU::OpName::src1_modifiers
) >= 0)
7567 if (AMDGPU::getNamedOperandIdx(NewOpcode
, AMDGPU::OpName::src1
) >= 0)
7568 NewInstr
->addOperand(Inst
.getOperand(2));
7569 if (AMDGPU::getNamedOperandIdx(NewOpcode
,
7570 AMDGPU::OpName::src2_modifiers
) >= 0)
7572 if (AMDGPU::getNamedOperandIdx(NewOpcode
, AMDGPU::OpName::src2
) >= 0)
7573 NewInstr
->addOperand(Inst
.getOperand(3));
7574 if (AMDGPU::getNamedOperandIdx(NewOpcode
, AMDGPU::OpName::clamp
) >= 0)
7576 if (AMDGPU::getNamedOperandIdx(NewOpcode
, AMDGPU::OpName::omod
) >= 0)
7578 if (AMDGPU::getNamedOperandIdx(NewOpcode
, AMDGPU::OpName::op_sel
) >= 0)
7582 // Just copy the SALU operands.
7583 for (const MachineOperand
&Op
: Inst
.explicit_operands())
7584 NewInstr
->addOperand(Op
);
7587 // Remove any references to SCC. Vector instructions can't read from it, and
7588 // We're just about to add the implicit use / defs of VCC, and we don't want
7590 for (MachineOperand
&Op
: Inst
.implicit_operands()) {
7591 if (Op
.getReg() == AMDGPU::SCC
) {
7592 // Only propagate through live-def of SCC.
7593 if (Op
.isDef() && !Op
.isDead())
7594 addSCCDefUsersToVALUWorklist(Op
, Inst
, Worklist
);
7596 addSCCDefsToVALUWorklist(NewInstr
, Worklist
);
7599 Inst
.eraseFromParent();
7601 if (NewInstr
->getOperand(0).isReg() && NewInstr
->getOperand(0).isDef()) {
7602 Register DstReg
= NewInstr
->getOperand(0).getReg();
7603 assert(DstReg
.isVirtual());
7604 // Update the destination register class.
7605 const TargetRegisterClass
*NewDstRC
= getDestEquivalentVGPRClass(*NewInstr
);
7607 NewDstReg
= MRI
.createVirtualRegister(NewDstRC
);
7608 MRI
.replaceRegWith(DstReg
, NewDstReg
);
7610 fixImplicitOperands(*NewInstr
);
7611 // Legalize the operands
7612 legalizeOperands(*NewInstr
, MDT
);
7614 addUsersToMoveToVALUWorklist(NewDstReg
, MRI
, Worklist
);
7617 // Add/sub require special handling to deal with carry outs.
7618 std::pair
<bool, MachineBasicBlock
*>
7619 SIInstrInfo::moveScalarAddSub(SIInstrWorklist
&Worklist
, MachineInstr
&Inst
,
7620 MachineDominatorTree
*MDT
) const {
7621 if (ST
.hasAddNoCarry()) {
7622 // Assume there is no user of scc since we don't select this in that case.
7623 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7626 MachineBasicBlock
&MBB
= *Inst
.getParent();
7627 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
7629 Register OldDstReg
= Inst
.getOperand(0).getReg();
7630 Register ResultReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
7632 unsigned Opc
= Inst
.getOpcode();
7633 assert(Opc
== AMDGPU::S_ADD_I32
|| Opc
== AMDGPU::S_SUB_I32
);
7635 unsigned NewOpc
= Opc
== AMDGPU::S_ADD_I32
?
7636 AMDGPU::V_ADD_U32_e64
: AMDGPU::V_SUB_U32_e64
;
7638 assert(Inst
.getOperand(3).getReg() == AMDGPU::SCC
);
7639 Inst
.removeOperand(3);
7641 Inst
.setDesc(get(NewOpc
));
7642 Inst
.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7643 Inst
.addImplicitDefUseOperands(*MBB
.getParent());
7644 MRI
.replaceRegWith(OldDstReg
, ResultReg
);
7645 MachineBasicBlock
*NewBB
= legalizeOperands(Inst
, MDT
);
7647 addUsersToMoveToVALUWorklist(ResultReg
, MRI
, Worklist
);
7648 return std::pair(true, NewBB
);
7651 return std::pair(false, nullptr);
7654 void SIInstrInfo::lowerSelect(SIInstrWorklist
&Worklist
, MachineInstr
&Inst
,
7655 MachineDominatorTree
*MDT
) const {
7657 MachineBasicBlock
&MBB
= *Inst
.getParent();
7658 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
7659 MachineBasicBlock::iterator MII
= Inst
;
7660 DebugLoc DL
= Inst
.getDebugLoc();
7662 MachineOperand
&Dest
= Inst
.getOperand(0);
7663 MachineOperand
&Src0
= Inst
.getOperand(1);
7664 MachineOperand
&Src1
= Inst
.getOperand(2);
7665 MachineOperand
&Cond
= Inst
.getOperand(3);
7667 Register CondReg
= Cond
.getReg();
7668 bool IsSCC
= (CondReg
== AMDGPU::SCC
);
7670 // If this is a trivial select where the condition is effectively not SCC
7671 // (CondReg is a source of copy to SCC), then the select is semantically
7672 // equivalent to copying CondReg. Hence, there is no need to create
7673 // V_CNDMASK, we can just use that and bail out.
7674 if (!IsSCC
&& Src0
.isImm() && (Src0
.getImm() == -1) && Src1
.isImm() &&
7675 (Src1
.getImm() == 0)) {
7676 MRI
.replaceRegWith(Dest
.getReg(), CondReg
);
7680 Register NewCondReg
= CondReg
;
7682 const TargetRegisterClass
*TC
= RI
.getWaveMaskRegClass();
7683 NewCondReg
= MRI
.createVirtualRegister(TC
);
7685 // Now look for the closest SCC def if it is a copy
7686 // replacing the CondReg with the COPY source register
7687 bool CopyFound
= false;
7688 for (MachineInstr
&CandI
:
7689 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst
)),
7690 Inst
.getParent()->rend())) {
7691 if (CandI
.findRegisterDefOperandIdx(AMDGPU::SCC
, &RI
, false, false) !=
7693 if (CandI
.isCopy() && CandI
.getOperand(0).getReg() == AMDGPU::SCC
) {
7694 BuildMI(MBB
, MII
, DL
, get(AMDGPU::COPY
), NewCondReg
)
7695 .addReg(CandI
.getOperand(1).getReg());
7702 // SCC def is not a copy
7703 // Insert a trivial select instead of creating a copy, because a copy from
7704 // SCC would semantically mean just copying a single bit, but we may need
7705 // the result to be a vector condition mask that needs preserving.
7707 ST
.isWave64() ? AMDGPU::S_CSELECT_B64
: AMDGPU::S_CSELECT_B32
;
7709 BuildMI(MBB
, MII
, DL
, get(Opcode
), NewCondReg
).addImm(-1).addImm(0);
7710 NewSelect
->getOperand(3).setIsUndef(Cond
.isUndef());
7714 Register NewDestReg
= MRI
.createVirtualRegister(
7715 RI
.getEquivalentVGPRClass(MRI
.getRegClass(Dest
.getReg())));
7716 MachineInstr
*NewInst
;
7717 if (Inst
.getOpcode() == AMDGPU::S_CSELECT_B32
) {
7718 NewInst
= BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_CNDMASK_B32_e64
), NewDestReg
)
7723 .addReg(NewCondReg
);
7726 BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_CNDMASK_B64_PSEUDO
), NewDestReg
)
7729 .addReg(NewCondReg
);
7731 MRI
.replaceRegWith(Dest
.getReg(), NewDestReg
);
7732 legalizeOperands(*NewInst
, MDT
);
7733 addUsersToMoveToVALUWorklist(NewDestReg
, MRI
, Worklist
);
7736 void SIInstrInfo::lowerScalarAbs(SIInstrWorklist
&Worklist
,
7737 MachineInstr
&Inst
) const {
7738 MachineBasicBlock
&MBB
= *Inst
.getParent();
7739 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
7740 MachineBasicBlock::iterator MII
= Inst
;
7741 DebugLoc DL
= Inst
.getDebugLoc();
7743 MachineOperand
&Dest
= Inst
.getOperand(0);
7744 MachineOperand
&Src
= Inst
.getOperand(1);
7745 Register TmpReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
7746 Register ResultReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
7748 unsigned SubOp
= ST
.hasAddNoCarry() ?
7749 AMDGPU::V_SUB_U32_e32
: AMDGPU::V_SUB_CO_U32_e32
;
7751 BuildMI(MBB
, MII
, DL
, get(SubOp
), TmpReg
)
7753 .addReg(Src
.getReg());
7755 BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_MAX_I32_e64
), ResultReg
)
7756 .addReg(Src
.getReg())
7759 MRI
.replaceRegWith(Dest
.getReg(), ResultReg
);
7760 addUsersToMoveToVALUWorklist(ResultReg
, MRI
, Worklist
);
7763 void SIInstrInfo::lowerScalarXnor(SIInstrWorklist
&Worklist
,
7764 MachineInstr
&Inst
) const {
7765 MachineBasicBlock
&MBB
= *Inst
.getParent();
7766 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
7767 MachineBasicBlock::iterator MII
= Inst
;
7768 const DebugLoc
&DL
= Inst
.getDebugLoc();
7770 MachineOperand
&Dest
= Inst
.getOperand(0);
7771 MachineOperand
&Src0
= Inst
.getOperand(1);
7772 MachineOperand
&Src1
= Inst
.getOperand(2);
7774 if (ST
.hasDLInsts()) {
7775 Register NewDest
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
7776 legalizeGenericOperand(MBB
, MII
, &AMDGPU::VGPR_32RegClass
, Src0
, MRI
, DL
);
7777 legalizeGenericOperand(MBB
, MII
, &AMDGPU::VGPR_32RegClass
, Src1
, MRI
, DL
);
7779 BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_XNOR_B32_e64
), NewDest
)
7783 MRI
.replaceRegWith(Dest
.getReg(), NewDest
);
7784 addUsersToMoveToVALUWorklist(NewDest
, MRI
, Worklist
);
7786 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7787 // invert either source and then perform the XOR. If either source is a
7788 // scalar register, then we can leave the inversion on the scalar unit to
7789 // achieve a better distribution of scalar and vector instructions.
7790 bool Src0IsSGPR
= Src0
.isReg() &&
7791 RI
.isSGPRClass(MRI
.getRegClass(Src0
.getReg()));
7792 bool Src1IsSGPR
= Src1
.isReg() &&
7793 RI
.isSGPRClass(MRI
.getRegClass(Src1
.getReg()));
7795 Register Temp
= MRI
.createVirtualRegister(&AMDGPU::SReg_32RegClass
);
7796 Register NewDest
= MRI
.createVirtualRegister(&AMDGPU::SReg_32RegClass
);
7798 // Build a pair of scalar instructions and add them to the work list.
7799 // The next iteration over the work list will lower these to the vector
7800 // unit as necessary.
7802 BuildMI(MBB
, MII
, DL
, get(AMDGPU::S_NOT_B32
), Temp
).add(Src0
);
7803 Xor
= BuildMI(MBB
, MII
, DL
, get(AMDGPU::S_XOR_B32
), NewDest
)
7806 } else if (Src1IsSGPR
) {
7807 BuildMI(MBB
, MII
, DL
, get(AMDGPU::S_NOT_B32
), Temp
).add(Src1
);
7808 Xor
= BuildMI(MBB
, MII
, DL
, get(AMDGPU::S_XOR_B32
), NewDest
)
7812 Xor
= BuildMI(MBB
, MII
, DL
, get(AMDGPU::S_XOR_B32
), Temp
)
7816 BuildMI(MBB
, MII
, DL
, get(AMDGPU::S_NOT_B32
), NewDest
).addReg(Temp
);
7817 Worklist
.insert(Not
);
7820 MRI
.replaceRegWith(Dest
.getReg(), NewDest
);
7822 Worklist
.insert(Xor
);
7824 addUsersToMoveToVALUWorklist(NewDest
, MRI
, Worklist
);
7828 void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist
&Worklist
,
7830 unsigned Opcode
) const {
7831 MachineBasicBlock
&MBB
= *Inst
.getParent();
7832 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
7833 MachineBasicBlock::iterator MII
= Inst
;
7834 const DebugLoc
&DL
= Inst
.getDebugLoc();
7836 MachineOperand
&Dest
= Inst
.getOperand(0);
7837 MachineOperand
&Src0
= Inst
.getOperand(1);
7838 MachineOperand
&Src1
= Inst
.getOperand(2);
7840 Register NewDest
= MRI
.createVirtualRegister(&AMDGPU::SReg_32RegClass
);
7841 Register Interm
= MRI
.createVirtualRegister(&AMDGPU::SReg_32RegClass
);
7843 MachineInstr
&Op
= *BuildMI(MBB
, MII
, DL
, get(Opcode
), Interm
)
7847 MachineInstr
&Not
= *BuildMI(MBB
, MII
, DL
, get(AMDGPU::S_NOT_B32
), NewDest
)
7850 Worklist
.insert(&Op
);
7851 Worklist
.insert(&Not
);
7853 MRI
.replaceRegWith(Dest
.getReg(), NewDest
);
7854 addUsersToMoveToVALUWorklist(NewDest
, MRI
, Worklist
);
7857 void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist
&Worklist
,
7859 unsigned Opcode
) const {
7860 MachineBasicBlock
&MBB
= *Inst
.getParent();
7861 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
7862 MachineBasicBlock::iterator MII
= Inst
;
7863 const DebugLoc
&DL
= Inst
.getDebugLoc();
7865 MachineOperand
&Dest
= Inst
.getOperand(0);
7866 MachineOperand
&Src0
= Inst
.getOperand(1);
7867 MachineOperand
&Src1
= Inst
.getOperand(2);
7869 Register NewDest
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
7870 Register Interm
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
7872 MachineInstr
&Not
= *BuildMI(MBB
, MII
, DL
, get(AMDGPU::S_NOT_B32
), Interm
)
7875 MachineInstr
&Op
= *BuildMI(MBB
, MII
, DL
, get(Opcode
), NewDest
)
7879 Worklist
.insert(&Not
);
7880 Worklist
.insert(&Op
);
7882 MRI
.replaceRegWith(Dest
.getReg(), NewDest
);
7883 addUsersToMoveToVALUWorklist(NewDest
, MRI
, Worklist
);
7886 void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist
&Worklist
,
7887 MachineInstr
&Inst
, unsigned Opcode
,
7889 MachineBasicBlock
&MBB
= *Inst
.getParent();
7890 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
7892 MachineOperand
&Dest
= Inst
.getOperand(0);
7893 MachineOperand
&Src0
= Inst
.getOperand(1);
7894 DebugLoc DL
= Inst
.getDebugLoc();
7896 MachineBasicBlock::iterator MII
= Inst
;
7898 const MCInstrDesc
&InstDesc
= get(Opcode
);
7899 const TargetRegisterClass
*Src0RC
= Src0
.isReg() ?
7900 MRI
.getRegClass(Src0
.getReg()) :
7901 &AMDGPU::SGPR_32RegClass
;
7903 const TargetRegisterClass
*Src0SubRC
=
7904 RI
.getSubRegisterClass(Src0RC
, AMDGPU::sub0
);
7906 MachineOperand SrcReg0Sub0
= buildExtractSubRegOrImm(MII
, MRI
, Src0
, Src0RC
,
7907 AMDGPU::sub0
, Src0SubRC
);
7909 const TargetRegisterClass
*DestRC
= MRI
.getRegClass(Dest
.getReg());
7910 const TargetRegisterClass
*NewDestRC
= RI
.getEquivalentVGPRClass(DestRC
);
7911 const TargetRegisterClass
*NewDestSubRC
=
7912 RI
.getSubRegisterClass(NewDestRC
, AMDGPU::sub0
);
7914 Register DestSub0
= MRI
.createVirtualRegister(NewDestSubRC
);
7915 MachineInstr
&LoHalf
= *BuildMI(MBB
, MII
, DL
, InstDesc
, DestSub0
).add(SrcReg0Sub0
);
7917 MachineOperand SrcReg0Sub1
= buildExtractSubRegOrImm(MII
, MRI
, Src0
, Src0RC
,
7918 AMDGPU::sub1
, Src0SubRC
);
7920 Register DestSub1
= MRI
.createVirtualRegister(NewDestSubRC
);
7921 MachineInstr
&HiHalf
= *BuildMI(MBB
, MII
, DL
, InstDesc
, DestSub1
).add(SrcReg0Sub1
);
7924 std::swap(DestSub0
, DestSub1
);
7926 Register FullDestReg
= MRI
.createVirtualRegister(NewDestRC
);
7927 BuildMI(MBB
, MII
, DL
, get(TargetOpcode::REG_SEQUENCE
), FullDestReg
)
7929 .addImm(AMDGPU::sub0
)
7931 .addImm(AMDGPU::sub1
);
7933 MRI
.replaceRegWith(Dest
.getReg(), FullDestReg
);
7935 Worklist
.insert(&LoHalf
);
7936 Worklist
.insert(&HiHalf
);
7938 // We don't need to legalizeOperands here because for a single operand, src0
7939 // will support any kind of input.
7941 // Move all users of this moved value.
7942 addUsersToMoveToVALUWorklist(FullDestReg
, MRI
, Worklist
);
7945 // There is not a vector equivalent of s_mul_u64. For this reason, we need to
7946 // split the s_mul_u64 in 32-bit vector multiplications.
7947 void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist
&Worklist
,
7949 MachineDominatorTree
*MDT
) const {
7950 MachineBasicBlock
&MBB
= *Inst
.getParent();
7951 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
7953 Register FullDestReg
= MRI
.createVirtualRegister(&AMDGPU::VReg_64RegClass
);
7954 Register DestSub0
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
7955 Register DestSub1
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
7957 MachineOperand
&Dest
= Inst
.getOperand(0);
7958 MachineOperand
&Src0
= Inst
.getOperand(1);
7959 MachineOperand
&Src1
= Inst
.getOperand(2);
7960 const DebugLoc
&DL
= Inst
.getDebugLoc();
7961 MachineBasicBlock::iterator MII
= Inst
;
7963 const TargetRegisterClass
*Src0RC
= MRI
.getRegClass(Src0
.getReg());
7964 const TargetRegisterClass
*Src1RC
= MRI
.getRegClass(Src1
.getReg());
7965 const TargetRegisterClass
*Src0SubRC
=
7966 RI
.getSubRegisterClass(Src0RC
, AMDGPU::sub0
);
7967 if (RI
.isSGPRClass(Src0SubRC
))
7968 Src0SubRC
= RI
.getEquivalentVGPRClass(Src0SubRC
);
7969 const TargetRegisterClass
*Src1SubRC
=
7970 RI
.getSubRegisterClass(Src1RC
, AMDGPU::sub0
);
7971 if (RI
.isSGPRClass(Src1SubRC
))
7972 Src1SubRC
= RI
.getEquivalentVGPRClass(Src1SubRC
);
7974 // First, we extract the low 32-bit and high 32-bit values from each of the
7976 MachineOperand Op0L
=
7977 buildExtractSubRegOrImm(MII
, MRI
, Src0
, Src0RC
, AMDGPU::sub0
, Src0SubRC
);
7978 MachineOperand Op1L
=
7979 buildExtractSubRegOrImm(MII
, MRI
, Src1
, Src1RC
, AMDGPU::sub0
, Src1SubRC
);
7980 MachineOperand Op0H
=
7981 buildExtractSubRegOrImm(MII
, MRI
, Src0
, Src0RC
, AMDGPU::sub1
, Src0SubRC
);
7982 MachineOperand Op1H
=
7983 buildExtractSubRegOrImm(MII
, MRI
, Src1
, Src1RC
, AMDGPU::sub1
, Src1SubRC
);
7985 // The multilication is done as follows:
7989 // --------------------
7990 // Op1H*Op0L Op1L*Op0L
7991 // + Op1H*Op0H Op1L*Op0H
7992 // -----------------------------------------
7993 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
7995 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
7996 // value and that would overflow.
7997 // The low 32-bit value is Op1L*Op0L.
7998 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
8000 Register Op1L_Op0H_Reg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8001 MachineInstr
*Op1L_Op0H
=
8002 BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_MUL_LO_U32_e64
), Op1L_Op0H_Reg
)
8006 Register Op1H_Op0L_Reg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8007 MachineInstr
*Op1H_Op0L
=
8008 BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_MUL_LO_U32_e64
), Op1H_Op0L_Reg
)
8012 Register CarryReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8013 MachineInstr
*Carry
=
8014 BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_MUL_HI_U32_e64
), CarryReg
)
8018 MachineInstr
*LoHalf
=
8019 BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_MUL_LO_U32_e64
), DestSub0
)
8023 Register AddReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8024 MachineInstr
*Add
= BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_ADD_U32_e32
), AddReg
)
8025 .addReg(Op1L_Op0H_Reg
)
8026 .addReg(Op1H_Op0L_Reg
);
8028 MachineInstr
*HiHalf
=
8029 BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_ADD_U32_e32
), DestSub1
)
8033 BuildMI(MBB
, MII
, DL
, get(TargetOpcode::REG_SEQUENCE
), FullDestReg
)
8035 .addImm(AMDGPU::sub0
)
8037 .addImm(AMDGPU::sub1
);
8039 MRI
.replaceRegWith(Dest
.getReg(), FullDestReg
);
8041 // Try to legalize the operands in case we need to swap the order to keep it
8043 legalizeOperands(*Op1L_Op0H
, MDT
);
8044 legalizeOperands(*Op1H_Op0L
, MDT
);
8045 legalizeOperands(*Carry
, MDT
);
8046 legalizeOperands(*LoHalf
, MDT
);
8047 legalizeOperands(*Add
, MDT
);
8048 legalizeOperands(*HiHalf
, MDT
);
8050 // Move all users of this moved value.
8051 addUsersToMoveToVALUWorklist(FullDestReg
, MRI
, Worklist
);
8054 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
8056 void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist
&Worklist
,
8058 MachineDominatorTree
*MDT
) const {
8059 MachineBasicBlock
&MBB
= *Inst
.getParent();
8060 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
8062 Register FullDestReg
= MRI
.createVirtualRegister(&AMDGPU::VReg_64RegClass
);
8063 Register DestSub0
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8064 Register DestSub1
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8066 MachineOperand
&Dest
= Inst
.getOperand(0);
8067 MachineOperand
&Src0
= Inst
.getOperand(1);
8068 MachineOperand
&Src1
= Inst
.getOperand(2);
8069 const DebugLoc
&DL
= Inst
.getDebugLoc();
8070 MachineBasicBlock::iterator MII
= Inst
;
8072 const TargetRegisterClass
*Src0RC
= MRI
.getRegClass(Src0
.getReg());
8073 const TargetRegisterClass
*Src1RC
= MRI
.getRegClass(Src1
.getReg());
8074 const TargetRegisterClass
*Src0SubRC
=
8075 RI
.getSubRegisterClass(Src0RC
, AMDGPU::sub0
);
8076 if (RI
.isSGPRClass(Src0SubRC
))
8077 Src0SubRC
= RI
.getEquivalentVGPRClass(Src0SubRC
);
8078 const TargetRegisterClass
*Src1SubRC
=
8079 RI
.getSubRegisterClass(Src1RC
, AMDGPU::sub0
);
8080 if (RI
.isSGPRClass(Src1SubRC
))
8081 Src1SubRC
= RI
.getEquivalentVGPRClass(Src1SubRC
);
8083 // First, we extract the low 32-bit and high 32-bit values from each of the
8085 MachineOperand Op0L
=
8086 buildExtractSubRegOrImm(MII
, MRI
, Src0
, Src0RC
, AMDGPU::sub0
, Src0SubRC
);
8087 MachineOperand Op1L
=
8088 buildExtractSubRegOrImm(MII
, MRI
, Src1
, Src1RC
, AMDGPU::sub0
, Src1SubRC
);
8090 unsigned Opc
= Inst
.getOpcode();
8091 unsigned NewOpc
= Opc
== AMDGPU::S_MUL_U64_U32_PSEUDO
8092 ? AMDGPU::V_MUL_HI_U32_e64
8093 : AMDGPU::V_MUL_HI_I32_e64
;
8094 MachineInstr
*HiHalf
=
8095 BuildMI(MBB
, MII
, DL
, get(NewOpc
), DestSub1
).add(Op1L
).add(Op0L
);
8097 MachineInstr
*LoHalf
=
8098 BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_MUL_LO_U32_e64
), DestSub0
)
8102 BuildMI(MBB
, MII
, DL
, get(TargetOpcode::REG_SEQUENCE
), FullDestReg
)
8104 .addImm(AMDGPU::sub0
)
8106 .addImm(AMDGPU::sub1
);
8108 MRI
.replaceRegWith(Dest
.getReg(), FullDestReg
);
8110 // Try to legalize the operands in case we need to swap the order to keep it
8112 legalizeOperands(*HiHalf
, MDT
);
8113 legalizeOperands(*LoHalf
, MDT
);
8115 // Move all users of this moved value.
8116 addUsersToMoveToVALUWorklist(FullDestReg
, MRI
, Worklist
);
8119 void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist
&Worklist
,
8120 MachineInstr
&Inst
, unsigned Opcode
,
8121 MachineDominatorTree
*MDT
) const {
8122 MachineBasicBlock
&MBB
= *Inst
.getParent();
8123 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
8125 MachineOperand
&Dest
= Inst
.getOperand(0);
8126 MachineOperand
&Src0
= Inst
.getOperand(1);
8127 MachineOperand
&Src1
= Inst
.getOperand(2);
8128 DebugLoc DL
= Inst
.getDebugLoc();
8130 MachineBasicBlock::iterator MII
= Inst
;
8132 const MCInstrDesc
&InstDesc
= get(Opcode
);
8133 const TargetRegisterClass
*Src0RC
= Src0
.isReg() ?
8134 MRI
.getRegClass(Src0
.getReg()) :
8135 &AMDGPU::SGPR_32RegClass
;
8137 const TargetRegisterClass
*Src0SubRC
=
8138 RI
.getSubRegisterClass(Src0RC
, AMDGPU::sub0
);
8139 const TargetRegisterClass
*Src1RC
= Src1
.isReg() ?
8140 MRI
.getRegClass(Src1
.getReg()) :
8141 &AMDGPU::SGPR_32RegClass
;
8143 const TargetRegisterClass
*Src1SubRC
=
8144 RI
.getSubRegisterClass(Src1RC
, AMDGPU::sub0
);
8146 MachineOperand SrcReg0Sub0
= buildExtractSubRegOrImm(MII
, MRI
, Src0
, Src0RC
,
8147 AMDGPU::sub0
, Src0SubRC
);
8148 MachineOperand SrcReg1Sub0
= buildExtractSubRegOrImm(MII
, MRI
, Src1
, Src1RC
,
8149 AMDGPU::sub0
, Src1SubRC
);
8150 MachineOperand SrcReg0Sub1
= buildExtractSubRegOrImm(MII
, MRI
, Src0
, Src0RC
,
8151 AMDGPU::sub1
, Src0SubRC
);
8152 MachineOperand SrcReg1Sub1
= buildExtractSubRegOrImm(MII
, MRI
, Src1
, Src1RC
,
8153 AMDGPU::sub1
, Src1SubRC
);
8155 const TargetRegisterClass
*DestRC
= MRI
.getRegClass(Dest
.getReg());
8156 const TargetRegisterClass
*NewDestRC
= RI
.getEquivalentVGPRClass(DestRC
);
8157 const TargetRegisterClass
*NewDestSubRC
=
8158 RI
.getSubRegisterClass(NewDestRC
, AMDGPU::sub0
);
8160 Register DestSub0
= MRI
.createVirtualRegister(NewDestSubRC
);
8161 MachineInstr
&LoHalf
= *BuildMI(MBB
, MII
, DL
, InstDesc
, DestSub0
)
8165 Register DestSub1
= MRI
.createVirtualRegister(NewDestSubRC
);
8166 MachineInstr
&HiHalf
= *BuildMI(MBB
, MII
, DL
, InstDesc
, DestSub1
)
8170 Register FullDestReg
= MRI
.createVirtualRegister(NewDestRC
);
8171 BuildMI(MBB
, MII
, DL
, get(TargetOpcode::REG_SEQUENCE
), FullDestReg
)
8173 .addImm(AMDGPU::sub0
)
8175 .addImm(AMDGPU::sub1
);
8177 MRI
.replaceRegWith(Dest
.getReg(), FullDestReg
);
8179 Worklist
.insert(&LoHalf
);
8180 Worklist
.insert(&HiHalf
);
8182 // Move all users of this moved value.
8183 addUsersToMoveToVALUWorklist(FullDestReg
, MRI
, Worklist
);
8186 void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist
&Worklist
,
8188 MachineDominatorTree
*MDT
) const {
8189 MachineBasicBlock
&MBB
= *Inst
.getParent();
8190 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
8192 MachineOperand
&Dest
= Inst
.getOperand(0);
8193 MachineOperand
&Src0
= Inst
.getOperand(1);
8194 MachineOperand
&Src1
= Inst
.getOperand(2);
8195 const DebugLoc
&DL
= Inst
.getDebugLoc();
8197 MachineBasicBlock::iterator MII
= Inst
;
8199 const TargetRegisterClass
*DestRC
= MRI
.getRegClass(Dest
.getReg());
8201 Register Interm
= MRI
.createVirtualRegister(&AMDGPU::SReg_64RegClass
);
8203 MachineOperand
* Op0
;
8204 MachineOperand
* Op1
;
8206 if (Src0
.isReg() && RI
.isSGPRReg(MRI
, Src0
.getReg())) {
8214 BuildMI(MBB
, MII
, DL
, get(AMDGPU::S_NOT_B64
), Interm
)
8217 Register NewDest
= MRI
.createVirtualRegister(DestRC
);
8219 MachineInstr
&Xor
= *BuildMI(MBB
, MII
, DL
, get(AMDGPU::S_XOR_B64
), NewDest
)
8223 MRI
.replaceRegWith(Dest
.getReg(), NewDest
);
8225 Worklist
.insert(&Xor
);
8228 void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist
&Worklist
,
8229 MachineInstr
&Inst
) const {
8230 MachineBasicBlock
&MBB
= *Inst
.getParent();
8231 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
8233 MachineBasicBlock::iterator MII
= Inst
;
8234 const DebugLoc
&DL
= Inst
.getDebugLoc();
8236 MachineOperand
&Dest
= Inst
.getOperand(0);
8237 MachineOperand
&Src
= Inst
.getOperand(1);
8239 const MCInstrDesc
&InstDesc
= get(AMDGPU::V_BCNT_U32_B32_e64
);
8240 const TargetRegisterClass
*SrcRC
= Src
.isReg() ?
8241 MRI
.getRegClass(Src
.getReg()) :
8242 &AMDGPU::SGPR_32RegClass
;
8244 Register MidReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8245 Register ResultReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8247 const TargetRegisterClass
*SrcSubRC
=
8248 RI
.getSubRegisterClass(SrcRC
, AMDGPU::sub0
);
8250 MachineOperand SrcRegSub0
= buildExtractSubRegOrImm(MII
, MRI
, Src
, SrcRC
,
8251 AMDGPU::sub0
, SrcSubRC
);
8252 MachineOperand SrcRegSub1
= buildExtractSubRegOrImm(MII
, MRI
, Src
, SrcRC
,
8253 AMDGPU::sub1
, SrcSubRC
);
8255 BuildMI(MBB
, MII
, DL
, InstDesc
, MidReg
).add(SrcRegSub0
).addImm(0);
8257 BuildMI(MBB
, MII
, DL
, InstDesc
, ResultReg
).add(SrcRegSub1
).addReg(MidReg
);
8259 MRI
.replaceRegWith(Dest
.getReg(), ResultReg
);
8261 // We don't need to legalize operands here. src0 for either instruction can be
8262 // an SGPR, and the second input is unused or determined here.
8263 addUsersToMoveToVALUWorklist(ResultReg
, MRI
, Worklist
);
8266 void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist
&Worklist
,
8267 MachineInstr
&Inst
) const {
8268 MachineBasicBlock
&MBB
= *Inst
.getParent();
8269 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
8270 MachineBasicBlock::iterator MII
= Inst
;
8271 const DebugLoc
&DL
= Inst
.getDebugLoc();
8273 MachineOperand
&Dest
= Inst
.getOperand(0);
8274 uint32_t Imm
= Inst
.getOperand(2).getImm();
8275 uint32_t Offset
= Imm
& 0x3f; // Extract bits [5:0].
8276 uint32_t BitWidth
= (Imm
& 0x7f0000) >> 16; // Extract bits [22:16].
8280 // Only sext_inreg cases handled.
8281 assert(Inst
.getOpcode() == AMDGPU::S_BFE_I64
&& BitWidth
<= 32 &&
8282 Offset
== 0 && "Not implemented");
8284 if (BitWidth
< 32) {
8285 Register MidRegLo
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8286 Register MidRegHi
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8287 Register ResultReg
= MRI
.createVirtualRegister(&AMDGPU::VReg_64RegClass
);
8289 BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_BFE_I32_e64
), MidRegLo
)
8290 .addReg(Inst
.getOperand(1).getReg(), 0, AMDGPU::sub0
)
8294 BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_ASHRREV_I32_e32
), MidRegHi
)
8298 BuildMI(MBB
, MII
, DL
, get(TargetOpcode::REG_SEQUENCE
), ResultReg
)
8300 .addImm(AMDGPU::sub0
)
8302 .addImm(AMDGPU::sub1
);
8304 MRI
.replaceRegWith(Dest
.getReg(), ResultReg
);
8305 addUsersToMoveToVALUWorklist(ResultReg
, MRI
, Worklist
);
8309 MachineOperand
&Src
= Inst
.getOperand(1);
8310 Register TmpReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8311 Register ResultReg
= MRI
.createVirtualRegister(&AMDGPU::VReg_64RegClass
);
8313 BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_ASHRREV_I32_e64
), TmpReg
)
8315 .addReg(Src
.getReg(), 0, AMDGPU::sub0
);
8317 BuildMI(MBB
, MII
, DL
, get(TargetOpcode::REG_SEQUENCE
), ResultReg
)
8318 .addReg(Src
.getReg(), 0, AMDGPU::sub0
)
8319 .addImm(AMDGPU::sub0
)
8321 .addImm(AMDGPU::sub1
);
8323 MRI
.replaceRegWith(Dest
.getReg(), ResultReg
);
8324 addUsersToMoveToVALUWorklist(ResultReg
, MRI
, Worklist
);
8327 void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist
&Worklist
,
8328 MachineInstr
&Inst
, unsigned Opcode
,
8329 MachineDominatorTree
*MDT
) const {
8330 // (S_FLBIT_I32_B64 hi:lo) ->
8331 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8332 // (S_FF1_I32_B64 hi:lo) ->
8333 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
8335 MachineBasicBlock
&MBB
= *Inst
.getParent();
8336 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
8337 MachineBasicBlock::iterator MII
= Inst
;
8338 const DebugLoc
&DL
= Inst
.getDebugLoc();
8340 MachineOperand
&Dest
= Inst
.getOperand(0);
8341 MachineOperand
&Src
= Inst
.getOperand(1);
8343 const MCInstrDesc
&InstDesc
= get(Opcode
);
8345 bool IsCtlz
= Opcode
== AMDGPU::V_FFBH_U32_e32
;
8346 unsigned OpcodeAdd
=
8347 ST
.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64
: AMDGPU::V_ADD_CO_U32_e32
;
8349 const TargetRegisterClass
*SrcRC
=
8350 Src
.isReg() ? MRI
.getRegClass(Src
.getReg()) : &AMDGPU::SGPR_32RegClass
;
8351 const TargetRegisterClass
*SrcSubRC
=
8352 RI
.getSubRegisterClass(SrcRC
, AMDGPU::sub0
);
8354 MachineOperand SrcRegSub0
=
8355 buildExtractSubRegOrImm(MII
, MRI
, Src
, SrcRC
, AMDGPU::sub0
, SrcSubRC
);
8356 MachineOperand SrcRegSub1
=
8357 buildExtractSubRegOrImm(MII
, MRI
, Src
, SrcRC
, AMDGPU::sub1
, SrcSubRC
);
8359 Register MidReg1
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8360 Register MidReg2
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8361 Register MidReg3
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8362 Register MidReg4
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8364 BuildMI(MBB
, MII
, DL
, InstDesc
, MidReg1
).add(SrcRegSub0
);
8366 BuildMI(MBB
, MII
, DL
, InstDesc
, MidReg2
).add(SrcRegSub1
);
8368 BuildMI(MBB
, MII
, DL
, get(OpcodeAdd
), MidReg3
)
8369 .addReg(IsCtlz
? MidReg1
: MidReg2
)
8371 .addImm(1); // enable clamp
8373 BuildMI(MBB
, MII
, DL
, get(AMDGPU::V_MIN_U32_e64
), MidReg4
)
8375 .addReg(IsCtlz
? MidReg2
: MidReg1
);
8377 MRI
.replaceRegWith(Dest
.getReg(), MidReg4
);
8379 addUsersToMoveToVALUWorklist(MidReg4
, MRI
, Worklist
);
8382 void SIInstrInfo::addUsersToMoveToVALUWorklist(
8383 Register DstReg
, MachineRegisterInfo
&MRI
,
8384 SIInstrWorklist
&Worklist
) const {
8385 for (MachineRegisterInfo::use_iterator I
= MRI
.use_begin(DstReg
),
8386 E
= MRI
.use_end(); I
!= E
;) {
8387 MachineInstr
&UseMI
= *I
->getParent();
8391 switch (UseMI
.getOpcode()) {
8394 case AMDGPU::SOFT_WQM
:
8395 case AMDGPU::STRICT_WWM
:
8396 case AMDGPU::STRICT_WQM
:
8397 case AMDGPU::REG_SEQUENCE
:
8399 case AMDGPU::INSERT_SUBREG
:
8402 OpNo
= I
.getOperandNo();
8406 if (!RI
.hasVectorRegisters(getOpRegClass(UseMI
, OpNo
))) {
8407 Worklist
.insert(&UseMI
);
8411 } while (I
!= E
&& I
->getParent() == &UseMI
);
8418 void SIInstrInfo::movePackToVALU(SIInstrWorklist
&Worklist
,
8419 MachineRegisterInfo
&MRI
,
8420 MachineInstr
&Inst
) const {
8421 Register ResultReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8422 MachineBasicBlock
*MBB
= Inst
.getParent();
8423 MachineOperand
&Src0
= Inst
.getOperand(1);
8424 MachineOperand
&Src1
= Inst
.getOperand(2);
8425 const DebugLoc
&DL
= Inst
.getDebugLoc();
8427 switch (Inst
.getOpcode()) {
8428 case AMDGPU::S_PACK_LL_B32_B16
: {
8429 Register ImmReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8430 Register TmpReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8432 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8434 BuildMI(*MBB
, Inst
, DL
, get(AMDGPU::V_MOV_B32_e32
), ImmReg
)
8437 BuildMI(*MBB
, Inst
, DL
, get(AMDGPU::V_AND_B32_e64
), TmpReg
)
8438 .addReg(ImmReg
, RegState::Kill
)
8441 BuildMI(*MBB
, Inst
, DL
, get(AMDGPU::V_LSHL_OR_B32_e64
), ResultReg
)
8444 .addReg(TmpReg
, RegState::Kill
);
8447 case AMDGPU::S_PACK_LH_B32_B16
: {
8448 Register ImmReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8449 BuildMI(*MBB
, Inst
, DL
, get(AMDGPU::V_MOV_B32_e32
), ImmReg
)
8451 BuildMI(*MBB
, Inst
, DL
, get(AMDGPU::V_BFI_B32_e64
), ResultReg
)
8452 .addReg(ImmReg
, RegState::Kill
)
8457 case AMDGPU::S_PACK_HL_B32_B16
: {
8458 Register TmpReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8459 BuildMI(*MBB
, Inst
, DL
, get(AMDGPU::V_LSHRREV_B32_e64
), TmpReg
)
8462 BuildMI(*MBB
, Inst
, DL
, get(AMDGPU::V_LSHL_OR_B32_e64
), ResultReg
)
8465 .addReg(TmpReg
, RegState::Kill
);
8468 case AMDGPU::S_PACK_HH_B32_B16
: {
8469 Register ImmReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8470 Register TmpReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
8471 BuildMI(*MBB
, Inst
, DL
, get(AMDGPU::V_LSHRREV_B32_e64
), TmpReg
)
8474 BuildMI(*MBB
, Inst
, DL
, get(AMDGPU::V_MOV_B32_e32
), ImmReg
)
8475 .addImm(0xffff0000);
8476 BuildMI(*MBB
, Inst
, DL
, get(AMDGPU::V_AND_OR_B32_e64
), ResultReg
)
8478 .addReg(ImmReg
, RegState::Kill
)
8479 .addReg(TmpReg
, RegState::Kill
);
8483 llvm_unreachable("unhandled s_pack_* instruction");
8486 MachineOperand
&Dest
= Inst
.getOperand(0);
8487 MRI
.replaceRegWith(Dest
.getReg(), ResultReg
);
8488 addUsersToMoveToVALUWorklist(ResultReg
, MRI
, Worklist
);
8491 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand
&Op
,
8492 MachineInstr
&SCCDefInst
,
8493 SIInstrWorklist
&Worklist
,
8494 Register NewCond
) const {
8496 // Ensure that def inst defines SCC, which is still live.
8497 assert(Op
.isReg() && Op
.getReg() == AMDGPU::SCC
&& Op
.isDef() &&
8498 !Op
.isDead() && Op
.getParent() == &SCCDefInst
);
8499 SmallVector
<MachineInstr
*, 4> CopyToDelete
;
8500 // This assumes that all the users of SCC are in the same block
8502 for (MachineInstr
&MI
: // Skip the def inst itself.
8503 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst
)),
8504 SCCDefInst
.getParent()->end())) {
8505 // Check if SCC is used first.
8506 int SCCIdx
= MI
.findRegisterUseOperandIdx(AMDGPU::SCC
, &RI
, false);
8509 MachineRegisterInfo
&MRI
= MI
.getParent()->getParent()->getRegInfo();
8510 Register DestReg
= MI
.getOperand(0).getReg();
8512 MRI
.replaceRegWith(DestReg
, NewCond
);
8513 CopyToDelete
.push_back(&MI
);
8516 if (NewCond
.isValid())
8517 MI
.getOperand(SCCIdx
).setReg(NewCond
);
8519 Worklist
.insert(&MI
);
8522 // Exit if we find another SCC def.
8523 if (MI
.findRegisterDefOperandIdx(AMDGPU::SCC
, &RI
, false, false) != -1)
8526 for (auto &Copy
: CopyToDelete
)
8527 Copy
->eraseFromParent();
8530 // Instructions that use SCC may be converted to VALU instructions. When that
8531 // happens, the SCC register is changed to VCC_LO. The instruction that defines
8532 // SCC must be changed to an instruction that defines VCC. This function makes
8533 // sure that the instruction that defines SCC is added to the moveToVALU
8535 void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr
*SCCUseInst
,
8536 SIInstrWorklist
&Worklist
) const {
8537 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8538 // then there is nothing to do because the defining instruction has been
8539 // converted to a VALU already. If SCC then that instruction needs to be
8540 // converted to a VALU.
8541 for (MachineInstr
&MI
:
8542 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst
)),
8543 SCCUseInst
->getParent()->rend())) {
8544 if (MI
.modifiesRegister(AMDGPU::VCC
, &RI
))
8546 if (MI
.definesRegister(AMDGPU::SCC
, &RI
)) {
8547 Worklist
.insert(&MI
);
8553 const TargetRegisterClass
*SIInstrInfo::getDestEquivalentVGPRClass(
8554 const MachineInstr
&Inst
) const {
8555 const TargetRegisterClass
*NewDstRC
= getOpRegClass(Inst
, 0);
8557 switch (Inst
.getOpcode()) {
8558 // For target instructions, getOpRegClass just returns the virtual register
8559 // class associated with the operand, so we need to find an equivalent VGPR
8560 // register class in order to move the instruction to the VALU.
8563 case AMDGPU::REG_SEQUENCE
:
8564 case AMDGPU::INSERT_SUBREG
:
8566 case AMDGPU::SOFT_WQM
:
8567 case AMDGPU::STRICT_WWM
:
8568 case AMDGPU::STRICT_WQM
: {
8569 const TargetRegisterClass
*SrcRC
= getOpRegClass(Inst
, 1);
8570 if (RI
.isAGPRClass(SrcRC
)) {
8571 if (RI
.isAGPRClass(NewDstRC
))
8574 switch (Inst
.getOpcode()) {
8576 case AMDGPU::REG_SEQUENCE
:
8577 case AMDGPU::INSERT_SUBREG
:
8578 NewDstRC
= RI
.getEquivalentAGPRClass(NewDstRC
);
8581 NewDstRC
= RI
.getEquivalentVGPRClass(NewDstRC
);
8587 if (RI
.isVGPRClass(NewDstRC
) || NewDstRC
== &AMDGPU::VReg_1RegClass
)
8590 NewDstRC
= RI
.getEquivalentVGPRClass(NewDstRC
);
8602 // Find the one SGPR operand we are allowed to use.
8603 Register
SIInstrInfo::findUsedSGPR(const MachineInstr
&MI
,
8604 int OpIndices
[3]) const {
8605 const MCInstrDesc
&Desc
= MI
.getDesc();
8607 // Find the one SGPR operand we are allowed to use.
8609 // First we need to consider the instruction's operand requirements before
8610 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8611 // of VCC, but we are still bound by the constant bus requirement to only use
8614 // If the operand's class is an SGPR, we can never move it.
8616 Register SGPRReg
= findImplicitSGPRRead(MI
);
8620 Register UsedSGPRs
[3] = {Register()};
8621 const MachineRegisterInfo
&MRI
= MI
.getParent()->getParent()->getRegInfo();
8623 for (unsigned i
= 0; i
< 3; ++i
) {
8624 int Idx
= OpIndices
[i
];
8628 const MachineOperand
&MO
= MI
.getOperand(Idx
);
8632 // Is this operand statically required to be an SGPR based on the operand
8634 const TargetRegisterClass
*OpRC
=
8635 RI
.getRegClass(Desc
.operands()[Idx
].RegClass
);
8636 bool IsRequiredSGPR
= RI
.isSGPRClass(OpRC
);
8640 // If this could be a VGPR or an SGPR, Check the dynamic register class.
8641 Register Reg
= MO
.getReg();
8642 const TargetRegisterClass
*RegRC
= MRI
.getRegClass(Reg
);
8643 if (RI
.isSGPRClass(RegRC
))
8647 // We don't have a required SGPR operand, so we have a bit more freedom in
8648 // selecting operands to move.
8650 // Try to select the most used SGPR. If an SGPR is equal to one of the
8651 // others, we choose that.
8654 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8655 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8657 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8661 if (UsedSGPRs
[0] == UsedSGPRs
[1] || UsedSGPRs
[0] == UsedSGPRs
[2])
8662 SGPRReg
= UsedSGPRs
[0];
8665 if (!SGPRReg
&& UsedSGPRs
[1]) {
8666 if (UsedSGPRs
[1] == UsedSGPRs
[2])
8667 SGPRReg
= UsedSGPRs
[1];
8673 MachineOperand
*SIInstrInfo::getNamedOperand(MachineInstr
&MI
,
8674 unsigned OperandName
) const {
8675 int Idx
= AMDGPU::getNamedOperandIdx(MI
.getOpcode(), OperandName
);
8679 return &MI
.getOperand(Idx
);
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
    int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
                         ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
                         : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
    return (Format << 44) |
           (1ULL << 56) | // RESOURCE_LEVEL = 1
           (3ULL << 60);  // OOB_SELECT = 3
  }

  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    // Set ATC = 1. GFX9 doesn't have this bit.
    if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (1ULL << 56);

    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    // BTW, it disables TC L2 and therefore decreases performance.
    if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}
uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size;

  // GFX9 doesn't have ELEMENT_SIZE.
  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
  }

  // IndexStride = 64 / 32.
  uint64_t IndexStride = ST.isWave64() ? 3 : 2;
  Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      ST.getGeneration() <= AMDGPUSubtarget::GFX9)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}
bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  return isSMRD(Opc);
}
bool SIInstrInfo::isHighLatencyDef(int Opc) const {
  return get(Opc).mayLoad() &&
         (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
}
unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
                                    int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
  if (!Addr || !Addr->isFI())
    return Register();

  assert(!MI.memoperands_empty() &&
         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);

  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
}
unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
                                        int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
  assert(Addr && Addr->isFI());
  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
}
Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                          int &FrameIndex) const {
  if (!MI.mayLoad())
    return Register();

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return Register();
}
Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                         int &FrameIndex) const {
  if (!MI.mayStore())
    return Register();

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return Register();
}
unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }

  return Size;
}
unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
  unsigned DescSize = Desc.getSize();

  // If we have a definitive size, we can use it. Otherwise we need to inspect
  // the operands to know the size.
  if (isFixedSize(MI)) {
    unsigned Size = DescSize;

    // If we hit the buggy offset, an extra nop will be inserted in MC so
    // estimate the worst case.
    if (MI.isBranch() && ST.hasOffset3fBug())
      Size += 4;
    return Size;
  }

  // Instructions may have a 32-bit literal encoded after them. Check
  // operands that could ever be literals.
  if (isVALU(MI) || isSALU(MI)) {
    if (isDPP(MI))
      return DescSize;
    bool HasLiteral = false;
    for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MCOperandInfo &OpInfo = Desc.operands()[I];
      if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
        HasLiteral = true;
        break;
      }
    }
    return HasLiteral ? DescSize + 4 : DescSize;
  }

  // Check whether we have extra NSA words.
  if (isMIMG(MI)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx < 0)
      return 8;

    int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
    return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
  }

  switch (Opc) {
  case TargetOpcode::BUNDLE:
    return getInstBundleSize(MI);
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const char *AsmStr = MI.getOperand(0).getSymbolName();
    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
  }
  default:
    if (MI.isMetaInstruction())
      return 0;
    return DescSize;
  }
}
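// Illustrative sizes from the logic above: a SALU or VALU instruction whose
// descriptor size is 4 bytes and which carries one non-inline 32-bit literal
// is reported as 4 + 4 = 8 bytes; without such a literal it stays at DescSize.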
bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
  if (!isFLAT(MI))
    return false;

  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *MMO : MI.memoperands()) {
    if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }

  return false;
}
ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
  static const std::pair<int, const char *> TargetIndices[] = {
      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
  return ArrayRef(TargetIndices);
}
/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
                                                const ScheduleDAG *DAG) const {
  return new GCNHazardRecognizer(DAG->MF);
}

/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
/// pass.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
  return new GCNHazardRecognizer(MF);
}
// Called during:
// - pre-RA scheduling and post-RA scheduling
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
                                            const ScheduleDAGMI *DAG) const {
  // Borrowed from Arm Target.
  // We would like to restrict this hazard recognizer to only
  // post-RA scheduling; we can tell that we're post-RA because we don't
  // track VRegLiveness.
  if (!DAG->hasVRegLiveness())
    return new GCNHazardRecognizer(DAG->MF);
  return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
}
std::pair<unsigned, unsigned>
SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
  return std::pair(TF & MO_MASK, TF & ~MO_MASK);
}
ArrayRef<std::pair<unsigned, const char *>>
SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
  static const std::pair<unsigned, const char *> TargetFlags[] = {
    { MO_GOTPCREL, "amdgpu-gotprel" },
    { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
    { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
    { MO_REL32_LO, "amdgpu-rel32-lo" },
    { MO_REL32_HI, "amdgpu-rel32-hi" },
    { MO_ABS32_LO, "amdgpu-abs32-lo" },
    { MO_ABS32_HI, "amdgpu-abs32-hi" },
  };

  return ArrayRef(TargetFlags);
}
ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
  static const std::pair<MachineMemOperand::Flags, const char *>
      TargetFlags[] = {
          {MONoClobber, "amdgpu-noclobber"},
          {MOLastUse, "amdgpu-last-use"},
      };

  return ArrayRef(TargetFlags);
}
unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
                                              const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  assert(SrcReg.isVirtual());
  if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
    return AMDGPU::WWM_COPY;

  return AMDGPU::COPY;
}
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
                                       Register Reg) const {
  // We need to handle instructions which may be inserted during register
  // allocation to handle the prolog. The initial prolog instruction may have
  // been separated from the start of the block by spills and copies inserted
  // that are needed by the prolog. However, the insertions for scalar
  // registers can always be placed at the BB top as they are independent of
  // the exec mask value.
  const MachineFunction *MF = MI.getParent()->getParent();
  bool IsNullOrVectorRegister = true;
  if (Reg) {
    const MachineRegisterInfo &MRI = MF->getRegInfo();
    IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
  }

  uint16_t Opcode = MI.getOpcode();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  return IsNullOrVectorRegister &&
         (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
          (Opcode == AMDGPU::IMPLICIT_DEF &&
           MFI->isWWMReg(MI.getOperand(0).getReg())) ||
          (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
           MI.modifiesRegister(AMDGPU::EXEC, &RI)));
}
MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator I,
                                               const DebugLoc &DL,
                                               Register DestReg) const {
  if (ST.hasAddNoCarry())
    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
  MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
      .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}
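// Typical use of the helper above (illustrative sketch, not from the original
// source; the operand names are placeholders):
//   getAddNoCarry(MBB, I, DL, DestReg)
//       .addReg(Src0)
//       .addReg(Src1)
//       .addImm(0); // clamp bit
// On subtargets with add-no-carry this builds V_ADD_U32_e64; otherwise it
// builds V_ADD_CO_U32_e64 with a dead carry-out operand (hinted to VCC)
// already attached by the helper, so the caller only appends the sources.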
MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator I,
                                               const DebugLoc &DL,
                                               Register DestReg,
                                               RegScavenger &RS) const {
  if (ST.hasAddNoCarry())
    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);

  // If available, prefer to use vcc.
  Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
                             ? Register(RI.getVCC())
                             : RS.scavengeRegisterBackwards(
                                   *RI.getBoolRC(), I, /* RestoreAfter */ false,
                                   0, /* AllowSpill */ false);

  // TODO: Users need to deal with this.
  if (!UnusedCarry.isValid())
    return MachineInstrBuilder();

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
      .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}
bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
  case AMDGPU::SI_KILL_I1_TERMINATOR:
    return true;
  default:
    return false;
  }
}

const MCInstrDesc &
SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
    return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
  case AMDGPU::SI_KILL_I1_PSEUDO:
    return get(AMDGPU::SI_KILL_I1_TERMINATOR);
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
  }
}
bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
  return Imm <= getMaxMUBUFImmOffset(ST);
}

unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
  // The GFX12 field is a 24-bit signed byte offset, of which only the
  // non-negative range is usable here, hence 23 offset bits.
  const unsigned OffsetBits =
      ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
  return (1 << OffsetBits) - 1;
}
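// Illustrative values derived from the formula above: pre-GFX12 targets use a
// 12-bit field, so the maximum legal immediate offset is (1 << 12) - 1 = 4095;
// on GFX12 it is (1 << 23) - 1 = 8388607.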
void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
  if (!ST.isWave32())
    return;

  if (MI.isInlineAsm())
    return;

  for (auto &Op : MI.implicit_operands()) {
    if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
      Op.setReg(AMDGPU::VCC_LO);
  }
}
bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
  if (!isSMRD(MI))
    return false;

  // Check that it is using a buffer resource.
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
  if (Idx == -1) // e.g. s_memtime
    return false;

  const auto RCID = MI.getDesc().operands()[Idx].RegClass;
  return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
}
// Given Imm, split it into the values to put into the SOffset and ImmOffset
// fields in an MUBUF instruction. Return false if it is not possible (due to a
// hardware bug needing a workaround).
//
// The required alignment ensures that individual address components remain
// aligned if they are aligned to begin with. It also ensures that additional
// offsets within the given alignment can be added to the resulting ImmOffset.
bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
                                   uint32_t &ImmOffset, Align Alignment) const {
  const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
  const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
  uint32_t Overflow = 0;

  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      // Use an SOffset inline constant for 4..64
      Overflow = Imm - MaxImm;
      Imm = MaxImm;
    } else {
      // Try to keep the same value in SOffset for adjacent loads, so that
      // the corresponding register contents can be re-used.
      //
      // Load values with all low-bits (except for alignment bits) set into
      // SOffset, so that a larger range of values can be covered using
      // s_movk_i32.
      //
      // Atomic operations fail to work correctly when individual address
      // components are unaligned, even if their sum is aligned.
      uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
      uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
      Imm = Low;
      Overflow = High - Alignment.value();
    }
  }

  if (Overflow > 0) {
    // There is a hardware bug in SI and CI which prevents address clamping in
    // MUBUF instructions from working correctly with SOffsets. The immediate
    // offset is unaffected.
    if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
      return false;

    // It is not possible to set immediate in SOffset field on some targets.
    if (ST.hasRestrictedSOffset())
      return false;
  }

  ImmOffset = Imm;
  SOffset = Overflow;
  return true;
}
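// Worked example (illustrative, assuming a pre-GFX12 target where the maximum
// immediate offset is 4095): splitting Imm = 4100 with Alignment = 4 gives
// MaxImm = alignDown(4095, 4) = 4092; since 4100 <= 4092 + 64, the result is
// ImmOffset = 4092 and SOffset = 8, an inline constant.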
// Depending on the used address space and instructions, some immediate offsets
// are allowed and some are not.
// Pre-GFX12, flat instruction offsets can only be non-negative, global and
// scratch instruction offsets can also be negative. On GFX12, offsets can be
// negative for all variants.
//
// There are several bugs related to these offsets:
// On gfx10.1, flat instructions that go into the global address space cannot
// use an offset.
//
// For scratch instructions, the address can be either an SGPR or a VGPR.
// The following offsets can be used, depending on the architecture (x means
// cannot be used):
// +----------------------------+------+------+
// | Address-Mode               | SGPR | VGPR |
// +----------------------------+------+------+
// | gfx9                       |      |      |
// | negative, 4-aligned offset | x    | ok   |
// | negative, unaligned offset | x    | ok   |
// +----------------------------+------+------+
// | gfx10                      |      |      |
// | negative, 4-aligned offset | ok   | ok   |
// | negative, unaligned offset | ok   | x    |
// +----------------------------+------+------+
// | gfx10.3                    |      |      |
// | negative, 4-aligned offset | ok   | ok   |
// | negative, unaligned offset | ok   | ok   |
// +----------------------------+------+------+
//
// This function ignores the addressing mode, so if an offset cannot be used in
// one addressing mode, it is considered illegal.
bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
                                    uint64_t FlatVariant) const {
  // TODO: Should 0 be special cased?
  if (!ST.hasFlatInstOffsets())
    return false;

  if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
      (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
       AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
    return false;

  if (ST.hasNegativeUnalignedScratchOffsetBug() &&
      FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
      (Offset % 4) != 0) {
    return false;
  }

  bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
  unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
  return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
}
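// Illustrative reading of the final check above: if
// AMDGPU::getNumFlatOffsetBits(ST) returned 12 (the exact width is
// target-dependent), a variant that allows negative offsets would accept
// values in [-2048, 2047], while a variant that does not would accept only
// [0, 2047].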
// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
std::pair<int64_t, int64_t>
SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
                             uint64_t FlatVariant) const {
  int64_t RemainderOffset = COffsetVal;
  int64_t ImmField = 0;

  bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
  const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;

  if (AllowNegative) {
    // Use signed division by a power of two to truncate towards 0.
    int64_t D = 1LL << NumBits;
    RemainderOffset = (COffsetVal / D) * D;
    ImmField = COffsetVal - RemainderOffset;

    if (ST.hasNegativeUnalignedScratchOffsetBug() &&
        FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
        (ImmField % 4) != 0) {
      // Make ImmField a multiple of 4
      RemainderOffset += ImmField % 4;
      ImmField -= ImmField % 4;
    }
  } else if (COffsetVal >= 0) {
    ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
    RemainderOffset = COffsetVal - ImmField;
  }

  assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
  assert(RemainderOffset + ImmField == COffsetVal);
  return {ImmField, RemainderOffset};
}
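// Worked example (illustrative, assuming AMDGPU::getNumFlatOffsetBits(ST)
// returns 12 and negative offsets are allowed, so NumBits = 11 and D = 2048):
// splitting COffsetVal = 5000 yields RemainderOffset = (5000 / 2048) * 2048 =
// 4096 and ImmField = 904, and indeed 4096 + 904 == 5000.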
bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
  if (ST.hasNegativeScratchOffsetBug() &&
      FlatVariant == SIInstrFlags::FlatScratch)
    return false;

  return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
}
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
  switch (ST.getGeneration()) {
  default:
    break;
  case AMDGPUSubtarget::SOUTHERN_ISLANDS:
  case AMDGPUSubtarget::SEA_ISLANDS:
    return SIEncodingFamily::SI;
  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
  case AMDGPUSubtarget::GFX9:
    return SIEncodingFamily::VI;
  case AMDGPUSubtarget::GFX10:
    return SIEncodingFamily::GFX10;
  case AMDGPUSubtarget::GFX11:
    return SIEncodingFamily::GFX11;
  case AMDGPUSubtarget::GFX12:
    return SIEncodingFamily::GFX12;
  }
  llvm_unreachable("Unknown subtarget generation!");
}
bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
  switch (MCOp) {
  // These opcodes use indirect register addressing so
  // they need special handling by codegen (currently missing).
  // Therefore it is too risky to allow these opcodes
  // to be selected by dpp combiner or sdwa peepholer.
  case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
    return true;
  default:
    return false;
  }
}
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)                                    \
  case OPCODE##_dpp:                                                           \
  case OPCODE##_e32:                                                           \
  case OPCODE##_e64:                                                           \
  case OPCODE##_e64_dpp:                                                       \
  case OPCODE##_sdwa:

static bool isRenamedInGFX9(int Opcode) {
  switch (Opcode) {
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
    return true;
  case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
  case AMDGPU::V_FMA_F16_gfx9_e64:
  case AMDGPU::V_INTERP_P2_F16:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
    return true;
  default:
    return false;
  }
}
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
  Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);

  unsigned Gen = subtargetEncodingFamily(ST);

  if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
    Gen = SIEncodingFamily::GFX9;

  // Adjust the encoding family to GFX80 for D16 buffer instructions when the
  // subtarget has UnpackedD16VMem feature.
  // TODO: remove this when we discard GFX80 encoding.
  if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
    Gen = SIEncodingFamily::GFX80;

  if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
    switch (ST.getGeneration()) {
    default:
      Gen = SIEncodingFamily::SDWA;
      break;
    case AMDGPUSubtarget::GFX9:
      Gen = SIEncodingFamily::SDWA9;
      break;
    case AMDGPUSubtarget::GFX10:
      Gen = SIEncodingFamily::SDWA10;
      break;
    }
  }

  if (isMAI(Opcode)) {
    int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
    if (MFMAOp != -1)
      Opcode = MFMAOp;
  }

  int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);

  // -1 means that Opcode is already a native instruction.
  if (MCOp == -1)
    return Opcode;

  if (ST.hasGFX90AInsts()) {
    uint16_t NMCOp = (uint16_t)-1;
    if (ST.hasGFX940Insts())
      NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
    if (NMCOp == (uint16_t)-1)
      NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
    if (NMCOp == (uint16_t)-1)
      NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
    if (NMCOp != (uint16_t)-1)
      MCOp = NMCOp;
  }

  // (uint16_t)-1 means that Opcode is a pseudo instruction that has
  // no encoding in the given subtarget generation.
  if (MCOp == (uint16_t)-1)
    return -1;

  if (isAsmOnlyOpcode(MCOp))
    return -1;

  return MCOp;
}
TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
  assert(RegOpnd.isReg());
  return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
                             getRegSubRegPair(RegOpnd);
}

TargetInstrInfo::RegSubRegPair
llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
  assert(MI.isRegSequence());
  for (unsigned I = 0, E = (MI.getNumOperands() - 1) / 2; I < E; ++I)
    if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
      auto &RegOp = MI.getOperand(1 + 2 * I);
      return getRegOrUndef(RegOp);
    }
  return TargetInstrInfo::RegSubRegPair();
}
// Try to find the definition of reg:subreg in subreg-manipulation pseudos.
// Following a subreg of reg:subreg isn't supported.
static bool followSubRegDef(MachineInstr &MI,
                            TargetInstrInfo::RegSubRegPair &RSR) {
  if (!RSR.SubReg)
    return false;
  switch (MI.getOpcode()) {
  default:
    break;
  case AMDGPU::REG_SEQUENCE:
    RSR = getRegSequenceSubReg(MI, RSR.SubReg);
    return true;
  // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
  case AMDGPU::INSERT_SUBREG:
    if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
      // inserted the subreg we're looking for
      RSR = getRegOrUndef(MI.getOperand(2));
    else { // the subreg in the rest of the reg
      auto R1 = getRegOrUndef(MI.getOperand(1));
      if (R1.SubReg) // subreg of subreg isn't supported
        return false;
      RSR.Reg = R1.Reg;
    }
    return true;
  }
  return false;
}
MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
                                     MachineRegisterInfo &MRI) {
  assert(MRI.isSSA());
  if (!P.Reg.isVirtual())
    return nullptr;

  auto RSR = P;
  auto *DefInst = MRI.getVRegDef(RSR.Reg);
  while (auto *MI = DefInst) {
    DefInst = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::V_MOV_B32_e32: {
      auto &Op1 = MI->getOperand(1);
      if (Op1.isReg() && Op1.getReg().isVirtual()) {
        if (Op1.isUndef())
          return nullptr;
        RSR = getRegSubRegPair(Op1);
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
      break;
    }
    default:
      if (followSubRegDef(*MI, RSR)) {
        if (!RSR.Reg)
          return nullptr;
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
    }
    if (!DefInst)
      return MI;
  }
  return nullptr;
}
bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
                                      Register VReg,
                                      const MachineInstr &DefMI,
                                      const MachineInstr &UseMI) {
  assert(MRI.isSSA() && "Must be run on SSA");

  auto *TRI = MRI.getTargetRegisterInfo();
  auto *DefBB = DefMI.getParent();

  // Don't bother searching between blocks, although it is possible this block
  // doesn't modify exec.
  if (UseMI.getParent() != DefBB)
    return true;

  const int MaxInstScan = 20;
  int NumInst = 0;

  // Stop scan at the use.
  auto E = UseMI.getIterator();
  for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
    if (I->isDebugInstr())
      continue;

    if (++NumInst > MaxInstScan)
      return true;

    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
      return true;
  }

  return false;
}
bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
                                         Register VReg,
                                         const MachineInstr &DefMI) {
  assert(MRI.isSSA() && "Must be run on SSA");

  auto *TRI = MRI.getTargetRegisterInfo();
  auto *DefBB = DefMI.getParent();

  const int MaxUseScan = 10;
  int NumUse = 0;

  for (auto &Use : MRI.use_nodbg_operands(VReg)) {
    auto &UseInst = *Use.getParent();
    // Don't bother searching between blocks, although it is possible this
    // block doesn't modify exec.
    if (UseInst.getParent() != DefBB || UseInst.isPHI())
      return true;

    if (++NumUse > MaxUseScan)
      return true;
  }

  if (NumUse == 0)
    return false;

  const int MaxInstScan = 20;
  int NumInst = 0;

  // Stop scan when we have seen all the uses.
  for (auto I = std::next(DefMI.getIterator()); ; ++I) {
    assert(I != DefBB->end());

    if (I->isDebugInstr())
      continue;

    if (++NumInst > MaxInstScan)
      return true;

    for (const MachineOperand &Op : I->operands()) {
      // We don't check reg masks here as they're used only on calls:
      // 1. EXEC is only considered const within one BB
      // 2. Call should be a terminator instruction if present in a BB
      if (!Op.isReg())
        continue;

      Register Reg = Op.getReg();
      if (Op.isUse()) {
        if (Reg == VReg && --NumUse == 0)
          return false;
      } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
        return true;
    }
  }
}
MachineInstr *SIInstrInfo::createPHIDestinationCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
    const DebugLoc &DL, Register Src, Register Dst) const {
  auto Cur = MBB.begin();
  if (Cur != MBB.end())
    do {
      if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
        return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
      ++Cur;
    } while (Cur != MBB.end() && Cur != LastPHIIt);

  return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
                                                   Dst);
}
MachineInstr *SIInstrInfo::createPHISourceCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
    const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
  if (InsPt != MBB.end() &&
      (InsPt->getOpcode() == AMDGPU::SI_IF ||
       InsPt->getOpcode() == AMDGPU::SI_ELSE ||
       InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
      InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
    InsPt++;
    return BuildMI(MBB, InsPt, DL,
                   get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
                                     : AMDGPU::S_MOV_B64_term),
                   Dst)
        .addReg(Src, 0, SrcSubReg)
        .addReg(AMDGPU::EXEC, RegState::Implicit);
  }
  return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
                                              Dst);
}

bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
    MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
    VirtRegMap *VRM) const {
  // This is a bit of a hack (copied from AArch64). Consider this instruction:
  //
  //   %0:sreg_32 = COPY $m0
  //
  // We explicitly chose SReg_32 for the virtual register so such a copy might
  // be eliminated by RegisterCoalescer. However, that may not be possible, and
  // %0 may even spill. We can't spill $m0 normally (it would require copying to
  // a numbered SGPR anyway), and since it is in the SReg_32 register class,
  // TargetInstrInfo::foldMemoryOperand() is going to try.
  // A similar issue also exists with spilling and reloading $exec registers.
  //
  // To prevent that, constrain the %0 register class here.
  if (isFullCopyInstr(MI)) {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
        (DstReg.isVirtual() != SrcReg.isVirtual())) {
      MachineRegisterInfo &MRI = MF.getRegInfo();
      Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
      const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
      if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
        MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
        return nullptr;
      }
      if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
        MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
        return nullptr;
      }
    }
  }

  return nullptr;
}
unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
                                      const MachineInstr &MI,
                                      unsigned *PredCost) const {
  if (MI.isBundle()) {
    MachineBasicBlock::const_instr_iterator I(MI.getIterator());
    MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
    unsigned Lat = 0, Count = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      ++Count;
      Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
    }
    return Lat + Count - 1;
  }

  return SchedModel.computeInstrLatency(&MI);
}
InstructionUniformity
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
  unsigned opcode = MI.getOpcode();
  if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
    auto IID = GI->getIntrinsicID();
    if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
      return InstructionUniformity::NeverUniform;
    if (AMDGPU::isIntrinsicAlwaysUniform(IID))
      return InstructionUniformity::AlwaysUniform;

    switch (IID) {
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else:
      // FIXME: Uniform if second result
      break;
    }

    return InstructionUniformity::Default;
  }

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (opcode == AMDGPU::G_LOAD) {
    if (MI.memoperands_empty())
      return InstructionUniformity::NeverUniform; // conservative assumption
    if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
          return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
                 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
        })) {
      // At least one MMO in a non-global address space.
      return InstructionUniformity::NeverUniform;
    }
    return InstructionUniformity::Default;
  }

  if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
      opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
      opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
      AMDGPU::isGenericAtomic(opcode)) {
    return InstructionUniformity::NeverUniform;
  }
  return InstructionUniformity::Default;
}
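// Illustrative consequence of the G_LOAD handling above: a G_LOAD whose only
// memory operand is in the private or flat address space is reported as
// NeverUniform, while one whose memory operands are all in other (e.g. global
// or constant) address spaces falls through to Default.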
InstructionUniformity
SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
  if (isNeverUniform(MI))
    return InstructionUniformity::NeverUniform;

  unsigned opcode = MI.getOpcode();
  if (opcode == AMDGPU::V_READLANE_B32 ||
      opcode == AMDGPU::V_READFIRSTLANE_B32 ||
      opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
    return InstructionUniformity::AlwaysUniform;

  if (isCopyInstr(MI)) {
    const MachineOperand &srcOp = MI.getOperand(1);
    if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
      const TargetRegisterClass *regClass =
          RI.getPhysRegBaseClass(srcOp.getReg());
      return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
                                      : InstructionUniformity::NeverUniform;
    }
    return InstructionUniformity::Default;
  }

  if (MI.isPreISelOpcode())
    return SIInstrInfo::getGenericInstructionUniformity(MI);

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // original value.
  if (isAtomic(MI))
    return InstructionUniformity::NeverUniform;

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  if (isFLAT(MI) && MI.mayLoad()) {
    if (MI.memoperands_empty())
      return InstructionUniformity::NeverUniform; // conservative assumption

    if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
          return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
                 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
        })) {
      // At least one MMO in a non-global address space.
      return InstructionUniformity::NeverUniform;
    }

    return InstructionUniformity::Default;
  }

  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();

  // FIXME: It's conceptually broken to report this for an instruction, and not
  // a specific def operand. For inline asm in particular, there could be mixed
  // uniform and divergent results.
  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &SrcOp = MI.getOperand(I);
    if (!SrcOp.isReg())
      continue;

    Register Reg = SrcOp.getReg();
    if (!Reg || !SrcOp.readsReg())
      continue;

    // If RegBank is null, this is unassigned or an unallocatable special
    // register, which are all scalars.
    const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
    if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
      return InstructionUniformity::NeverUniform;
  }

  // TODO: Uniformity check conditions above can be rearranged for more
  // optimal checks.

  // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
  // currently turned into no-op COPYs by SelectionDAG ISel and are
  // therefore no longer recognizable.

  return InstructionUniformity::Default;
}
unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
  switch (MF.getFunction().getCallingConv()) {
  case CallingConv::AMDGPU_PS:
    return 1;
  case CallingConv::AMDGPU_VS:
    return 2;
  case CallingConv::AMDGPU_GS:
    return 3;
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_ES:
    report_fatal_error("ds_ordered_count unsupported for this calling conv");
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::C:
  case CallingConv::Fast:
  default:
    // Assume other calling conventions are various compute callable functions
    return 0;
  }
}
bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                 Register &SrcReg2, int64_t &CmpMask,
                                 int64_t &CmpValue) const {
  if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AMDGPU::S_CMP_EQ_U32:
  case AMDGPU::S_CMP_EQ_I32:
  case AMDGPU::S_CMP_LG_U32:
  case AMDGPU::S_CMP_LG_I32:
  case AMDGPU::S_CMP_LT_U32:
  case AMDGPU::S_CMP_LT_I32:
  case AMDGPU::S_CMP_GT_U32:
  case AMDGPU::S_CMP_GT_I32:
  case AMDGPU::S_CMP_LE_U32:
  case AMDGPU::S_CMP_LE_I32:
  case AMDGPU::S_CMP_GE_U32:
  case AMDGPU::S_CMP_GE_I32:
  case AMDGPU::S_CMP_EQ_U64:
  case AMDGPU::S_CMP_LG_U64:
    SrcReg = MI.getOperand(0).getReg();
    if (MI.getOperand(1).isReg()) {
      if (MI.getOperand(1).getSubReg())
        return false;
      SrcReg2 = MI.getOperand(1).getReg();
      CmpValue = 0;
    } else if (MI.getOperand(1).isImm()) {
      SrcReg2 = Register();
      CmpValue = MI.getOperand(1).getImm();
    } else {
      return false;
    }
    CmpMask = ~0;
    return true;
  case AMDGPU::S_CMPK_EQ_U32:
  case AMDGPU::S_CMPK_EQ_I32:
  case AMDGPU::S_CMPK_LG_U32:
  case AMDGPU::S_CMPK_LG_I32:
  case AMDGPU::S_CMPK_LT_U32:
  case AMDGPU::S_CMPK_LT_I32:
  case AMDGPU::S_CMPK_GT_U32:
  case AMDGPU::S_CMPK_GT_I32:
  case AMDGPU::S_CMPK_LE_U32:
  case AMDGPU::S_CMPK_LE_I32:
  case AMDGPU::S_CMPK_GE_U32:
  case AMDGPU::S_CMPK_GE_I32:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = Register();
    CmpValue = MI.getOperand(1).getImm();
    CmpMask = ~0;
    return true;
  }

  return false;
}
bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
                                       Register SrcReg2, int64_t CmpMask,
                                       int64_t CmpValue,
                                       const MachineRegisterInfo *MRI) const {
  if (!SrcReg || SrcReg.isPhysical())
    return false;

  if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
    return false;

  const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
                               this](int64_t ExpectedValue, unsigned SrcSize,
                                     bool IsReversible, bool IsSigned) -> bool {
    // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
    // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
    //
    // Signed ge/gt are not used for the sign bit.
    //
    // If result of the AND is unused except in the compare:
    // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
    //
    // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
    // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
    // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
    // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
    // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
    // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n

    MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
    if (!Def || Def->getParent() != CmpInstr.getParent())
      return false;

    if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
        Def->getOpcode() != AMDGPU::S_AND_B64)
      return false;

    int64_t Mask;
    const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
      if (MO->isImm())
        Mask = MO->getImm();
      else if (!getFoldableImm(MO, Mask))
        return false;
      Mask &= maxUIntN(SrcSize);
      return isPowerOf2_64(Mask);
    };

    MachineOperand *SrcOp = &Def->getOperand(1);
    if (isMask(SrcOp))
      SrcOp = &Def->getOperand(2);
    else if (isMask(&Def->getOperand(2)))
      SrcOp = &Def->getOperand(1);
    else
      return false;

    // A valid Mask is required to have a single bit set, hence a non-zero and
    // power-of-two value. This verifies that we will not do 64-bit shift below.
    assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
    unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
    if (IsSigned && BitNo == SrcSize - 1)
      return false;

    ExpectedValue <<= BitNo;

    bool IsReversedCC = false;
    if (CmpValue != ExpectedValue) {
      if (!IsReversible)
        return false;
      IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
      if (!IsReversedCC)
        return false;
    }

    Register DefReg = Def->getOperand(0).getReg();
    if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
      return false;

    for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
         I != E; ++I) {
      if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
          I->killsRegister(AMDGPU::SCC, &RI))
        return false;
    }

    MachineOperand *SccDef =
        Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
    SccDef->setIsDead(false);
    CmpInstr.eraseFromParent();

    if (!MRI->use_nodbg_empty(DefReg)) {
      assert(!IsReversedCC);
      return true;
    }

    // Replace AND with unused result with a S_BITCMP.
    MachineBasicBlock *MBB = Def->getParent();

    unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
                                                     : AMDGPU::S_BITCMP1_B32
                                      : IsReversedCC ? AMDGPU::S_BITCMP0_B64
                                                     : AMDGPU::S_BITCMP1_B64;

    BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
        .add(*SrcOp)
        .addImm(BitNo);
    Def->eraseFromParent();

    return true;
  };

  switch (CmpInstr.getOpcode()) {
  default:
    break;
  case AMDGPU::S_CMP_EQ_U32:
  case AMDGPU::S_CMP_EQ_I32:
  case AMDGPU::S_CMPK_EQ_U32:
  case AMDGPU::S_CMPK_EQ_I32:
    return optimizeCmpAnd(1, 32, true, false);
  case AMDGPU::S_CMP_GE_U32:
  case AMDGPU::S_CMPK_GE_U32:
    return optimizeCmpAnd(1, 32, false, false);
  case AMDGPU::S_CMP_GE_I32:
  case AMDGPU::S_CMPK_GE_I32:
    return optimizeCmpAnd(1, 32, false, true);
  case AMDGPU::S_CMP_EQ_U64:
    return optimizeCmpAnd(1, 64, true, false);
  case AMDGPU::S_CMP_LG_U32:
  case AMDGPU::S_CMP_LG_I32:
  case AMDGPU::S_CMPK_LG_U32:
  case AMDGPU::S_CMPK_LG_I32:
    return optimizeCmpAnd(0, 32, true, false);
  case AMDGPU::S_CMP_GT_U32:
  case AMDGPU::S_CMPK_GT_U32:
    return optimizeCmpAnd(0, 32, false, false);
  case AMDGPU::S_CMP_GT_I32:
  case AMDGPU::S_CMPK_GT_I32:
    return optimizeCmpAnd(0, 32, false, true);
  case AMDGPU::S_CMP_LG_U64:
    return optimizeCmpAnd(0, 64, true, false);
  }

  return false;
}
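// Illustrative before/after of the fold implemented above (assuming the AND
// result has no other uses):
//   %1:sreg_32 = S_AND_B32 %0:sreg_32, 4, implicit-def $scc
//   S_CMP_LG_U32 %1, 0, implicit-def $scc
// becomes
//   S_BITCMP1_B32 %0, 2, implicit-def $scc
// i.e. the compare is deleted and the AND of the single-bit mask (1 << 2) is
// rewritten as a bit test that sets SCC directly.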
void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
                                            unsigned OpName) const {
  if (!ST.needsAlignedVGPRs())
    return;

  int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
  if (OpNo < 0)
    return;
  MachineOperand &Op = MI.getOperand(OpNo);
  if (getOpSize(MI, OpNo) > 4)
    return;

  // Add implicit aligned super-reg to force alignment on the data operand.
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();
  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  Register DataReg = Op.getReg();
  bool IsAGPR = RI.isAGPR(MRI, DataReg);
  Register Undef = MRI.createVirtualRegister(
      IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
  BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
  Register NewVR =
      MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
                                       : &AMDGPU::VReg_64_Align2RegClass);
  BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
      .addReg(DataReg, 0, Op.getSubReg())
      .addImm(AMDGPU::sub0)
      .addReg(Undef)
      .addImm(AMDGPU::sub1);
  Op.setReg(NewVR);
  Op.setSubReg(AMDGPU::sub0);
  MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
}
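// Illustrative effect of enforceOperandRCAlignment on a 32-bit data operand
// %1:vgpr_32 (hypothetical MIR sketch of the REG_SEQUENCE built above):
//   %2:vgpr_32 = IMPLICIT_DEF
//   %3:vreg_64_align2 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1
// The original operand is then rewritten to %3.sub0, and %3 is also added as
// an implicit use so the register allocator sees the aligned register tuple.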
, false, true));