1 //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI Implementation of TargetInstrInfo.
12 //===----------------------------------------------------------------------===//
14 #include "SIInstrInfo.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "GCNHazardRecognizer.h"
18 #include "GCNSubtarget.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "Utils/AMDGPUBaseInfo.h"
21 #include "llvm/Analysis/ValueTracking.h"
22 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
23 #include "llvm/CodeGen/LiveIntervals.h"
24 #include "llvm/CodeGen/LiveVariables.h"
25 #include "llvm/CodeGen/MachineDominators.h"
26 #include "llvm/CodeGen/MachineFrameInfo.h"
27 #include "llvm/CodeGen/MachineScheduler.h"
28 #include "llvm/CodeGen/RegisterScavenging.h"
29 #include "llvm/CodeGen/ScheduleDAG.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/IntrinsicsAMDGPU.h"
32 #include "llvm/MC/MCContext.h"
33 #include "llvm/Support/CommandLine.h"
34 #include "llvm/Target/TargetMachine.h"
36 using namespace llvm;
38 #define DEBUG_TYPE "si-instr-info"
40 #define GET_INSTRINFO_CTOR_DTOR
41 #include "AMDGPUGenInstrInfo.inc"
43 namespace llvm::AMDGPU {
44 #define GET_D16ImageDimIntrinsics_IMPL
45 #define GET_ImageDimIntrinsicTable_IMPL
46 #define GET_RsrcIntrinsics_IMPL
47 #include "AMDGPUGenSearchableTables.inc"
48 } // namespace llvm::AMDGPU
50 // Must be at least 4 to be able to branch over minimum unconditional branch
51 // code. This is only for making it possible to write reasonably small tests for
52 // long branches.
53 static cl::opt<unsigned>
54 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
55 cl::desc("Restrict range of branch instructions (DEBUG)"));
57 static cl::opt<bool> Fix16BitCopies(
58 "amdgpu-fix-16-bit-physreg-copies",
59 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
60 cl::init(true),
61 cl::ReallyHidden);
63 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
64 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
65 RI(ST), ST(ST) {
66 SchedModel.init(&ST);
69 //===----------------------------------------------------------------------===//
70 // TargetInstrInfo callbacks
71 //===----------------------------------------------------------------------===//
73 static unsigned getNumOperandsNoGlue(SDNode *Node) {
74 unsigned N = Node->getNumOperands();
75 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
76 --N;
77 return N;
80 /// Returns true if both nodes have the same value for the given
81 /// operand \p OpName, or if both nodes do not have this operand.
82 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
83 unsigned Opc0 = N0->getMachineOpcode();
84 unsigned Opc1 = N1->getMachineOpcode();
86 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
87 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
89 if (Op0Idx == -1 && Op1Idx == -1)
90 return true;
93 if ((Op0Idx == -1 && Op1Idx != -1) ||
94 (Op1Idx == -1 && Op0Idx != -1))
95 return false;
97 // getNamedOperandIdx returns the index for the MachineInstr's operands,
98 // which includes the result as the first operand. We are indexing into the
99 // MachineSDNode's operands, so we need to skip the result operand to get
100 // the real index.
101 --Op0Idx;
102 --Op1Idx;
104 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
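// Illustrative sketch of the index adjustment above (the operand name is just
// an example): if getNamedOperandIdx(Opc, AMDGPU::OpName::offset) returns 3,
// the MachineInstr operand list is (result, src0, src1, offset, ...), while
// the MachineSDNode operand list omits the result and is
// (src0, src1, offset, ...), so the SDNode index is 3 - 1 = 2.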
107 static bool canRemat(const MachineInstr &MI) {
109 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
110 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
111 SIInstrInfo::isSALU(MI))
112 return true;
114 if (SIInstrInfo::isSMRD(MI)) {
115 return !MI.memoperands_empty() &&
116 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
117 return MMO->isLoad() && MMO->isInvariant();
121 return false;
124 bool SIInstrInfo::isReallyTriviallyReMaterializable(
125 const MachineInstr &MI) const {
127 if (canRemat(MI)) {
128     // Normally a VALU use of exec would block rematerialization, but an
129     // implicit exec read is fine here since every VALU instruction has one.
130     // We want all of the generic logic for this except for that exec check.
132 // Another potential implicit use is mode register. The core logic of
133 // the RA will not attempt rematerialization if mode is set anywhere
134 // in the function, otherwise it is safe since mode is not changed.
136     // This differs from the generic method, which does not allow
137     // rematerialization if there are virtual register uses. We allow this,
138     // which is why this method covers SOP instructions as well.
139 if (!MI.hasImplicitDef() &&
140 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
141 !MI.mayRaiseFPException())
142 return true;
145 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
148 // Returns true if the scalar result of a VALU instruction depends on exec.
149 static bool resultDependsOnExec(const MachineInstr &MI) {
150 // Ignore comparisons which are only used masked with exec.
151 // This allows some hoisting/sinking of VALU comparisons.
152 if (MI.isCompare()) {
153 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
154 Register DstReg = MI.getOperand(0).getReg();
155 if (!DstReg.isVirtual())
156 return true;
157 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
158 switch (Use.getOpcode()) {
159 case AMDGPU::S_AND_SAVEEXEC_B32:
160 case AMDGPU::S_AND_SAVEEXEC_B64:
161 break;
162 case AMDGPU::S_AND_B32:
163 case AMDGPU::S_AND_B64:
164 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
165 return true;
166 break;
167 default:
168 return true;
171 return false;
174 switch (MI.getOpcode()) {
175 default:
176 break;
177 case AMDGPU::V_READFIRSTLANE_B32:
178 return true;
181 return false;
184 bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
185 // Any implicit use of exec by VALU is not a real register read.
186 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
187 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
190 bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
191 MachineBasicBlock *SuccToSinkTo,
192 MachineCycleInfo *CI) const {
193 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
194 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
195 return true;
197 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
198 // Check if sinking of MI would create temporal divergent use.
199 for (auto Op : MI.uses()) {
200 if (Op.isReg() && Op.getReg().isVirtual() &&
201 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
202 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
204 // SgprDef defined inside cycle
205 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
206 if (FromCycle == nullptr)
207 continue;
209 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
210 // Check if there is a FromCycle that contains SgprDef's basic block but
211 // does not contain SuccToSinkTo and also has divergent exit condition.
212 while (FromCycle && !FromCycle->contains(ToCycle)) {
213 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
214 FromCycle->getExitingBlocks(ExitingBlocks);
216 // FromCycle has divergent exit condition.
217 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
218 if (hasDivergentBranch(ExitingBlock))
219 return false;
222 FromCycle = FromCycle->getParentCycle();
227 return true;
230 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
231 int64_t &Offset0,
232 int64_t &Offset1) const {
233 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
234 return false;
236 unsigned Opc0 = Load0->getMachineOpcode();
237 unsigned Opc1 = Load1->getMachineOpcode();
239 // Make sure both are actually loads.
240 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
241 return false;
243 // A mayLoad instruction without a def is not a load. Likely a prefetch.
244 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
245 return false;
247 if (isDS(Opc0) && isDS(Opc1)) {
249 // FIXME: Handle this case:
250 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
251 return false;
253 // Check base reg.
254 if (Load0->getOperand(0) != Load1->getOperand(0))
255 return false;
257 // Skip read2 / write2 variants for simplicity.
258     // TODO: We should report true if the used offsets are adjacent (excluding
259     // st64 versions).
260 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
261 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
262 if (Offset0Idx == -1 || Offset1Idx == -1)
263 return false;
265 // XXX - be careful of dataless loads
266 // getNamedOperandIdx returns the index for MachineInstrs. Since they
267 // include the output in the operand list, but SDNodes don't, we need to
268 // subtract the index by one.
269 Offset0Idx -= get(Opc0).NumDefs;
270 Offset1Idx -= get(Opc1).NumDefs;
271 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
272 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
273 return true;
276 if (isSMRD(Opc0) && isSMRD(Opc1)) {
277 // Skip time and cache invalidation instructions.
278 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
279 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
280 return false;
282 unsigned NumOps = getNumOperandsNoGlue(Load0);
283 if (NumOps != getNumOperandsNoGlue(Load1))
284 return false;
286 // Check base reg.
287 if (Load0->getOperand(0) != Load1->getOperand(0))
288 return false;
290     // Match register offsets if both register and immediate offsets are present.
291 assert(NumOps == 4 || NumOps == 5);
292 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
293 return false;
295 const ConstantSDNode *Load0Offset =
296 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
297 const ConstantSDNode *Load1Offset =
298 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
300 if (!Load0Offset || !Load1Offset)
301 return false;
303 Offset0 = Load0Offset->getZExtValue();
304 Offset1 = Load1Offset->getZExtValue();
305 return true;
308 // MUBUF and MTBUF can access the same addresses.
309 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
311 // MUBUF and MTBUF have vaddr at different indices.
312 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
313 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
314 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
315 return false;
317 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
318 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
320 if (OffIdx0 == -1 || OffIdx1 == -1)
321 return false;
323 // getNamedOperandIdx returns the index for MachineInstrs. Since they
324 // include the output in the operand list, but SDNodes don't, we need to
325 // subtract the index by one.
326 OffIdx0 -= get(Opc0).NumDefs;
327 OffIdx1 -= get(Opc1).NumDefs;
329 SDValue Off0 = Load0->getOperand(OffIdx0);
330 SDValue Off1 = Load1->getOperand(OffIdx1);
332 // The offset might be a FrameIndexSDNode.
333 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
334 return false;
336 Offset0 = Off0->getAsZExtVal();
337 Offset1 = Off1->getAsZExtVal();
338 return true;
341 return false;
344 static bool isStride64(unsigned Opc) {
345 switch (Opc) {
346 case AMDGPU::DS_READ2ST64_B32:
347 case AMDGPU::DS_READ2ST64_B64:
348 case AMDGPU::DS_WRITE2ST64_B32:
349 case AMDGPU::DS_WRITE2ST64_B64:
350 return true;
351 default:
352 return false;
356 bool SIInstrInfo::getMemOperandsWithOffsetWidth(
357 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
358 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
359 const TargetRegisterInfo *TRI) const {
360 if (!LdSt.mayLoadOrStore())
361 return false;
363 unsigned Opc = LdSt.getOpcode();
364 OffsetIsScalable = false;
365 const MachineOperand *BaseOp, *OffsetOp;
366 int DataOpIdx;
368 if (isDS(LdSt)) {
369 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
370 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
371 if (OffsetOp) {
372 // Normal, single offset LDS instruction.
373 if (!BaseOp) {
374 // DS_CONSUME/DS_APPEND use M0 for the base address.
375 // TODO: find the implicit use operand for M0 and use that as BaseOp?
376 return false;
378 BaseOps.push_back(BaseOp);
379 Offset = OffsetOp->getImm();
380 // Get appropriate operand, and compute width accordingly.
381 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
382 if (DataOpIdx == -1)
383 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
384 Width = getOpSize(LdSt, DataOpIdx);
385 } else {
386 // The 2 offset instructions use offset0 and offset1 instead. We can treat
387 // these as a load with a single offset if the 2 offsets are consecutive.
388 // We will use this for some partially aligned loads.
389 const MachineOperand *Offset0Op =
390 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
391 const MachineOperand *Offset1Op =
392 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
394 unsigned Offset0 = Offset0Op->getImm() & 0xff;
395 unsigned Offset1 = Offset1Op->getImm() & 0xff;
396 if (Offset0 + 1 != Offset1)
397 return false;
399       // Each of these offsets is in element-sized units, so we need to convert
400       // them to bytes for the individual reads.
402 unsigned EltSize;
403 if (LdSt.mayLoad())
404 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
405 else {
406 assert(LdSt.mayStore());
407 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
408 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
411 if (isStride64(Opc))
412 EltSize *= 64;
414 BaseOps.push_back(BaseOp);
415 Offset = EltSize * Offset0;
416 // Get appropriate operand(s), and compute width accordingly.
417 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
418 if (DataOpIdx == -1) {
419 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 Width = getOpSize(LdSt, DataOpIdx);
421 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
422 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
423 } else {
424 Width = getOpSize(LdSt, DataOpIdx);
427 return true;
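// Illustrative example of the two-offset path above (values assumed, not from
// the source): a ds_read2_b32 with offset0 = 4 and offset1 = 5 has a 64-bit
// vdst, so EltSize = 64 / 16 = 4 bytes, Offset = 4 * 4 = 16 and Width = 8
// bytes; a pair with offset1 != offset0 + 1 is rejected above.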
430 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
431 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
432 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
433 return false;
434 BaseOps.push_back(RSrc);
435 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
436 if (BaseOp && !BaseOp->isFI())
437 BaseOps.push_back(BaseOp);
438 const MachineOperand *OffsetImm =
439 getNamedOperand(LdSt, AMDGPU::OpName::offset);
440 Offset = OffsetImm->getImm();
441 const MachineOperand *SOffset =
442 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
443 if (SOffset) {
444 if (SOffset->isReg())
445 BaseOps.push_back(SOffset);
446 else
447 Offset += SOffset->getImm();
449 // Get appropriate operand, and compute width accordingly.
450 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
451 if (DataOpIdx == -1)
452 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
453 if (DataOpIdx == -1) // LDS DMA
454 return false;
455 Width = getOpSize(LdSt, DataOpIdx);
456 return true;
459 if (isImage(LdSt)) {
460 auto RsrcOpName =
461 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
462 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
463 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
464 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
465 if (VAddr0Idx >= 0) {
466 // GFX10 possible NSA encoding.
467 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
468 BaseOps.push_back(&LdSt.getOperand(I));
469 } else {
470 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
472 Offset = 0;
473 // Get appropriate operand, and compute width accordingly.
474 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
475 if (DataOpIdx == -1)
476 return false; // no return sampler
477 Width = getOpSize(LdSt, DataOpIdx);
478 return true;
481 if (isSMRD(LdSt)) {
482 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
483 if (!BaseOp) // e.g. S_MEMTIME
484 return false;
485 BaseOps.push_back(BaseOp);
486 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
487 Offset = OffsetOp ? OffsetOp->getImm() : 0;
488 // Get appropriate operand, and compute width accordingly.
489 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
490 if (DataOpIdx == -1)
491 return false;
492 Width = getOpSize(LdSt, DataOpIdx);
493 return true;
496 if (isFLAT(LdSt)) {
497 // Instructions have either vaddr or saddr or both or none.
498 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
499 if (BaseOp)
500 BaseOps.push_back(BaseOp);
501 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
502 if (BaseOp)
503 BaseOps.push_back(BaseOp);
504 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
505 // Get appropriate operand, and compute width accordingly.
506 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
507 if (DataOpIdx == -1)
508 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
509 if (DataOpIdx == -1) // LDS DMA
510 return false;
511 Width = getOpSize(LdSt, DataOpIdx);
512 return true;
515 return false;
518 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
519 ArrayRef<const MachineOperand *> BaseOps1,
520 const MachineInstr &MI2,
521 ArrayRef<const MachineOperand *> BaseOps2) {
522 // Only examine the first "base" operand of each instruction, on the
523 // assumption that it represents the real base address of the memory access.
524 // Other operands are typically offsets or indices from this base address.
525 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
526 return true;
528 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
529 return false;
531 auto *MO1 = *MI1.memoperands_begin();
532 auto *MO2 = *MI2.memoperands_begin();
533 if (MO1->getAddrSpace() != MO2->getAddrSpace())
534 return false;
536 const auto *Base1 = MO1->getValue();
537 const auto *Base2 = MO2->getValue();
538 if (!Base1 || !Base2)
539 return false;
540 Base1 = getUnderlyingObject(Base1);
541 Base2 = getUnderlyingObject(Base2);
543 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
544 return false;
546 return Base1 == Base2;
549 bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
550 int64_t Offset1, bool OffsetIsScalable1,
551 ArrayRef<const MachineOperand *> BaseOps2,
552 int64_t Offset2, bool OffsetIsScalable2,
553 unsigned ClusterSize,
554 unsigned NumBytes) const {
555 // If the mem ops (to be clustered) do not have the same base ptr, then they
556 // should not be clustered
557 if (!BaseOps1.empty() && !BaseOps2.empty()) {
558 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
559 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
560 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
561 return false;
562 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
563 // If only one base op is empty, they do not have the same base ptr
564 return false;
567   // To avoid register pressure, on average the number of DWORDs loaded
568   // together by all clustered mem ops should not exceed 8. This is an
569   // empirical value based on certain observations and performance-related
570   // experiments.
571   // The nice thing about this heuristic is that it avoids clustering too many
572   // sub-word loads as well as clustering wide loads. Below is a brief summary
573   // of how the heuristic behaves for various `LoadSize`.
574 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
575 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
576 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
577 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
578 // (5) LoadSize >= 17: do not cluster
579 const unsigned LoadSize = NumBytes / ClusterSize;
580 const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
581 return NumDWORDs <= 8;
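// Worked example of the heuristic above (numbers are illustrative): clustering
// 4 loads totalling NumBytes = 16 gives LoadSize = 4 and NumDWORDs = 1 * 4 = 4,
// which is <= 8, so clustering is allowed; 4 loads of 16 bytes each give
// NumDWORDs = 4 * 4 = 16 and the cluster is rejected.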
584 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
585 // the first 16 loads will be interleaved with the stores, and the next 16 will
586 // be clustered as expected. It should really split into two batches of 16 stores.
588 // Loads are clustered until this returns false, rather than trying to schedule
589 // groups of stores. This also means we have to deal with saying different
590 // address space loads should be clustered, and ones which might cause bank
591 // conflicts.
593 // This might be deprecated so it might not be worth that much effort to fix.
594 bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
595 int64_t Offset0, int64_t Offset1,
596 unsigned NumLoads) const {
597 assert(Offset1 > Offset0 &&
598 "Second offset should be larger than first offset!");
599   // If we have at most 16 loads in a row and the offsets are within 64
600 // bytes, then schedule together.
602 // A cacheline is 64 bytes (for global memory).
603 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
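// Worked example (illustrative): with NumLoads = 4, offsets 0 and 48 satisfy
// both conditions (4 <= 16 and 48 < 64) and the loads are scheduled together,
// while offsets 0 and 128 span different cachelines and are not.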
606 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
607 MachineBasicBlock::iterator MI,
608 const DebugLoc &DL, MCRegister DestReg,
609 MCRegister SrcReg, bool KillSrc,
610 const char *Msg = "illegal VGPR to SGPR copy") {
611 MachineFunction *MF = MBB.getParent();
612 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
613 LLVMContext &C = MF->getFunction().getContext();
614 C.diagnose(IllegalCopy);
616 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
617 .addReg(SrcReg, getKillRegState(KillSrc));
620 /// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
621 /// possible to have a direct copy in these cases on GFX908, so an intermediate
622 /// VGPR copy is required.
623 static void indirectCopyToAGPR(const SIInstrInfo &TII,
624 MachineBasicBlock &MBB,
625 MachineBasicBlock::iterator MI,
626 const DebugLoc &DL, MCRegister DestReg,
627 MCRegister SrcReg, bool KillSrc,
628 RegScavenger &RS, bool RegsOverlap,
629 Register ImpDefSuperReg = Register(),
630 Register ImpUseSuperReg = Register()) {
631 assert((TII.getSubtarget().hasMAIInsts() &&
632 !TII.getSubtarget().hasGFX90AInsts()) &&
633 "Expected GFX908 subtarget.");
635 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
636 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
637 "Source register of the copy should be either an SGPR or an AGPR.");
639 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
640 "Destination register of the copy should be an AGPR.");
642 const SIRegisterInfo &RI = TII.getRegisterInfo();
644   // First try to find a defining accvgpr_write to avoid temporary registers.
645   // In the case of copies of overlapping AGPRs, we conservatively do not
646   // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
647   // an accvgpr_write used for this same copy due to implicit-defs.
648 if (!RegsOverlap) {
649 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
650 --Def;
652 if (!Def->modifiesRegister(SrcReg, &RI))
653 continue;
655 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
656 Def->getOperand(0).getReg() != SrcReg)
657 break;
659 MachineOperand &DefOp = Def->getOperand(1);
660 assert(DefOp.isReg() || DefOp.isImm());
662 if (DefOp.isReg()) {
663 bool SafeToPropagate = true;
664 // Check that register source operand is not clobbered before MI.
665 // Immediate operands are always safe to propagate.
666 for (auto I = Def; I != MI && SafeToPropagate; ++I)
667 if (I->modifiesRegister(DefOp.getReg(), &RI))
668 SafeToPropagate = false;
670 if (!SafeToPropagate)
671 break;
673 DefOp.setIsKill(false);
676 MachineInstrBuilder Builder =
677 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
678 .add(DefOp);
679 if (ImpDefSuperReg)
680 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
682 if (ImpUseSuperReg) {
683 Builder.addReg(ImpUseSuperReg,
684 getKillRegState(KillSrc) | RegState::Implicit);
687 return;
691 RS.enterBasicBlockEnd(MBB);
692 RS.backward(std::next(MI));
694 // Ideally we want to have three registers for a long reg_sequence copy
695 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
696 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
697 *MBB.getParent());
699 // Registers in the sequence are allocated contiguously so we can just
700 // use register number to pick one of three round-robin temps.
701 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
702 Register Tmp =
703 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
704 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
705 "VGPR used for an intermediate copy should have been reserved.");
707 // Only loop through if there are any free registers left. We don't want to
708 // spill.
709 while (RegNo--) {
710 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
711 /* RestoreAfter */ false, 0,
712 /* AllowSpill */ false);
713 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
714 break;
715 Tmp = Tmp2;
716 RS.setRegUsed(Tmp);
719 // Insert copy to temporary VGPR.
720 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
721 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
722 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
723 } else {
724 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
727 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
728 .addReg(SrcReg, getKillRegState(KillSrc));
729 if (ImpUseSuperReg) {
730 UseBuilder.addReg(ImpUseSuperReg,
731 getKillRegState(KillSrc) | RegState::Implicit);
734 MachineInstrBuilder DefBuilder
735 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
736 .addReg(Tmp, RegState::Kill);
738 if (ImpDefSuperReg)
739 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
742 static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
743 MachineBasicBlock::iterator MI, const DebugLoc &DL,
744 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
745 const TargetRegisterClass *RC, bool Forward) {
746 const SIRegisterInfo &RI = TII.getRegisterInfo();
747 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
748 MachineBasicBlock::iterator I = MI;
749 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
751 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
752 int16_t SubIdx = BaseIndices[Idx];
753 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
754 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
755 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
756 unsigned Opcode = AMDGPU::S_MOV_B32;
758 // Is SGPR aligned? If so try to combine with next.
759 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
760 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
761 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
762 // Can use SGPR64 copy
763 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
764 SubIdx = RI.getSubRegFromChannel(Channel, 2);
765 DestSubReg = RI.getSubReg(DestReg, SubIdx);
766 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
767 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
768 Opcode = AMDGPU::S_MOV_B64;
769 Idx++;
772 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
773 .addReg(SrcSubReg)
774 .addReg(SrcReg, RegState::Implicit);
776 if (!FirstMI)
777 FirstMI = LastMI;
779 if (!Forward)
780 I--;
783 assert(FirstMI && LastMI);
784 if (!Forward)
785 std::swap(FirstMI, LastMI);
787 FirstMI->addOperand(
788 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
790 if (KillSrc)
791 LastMI->addRegisterKilled(SrcReg, &RI);
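// Illustrative example (register choice assumed, not from the source): copying
// SGPR4_SGPR5_SGPR6_SGPR7 to SGPR8_SGPR9_SGPR10_SGPR11 finds both sub0 pieces
// even-aligned, so the loop above emits two S_MOV_B64 copies (skipping every
// other sub-index) instead of four S_MOV_B32 copies.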
794 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
795 MachineBasicBlock::iterator MI,
796 const DebugLoc &DL, MCRegister DestReg,
797 MCRegister SrcReg, bool KillSrc,
798 bool RenamableDest, bool RenamableSrc) const {
799 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
800 unsigned Size = RI.getRegSizeInBits(*RC);
801 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
802 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
804   // The rest of copyPhysReg assumes Src and Dst are the same size.
805   // TODO-GFX11_16BIT: once all true 16-bit instruction patterns are completed,
806   // can we remove Fix16BitCopies and this code block?
807 if (Fix16BitCopies) {
808 if (((Size == 16) != (SrcSize == 16))) {
809 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
810 assert(ST.hasTrue16BitInsts());
811 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
812 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
813 RegToFix = SubReg;
815 if (DestReg == SrcReg) {
816 // Identity copy. Insert empty bundle since ExpandPostRA expects an
817 // instruction here.
818 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
819 return;
821 RC = RI.getPhysRegBaseClass(DestReg);
822 Size = RI.getRegSizeInBits(*RC);
823 SrcRC = RI.getPhysRegBaseClass(SrcReg);
824 SrcSize = RI.getRegSizeInBits(*SrcRC);
828 if (RC == &AMDGPU::VGPR_32RegClass) {
829 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
830 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
831 AMDGPU::AGPR_32RegClass.contains(SrcReg));
832 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
833 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
834 BuildMI(MBB, MI, DL, get(Opc), DestReg)
835 .addReg(SrcReg, getKillRegState(KillSrc));
836 return;
839 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
840 RC == &AMDGPU::SReg_32RegClass) {
841 if (SrcReg == AMDGPU::SCC) {
842 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
843 .addImm(1)
844 .addImm(0);
845 return;
848 if (DestReg == AMDGPU::VCC_LO) {
849 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
850 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
851 .addReg(SrcReg, getKillRegState(KillSrc));
852 } else {
853 // FIXME: Hack until VReg_1 removed.
854 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
855 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
856 .addImm(0)
857 .addReg(SrcReg, getKillRegState(KillSrc));
860 return;
863 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
864 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
865 return;
868 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
869 .addReg(SrcReg, getKillRegState(KillSrc));
870 return;
873 if (RC == &AMDGPU::SReg_64RegClass) {
874 if (SrcReg == AMDGPU::SCC) {
875 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
876 .addImm(1)
877 .addImm(0);
878 return;
881 if (DestReg == AMDGPU::VCC) {
882 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 } else {
886 // FIXME: Hack until VReg_1 removed.
887 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
888 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
889 .addImm(0)
890 .addReg(SrcReg, getKillRegState(KillSrc));
893 return;
896 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
897 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
898 return;
901 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
906 if (DestReg == AMDGPU::SCC) {
907 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
908 // but SelectionDAG emits such copies for i1 sources.
909 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
910 // This copy can only be produced by patterns
911 // with explicit SCC, which are known to be enabled
912 // only for subtargets with S_CMP_LG_U64 present.
913 assert(ST.hasScalarCompareEq64());
914 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
915 .addReg(SrcReg, getKillRegState(KillSrc))
916 .addImm(0);
917 } else {
918 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
919 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
920 .addReg(SrcReg, getKillRegState(KillSrc))
921 .addImm(0);
924 return;
927 if (RC == &AMDGPU::AGPR_32RegClass) {
928 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
929 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
930 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
931 .addReg(SrcReg, getKillRegState(KillSrc));
932 return;
935 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
936 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
937 .addReg(SrcReg, getKillRegState(KillSrc));
938 return;
941     // FIXME: The pass should maintain a scavenger to avoid scanning through the
942     // block on every AGPR spill.
943 RegScavenger RS;
944 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
945 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
946 return;
949 if (Size == 16) {
950 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
951 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
952 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
954 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
955 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
956 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
957 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
958 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
959 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
960 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
961 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
963 if (IsSGPRDst) {
964 if (!IsSGPRSrc) {
965 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
966 return;
969 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
970 .addReg(NewSrcReg, getKillRegState(KillSrc));
971 return;
974 if (IsAGPRDst || IsAGPRSrc) {
975 if (!DstLow || !SrcLow) {
976 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
977 "Cannot use hi16 subreg with an AGPR!");
980 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
981 return;
984 if (ST.hasTrue16BitInsts()) {
985 if (IsSGPRSrc) {
986 assert(SrcLow);
987 SrcReg = NewSrcReg;
989 // Use the smaller instruction encoding if possible.
990 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
991 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
992 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
993 .addReg(SrcReg);
994 } else {
995 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
996 .addImm(0) // src0_modifiers
997 .addReg(SrcReg)
998 .addImm(0); // op_sel
1000 return;
1003 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1004 if (!DstLow || !SrcLow) {
1005 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1006 "Cannot use hi16 subreg on VI!");
1009 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1010 .addReg(NewSrcReg, getKillRegState(KillSrc));
1011 return;
1014 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1015 .addImm(0) // src0_modifiers
1016 .addReg(NewSrcReg)
1017 .addImm(0) // clamp
1018 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1019 : AMDGPU::SDWA::SdwaSel::WORD_1)
1020 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1021 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1022 : AMDGPU::SDWA::SdwaSel::WORD_1)
1023 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1024 // First implicit operand is $exec.
1025 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1026 return;
1029 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1030 if (ST.hasMovB64()) {
1031 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1032 .addReg(SrcReg, getKillRegState(KillSrc));
1033 return;
1035 if (ST.hasPkMovB32()) {
1036 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1037 .addImm(SISrcMods::OP_SEL_1)
1038 .addReg(SrcReg)
1039 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1040 .addReg(SrcReg)
1041 .addImm(0) // op_sel_lo
1042 .addImm(0) // op_sel_hi
1043 .addImm(0) // neg_lo
1044 .addImm(0) // neg_hi
1045 .addImm(0) // clamp
1046 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1047 return;
1051 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1052 if (RI.isSGPRClass(RC)) {
1053 if (!RI.isSGPRClass(SrcRC)) {
1054 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1055 return;
1057 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1058 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1059 Forward);
1060 return;
1063 unsigned EltSize = 4;
1064 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1065 if (RI.isAGPRClass(RC)) {
1066 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1067 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1068 else if (RI.hasVGPRs(SrcRC) ||
1069 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1070 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1071 else
1072 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1073 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1074 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1075 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1076 (RI.isProperlyAlignedRC(*RC) &&
1077 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1078     // TODO: In the 96-bit case, we could do a 64-bit mov and then a 32-bit mov.
1079 if (ST.hasMovB64()) {
1080 Opcode = AMDGPU::V_MOV_B64_e32;
1081 EltSize = 8;
1082 } else if (ST.hasPkMovB32()) {
1083 Opcode = AMDGPU::V_PK_MOV_B32;
1084 EltSize = 8;
1088 // For the cases where we need an intermediate instruction/temporary register
1089 // (destination is an AGPR), we need a scavenger.
1091 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1092 // whole block for every handled copy.
1093 std::unique_ptr<RegScavenger> RS;
1094 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1095 RS = std::make_unique<RegScavenger>();
1097 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1099 // If there is an overlap, we can't kill the super-register on the last
1100 // instruction, since it will also kill the components made live by this def.
1101 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1102 const bool CanKillSuperReg = KillSrc && !Overlap;
1104 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1105 unsigned SubIdx;
1106 if (Forward)
1107 SubIdx = SubIndices[Idx];
1108 else
1109 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1110 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1111 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1112 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1114 bool IsFirstSubreg = Idx == 0;
1115 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1117 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1118 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1119 Register ImpUseSuper = SrcReg;
1120 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1121 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1122 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1123 MachineInstrBuilder MIB =
1124 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1125 .addImm(SISrcMods::OP_SEL_1)
1126 .addReg(SrcSubReg)
1127 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1128 .addReg(SrcSubReg)
1129 .addImm(0) // op_sel_lo
1130 .addImm(0) // op_sel_hi
1131 .addImm(0) // neg_lo
1132 .addImm(0) // neg_hi
1133 .addImm(0) // clamp
1134 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1135 if (IsFirstSubreg)
1136 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1137 } else {
1138 MachineInstrBuilder Builder =
1139 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1140 if (IsFirstSubreg)
1141 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1143 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1148 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1149 int NewOpc;
1151 // Try to map original to commuted opcode
1152 NewOpc = AMDGPU::getCommuteRev(Opcode);
1153 if (NewOpc != -1)
1154 // Check if the commuted (REV) opcode exists on the target.
1155 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1157 // Try to map commuted to original opcode
1158 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1159 if (NewOpc != -1)
1160 // Check if the original (non-REV) opcode exists on the target.
1161 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1163 return Opcode;
1166 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1167 MachineBasicBlock::iterator MI,
1168 const DebugLoc &DL, Register DestReg,
1169 int64_t Value) const {
1170 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1171 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1172 if (RegClass == &AMDGPU::SReg_32RegClass ||
1173 RegClass == &AMDGPU::SGPR_32RegClass ||
1174 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1175 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1176 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1177 .addImm(Value);
1178 return;
1181 if (RegClass == &AMDGPU::SReg_64RegClass ||
1182 RegClass == &AMDGPU::SGPR_64RegClass ||
1183 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1184 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1185 .addImm(Value);
1186 return;
1189 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1190 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1191 .addImm(Value);
1192 return;
1194 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1195 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1196 .addImm(Value);
1197 return;
1200 unsigned EltSize = 4;
1201 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1202 if (RI.isSGPRClass(RegClass)) {
1203 if (RI.getRegSizeInBits(*RegClass) > 32) {
1204 Opcode = AMDGPU::S_MOV_B64;
1205 EltSize = 8;
1206 } else {
1207 Opcode = AMDGPU::S_MOV_B32;
1208 EltSize = 4;
1212 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1213 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1214 int64_t IdxValue = Idx == 0 ? Value : 0;
1216 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1217 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1218 Builder.addImm(IdxValue);
1222 const TargetRegisterClass *
1223 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1224 return &AMDGPU::VGPR_32RegClass;
1227 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1228 MachineBasicBlock::iterator I,
1229 const DebugLoc &DL, Register DstReg,
1230 ArrayRef<MachineOperand> Cond,
1231 Register TrueReg,
1232 Register FalseReg) const {
1233 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1234 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1235 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1236 "Not a VGPR32 reg");
1238 if (Cond.size() == 1) {
1239 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1240 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1241 .add(Cond[0]);
1242 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1243 .addImm(0)
1244 .addReg(FalseReg)
1245 .addImm(0)
1246 .addReg(TrueReg)
1247 .addReg(SReg);
1248 } else if (Cond.size() == 2) {
1249 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1250 switch (Cond[0].getImm()) {
1251 case SIInstrInfo::SCC_TRUE: {
1252 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1253 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1254 : AMDGPU::S_CSELECT_B64), SReg)
1255 .addImm(1)
1256 .addImm(0);
1257 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1258 .addImm(0)
1259 .addReg(FalseReg)
1260 .addImm(0)
1261 .addReg(TrueReg)
1262 .addReg(SReg);
1263 break;
1265 case SIInstrInfo::SCC_FALSE: {
1266 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1267 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1268 : AMDGPU::S_CSELECT_B64), SReg)
1269 .addImm(0)
1270 .addImm(1);
1271 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1272 .addImm(0)
1273 .addReg(FalseReg)
1274 .addImm(0)
1275 .addReg(TrueReg)
1276 .addReg(SReg);
1277 break;
1279 case SIInstrInfo::VCCNZ: {
1280 MachineOperand RegOp = Cond[1];
1281 RegOp.setImplicit(false);
1282 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1283 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1284 .add(RegOp);
1285 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1286 .addImm(0)
1287 .addReg(FalseReg)
1288 .addImm(0)
1289 .addReg(TrueReg)
1290 .addReg(SReg);
1291 break;
1293 case SIInstrInfo::VCCZ: {
1294 MachineOperand RegOp = Cond[1];
1295 RegOp.setImplicit(false);
1296 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1297 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1298 .add(RegOp);
1299 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1300 .addImm(0)
1301 .addReg(TrueReg)
1302 .addImm(0)
1303 .addReg(FalseReg)
1304 .addReg(SReg);
1305 break;
1307 case SIInstrInfo::EXECNZ: {
1308 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1309 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1310 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1311 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1312 .addImm(0);
1313 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1314 : AMDGPU::S_CSELECT_B64), SReg)
1315 .addImm(1)
1316 .addImm(0);
1317 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1318 .addImm(0)
1319 .addReg(FalseReg)
1320 .addImm(0)
1321 .addReg(TrueReg)
1322 .addReg(SReg);
1323 break;
1325 case SIInstrInfo::EXECZ: {
1326 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1327 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1328 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1329 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1330 .addImm(0);
1331 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1332 : AMDGPU::S_CSELECT_B64), SReg)
1333 .addImm(0)
1334 .addImm(1);
1335 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1336 .addImm(0)
1337 .addReg(FalseReg)
1338 .addImm(0)
1339 .addReg(TrueReg)
1340 .addReg(SReg);
1341 llvm_unreachable("Unhandled branch predicate EXECZ");
1342 break;
1344 default:
1345 llvm_unreachable("invalid branch predicate");
1347 } else {
1348 llvm_unreachable("Can only handle Cond size 1 or 2");
1352 Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1353 MachineBasicBlock::iterator I,
1354 const DebugLoc &DL,
1355 Register SrcReg, int Value) const {
1356 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1357 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1358 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1359 .addImm(Value)
1360 .addReg(SrcReg);
1362 return Reg;
1365 Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1366 MachineBasicBlock::iterator I,
1367 const DebugLoc &DL,
1368 Register SrcReg, int Value) const {
1369 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1370 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1371 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1372 .addImm(Value)
1373 .addReg(SrcReg);
1375 return Reg;
1378 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1380 if (RI.isAGPRClass(DstRC))
1381 return AMDGPU::COPY;
1382 if (RI.getRegSizeInBits(*DstRC) == 16) {
1383 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1384 // before RA.
1385 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1387 if (RI.getRegSizeInBits(*DstRC) == 32)
1388 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1389 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1390 return AMDGPU::S_MOV_B64;
1391 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1392 return AMDGPU::V_MOV_B64_PSEUDO;
1393 return AMDGPU::COPY;
1396 const MCInstrDesc &
1397 SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1398 bool IsIndirectSrc) const {
1399 if (IsIndirectSrc) {
1400 if (VecSize <= 32) // 4 bytes
1401 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1402 if (VecSize <= 64) // 8 bytes
1403 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1404 if (VecSize <= 96) // 12 bytes
1405 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1406 if (VecSize <= 128) // 16 bytes
1407 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1408 if (VecSize <= 160) // 20 bytes
1409 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1410 if (VecSize <= 256) // 32 bytes
1411 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1412 if (VecSize <= 288) // 36 bytes
1413 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1414 if (VecSize <= 320) // 40 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1416 if (VecSize <= 352) // 44 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1418 if (VecSize <= 384) // 48 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1420 if (VecSize <= 512) // 64 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1422 if (VecSize <= 1024) // 128 bytes
1423 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1425 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1428 if (VecSize <= 32) // 4 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1430 if (VecSize <= 64) // 8 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1432 if (VecSize <= 96) // 12 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1434 if (VecSize <= 128) // 16 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1436 if (VecSize <= 160) // 20 bytes
1437 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1438 if (VecSize <= 256) // 32 bytes
1439 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1440 if (VecSize <= 288) // 36 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1442 if (VecSize <= 320) // 40 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1444 if (VecSize <= 352) // 44 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1446 if (VecSize <= 384) // 48 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1448 if (VecSize <= 512) // 64 bytes
1449 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1450 if (VecSize <= 1024) // 128 bytes
1451 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1453 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1456 static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1457 if (VecSize <= 32) // 4 bytes
1458 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1459 if (VecSize <= 64) // 8 bytes
1460 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1461 if (VecSize <= 96) // 12 bytes
1462 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1463 if (VecSize <= 128) // 16 bytes
1464 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1465 if (VecSize <= 160) // 20 bytes
1466 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1467 if (VecSize <= 256) // 32 bytes
1468 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1469 if (VecSize <= 288) // 36 bytes
1470 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1471 if (VecSize <= 320) // 40 bytes
1472 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1473 if (VecSize <= 352) // 44 bytes
1474 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1475 if (VecSize <= 384) // 48 bytes
1476 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1477 if (VecSize <= 512) // 64 bytes
1478 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1479 if (VecSize <= 1024) // 128 bytes
1480 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1482 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1485 static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1486 if (VecSize <= 32) // 4 bytes
1487 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1488 if (VecSize <= 64) // 8 bytes
1489 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1490 if (VecSize <= 96) // 12 bytes
1491 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1492 if (VecSize <= 128) // 16 bytes
1493 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1494 if (VecSize <= 160) // 20 bytes
1495 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1496 if (VecSize <= 256) // 32 bytes
1497 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1498 if (VecSize <= 288) // 36 bytes
1499 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1500 if (VecSize <= 320) // 40 bytes
1501 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1502 if (VecSize <= 352) // 44 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1504 if (VecSize <= 384) // 48 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1506 if (VecSize <= 512) // 64 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1508 if (VecSize <= 1024) // 128 bytes
1509 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1511 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1514 static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1515 if (VecSize <= 64) // 8 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1517 if (VecSize <= 128) // 16 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1519 if (VecSize <= 256) // 32 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1521 if (VecSize <= 512) // 64 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1523 if (VecSize <= 1024) // 128 bytes
1524 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1526 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1529 const MCInstrDesc &
1530 SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1531 bool IsSGPR) const {
1532 if (IsSGPR) {
1533 switch (EltSize) {
1534 case 32:
1535 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1536 case 64:
1537 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1538 default:
1539 llvm_unreachable("invalid reg indexing elt size");
1543 assert(EltSize == 32 && "invalid reg indexing elt size");
1544 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1547 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1548 switch (Size) {
1549 case 4:
1550 return AMDGPU::SI_SPILL_S32_SAVE;
1551 case 8:
1552 return AMDGPU::SI_SPILL_S64_SAVE;
1553 case 12:
1554 return AMDGPU::SI_SPILL_S96_SAVE;
1555 case 16:
1556 return AMDGPU::SI_SPILL_S128_SAVE;
1557 case 20:
1558 return AMDGPU::SI_SPILL_S160_SAVE;
1559 case 24:
1560 return AMDGPU::SI_SPILL_S192_SAVE;
1561 case 28:
1562 return AMDGPU::SI_SPILL_S224_SAVE;
1563 case 32:
1564 return AMDGPU::SI_SPILL_S256_SAVE;
1565 case 36:
1566 return AMDGPU::SI_SPILL_S288_SAVE;
1567 case 40:
1568 return AMDGPU::SI_SPILL_S320_SAVE;
1569 case 44:
1570 return AMDGPU::SI_SPILL_S352_SAVE;
1571 case 48:
1572 return AMDGPU::SI_SPILL_S384_SAVE;
1573 case 64:
1574 return AMDGPU::SI_SPILL_S512_SAVE;
1575 case 128:
1576 return AMDGPU::SI_SPILL_S1024_SAVE;
1577 default:
1578 llvm_unreachable("unknown register size");
1582 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1583 switch (Size) {
1584 case 4:
1585 return AMDGPU::SI_SPILL_V32_SAVE;
1586 case 8:
1587 return AMDGPU::SI_SPILL_V64_SAVE;
1588 case 12:
1589 return AMDGPU::SI_SPILL_V96_SAVE;
1590 case 16:
1591 return AMDGPU::SI_SPILL_V128_SAVE;
1592 case 20:
1593 return AMDGPU::SI_SPILL_V160_SAVE;
1594 case 24:
1595 return AMDGPU::SI_SPILL_V192_SAVE;
1596 case 28:
1597 return AMDGPU::SI_SPILL_V224_SAVE;
1598 case 32:
1599 return AMDGPU::SI_SPILL_V256_SAVE;
1600 case 36:
1601 return AMDGPU::SI_SPILL_V288_SAVE;
1602 case 40:
1603 return AMDGPU::SI_SPILL_V320_SAVE;
1604 case 44:
1605 return AMDGPU::SI_SPILL_V352_SAVE;
1606 case 48:
1607 return AMDGPU::SI_SPILL_V384_SAVE;
1608 case 64:
1609 return AMDGPU::SI_SPILL_V512_SAVE;
1610 case 128:
1611 return AMDGPU::SI_SPILL_V1024_SAVE;
1612 default:
1613 llvm_unreachable("unknown register size");
1617 static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1618 switch (Size) {
1619 case 4:
1620 return AMDGPU::SI_SPILL_A32_SAVE;
1621 case 8:
1622 return AMDGPU::SI_SPILL_A64_SAVE;
1623 case 12:
1624 return AMDGPU::SI_SPILL_A96_SAVE;
1625 case 16:
1626 return AMDGPU::SI_SPILL_A128_SAVE;
1627 case 20:
1628 return AMDGPU::SI_SPILL_A160_SAVE;
1629 case 24:
1630 return AMDGPU::SI_SPILL_A192_SAVE;
1631 case 28:
1632 return AMDGPU::SI_SPILL_A224_SAVE;
1633 case 32:
1634 return AMDGPU::SI_SPILL_A256_SAVE;
1635 case 36:
1636 return AMDGPU::SI_SPILL_A288_SAVE;
1637 case 40:
1638 return AMDGPU::SI_SPILL_A320_SAVE;
1639 case 44:
1640 return AMDGPU::SI_SPILL_A352_SAVE;
1641 case 48:
1642 return AMDGPU::SI_SPILL_A384_SAVE;
1643 case 64:
1644 return AMDGPU::SI_SPILL_A512_SAVE;
1645 case 128:
1646 return AMDGPU::SI_SPILL_A1024_SAVE;
1647 default:
1648 llvm_unreachable("unknown register size");
1652 static unsigned getAVSpillSaveOpcode(unsigned Size) {
1653 switch (Size) {
1654 case 4:
1655 return AMDGPU::SI_SPILL_AV32_SAVE;
1656 case 8:
1657 return AMDGPU::SI_SPILL_AV64_SAVE;
1658 case 12:
1659 return AMDGPU::SI_SPILL_AV96_SAVE;
1660 case 16:
1661 return AMDGPU::SI_SPILL_AV128_SAVE;
1662 case 20:
1663 return AMDGPU::SI_SPILL_AV160_SAVE;
1664 case 24:
1665 return AMDGPU::SI_SPILL_AV192_SAVE;
1666 case 28:
1667 return AMDGPU::SI_SPILL_AV224_SAVE;
1668 case 32:
1669 return AMDGPU::SI_SPILL_AV256_SAVE;
1670 case 36:
1671 return AMDGPU::SI_SPILL_AV288_SAVE;
1672 case 40:
1673 return AMDGPU::SI_SPILL_AV320_SAVE;
1674 case 44:
1675 return AMDGPU::SI_SPILL_AV352_SAVE;
1676 case 48:
1677 return AMDGPU::SI_SPILL_AV384_SAVE;
1678 case 64:
1679 return AMDGPU::SI_SPILL_AV512_SAVE;
1680 case 128:
1681 return AMDGPU::SI_SPILL_AV1024_SAVE;
1682 default:
1683 llvm_unreachable("unknown register size");
1687 static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1688 bool IsVectorSuperClass) {
1689 // Currently, only 32-bit WWM register spills are needed.
1690 if (Size != 4)
1691 llvm_unreachable("unknown wwm register spill size");
1693 if (IsVectorSuperClass)
1694 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1696 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1697 }
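// Summary (informal): selects the spill-save pseudo for a vector register.
// WWM-flagged registers take the WWM variants, registers in the VGPR/AGPR
// superclass take the AV variants, and everything else falls back to the
// plain AGPR or VGPR opcode keyed on the spill size in bytes.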
1699 static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1700 const TargetRegisterClass *RC,
1701 unsigned Size,
1702 const SIRegisterInfo &TRI,
1703 const SIMachineFunctionInfo &MFI) {
1704 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1706 // Choose the right opcode if spilling a WWM register.
1707 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1708 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1710 if (IsVectorSuperClass)
1711 return getAVSpillSaveOpcode(Size);
1713 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1714 : getVGPRSpillSaveOpcode(Size);
1717 void SIInstrInfo::storeRegToStackSlot(
1718 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1719 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1720 const TargetRegisterInfo *TRI, Register VReg) const {
1721 MachineFunction *MF = MBB.getParent();
1722 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1723 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1724 const DebugLoc &DL = MBB.findDebugLoc(MI);
1726 MachinePointerInfo PtrInfo
1727 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1728 MachineMemOperand *MMO = MF->getMachineMemOperand(
1729 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1730 FrameInfo.getObjectAlign(FrameIndex));
1731 unsigned SpillSize = TRI->getSpillSize(*RC);
1733 MachineRegisterInfo &MRI = MF->getRegInfo();
1734 if (RI.isSGPRClass(RC)) {
1735 MFI->setHasSpilledSGPRs();
1736 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1737 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1738 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1740 // We are only allowed to create one new instruction when spilling
1741 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1742 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1744 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1745 // need to make sure we are using the correct register class.
1746 if (SrcReg.isVirtual() && SpillSize == 4) {
1747 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1750 BuildMI(MBB, MI, DL, OpDesc)
1751 .addReg(SrcReg, getKillRegState(isKill)) // data
1752 .addFrameIndex(FrameIndex) // addr
1753 .addMemOperand(MMO)
1754 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1756 if (RI.spillSGPRToVGPR())
1757 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1758 return;
1761 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1762 SpillSize, RI, *MFI);
1763 MFI->setHasSpilledVGPRs();
1765 BuildMI(MBB, MI, DL, get(Opcode))
1766 .addReg(SrcReg, getKillRegState(isKill)) // data
1767 .addFrameIndex(FrameIndex) // addr
1768 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1769 .addImm(0) // offset
1770 .addMemOperand(MMO);
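// For illustration (a rough sketch of the emitted MIR; operand names are
// abbreviated and not authoritative):
//   SI_SPILL_S64_SAVE killed $sgpr30_sgpr31, %stack.0, implicit $sgpr32
//   SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0
// The SGPR pseudo is later lowered to V_WRITELANE_B32 into a lane of a
// reserved VGPR when spillSGPRToVGPR() is enabled; the vector pseudos become
// scratch stores during frame index elimination.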
1773 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1774 switch (Size) {
1775 case 4:
1776 return AMDGPU::SI_SPILL_S32_RESTORE;
1777 case 8:
1778 return AMDGPU::SI_SPILL_S64_RESTORE;
1779 case 12:
1780 return AMDGPU::SI_SPILL_S96_RESTORE;
1781 case 16:
1782 return AMDGPU::SI_SPILL_S128_RESTORE;
1783 case 20:
1784 return AMDGPU::SI_SPILL_S160_RESTORE;
1785 case 24:
1786 return AMDGPU::SI_SPILL_S192_RESTORE;
1787 case 28:
1788 return AMDGPU::SI_SPILL_S224_RESTORE;
1789 case 32:
1790 return AMDGPU::SI_SPILL_S256_RESTORE;
1791 case 36:
1792 return AMDGPU::SI_SPILL_S288_RESTORE;
1793 case 40:
1794 return AMDGPU::SI_SPILL_S320_RESTORE;
1795 case 44:
1796 return AMDGPU::SI_SPILL_S352_RESTORE;
1797 case 48:
1798 return AMDGPU::SI_SPILL_S384_RESTORE;
1799 case 64:
1800 return AMDGPU::SI_SPILL_S512_RESTORE;
1801 case 128:
1802 return AMDGPU::SI_SPILL_S1024_RESTORE;
1803 default:
1804 llvm_unreachable("unknown register size");
1808 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1809 switch (Size) {
1810 case 4:
1811 return AMDGPU::SI_SPILL_V32_RESTORE;
1812 case 8:
1813 return AMDGPU::SI_SPILL_V64_RESTORE;
1814 case 12:
1815 return AMDGPU::SI_SPILL_V96_RESTORE;
1816 case 16:
1817 return AMDGPU::SI_SPILL_V128_RESTORE;
1818 case 20:
1819 return AMDGPU::SI_SPILL_V160_RESTORE;
1820 case 24:
1821 return AMDGPU::SI_SPILL_V192_RESTORE;
1822 case 28:
1823 return AMDGPU::SI_SPILL_V224_RESTORE;
1824 case 32:
1825 return AMDGPU::SI_SPILL_V256_RESTORE;
1826 case 36:
1827 return AMDGPU::SI_SPILL_V288_RESTORE;
1828 case 40:
1829 return AMDGPU::SI_SPILL_V320_RESTORE;
1830 case 44:
1831 return AMDGPU::SI_SPILL_V352_RESTORE;
1832 case 48:
1833 return AMDGPU::SI_SPILL_V384_RESTORE;
1834 case 64:
1835 return AMDGPU::SI_SPILL_V512_RESTORE;
1836 case 128:
1837 return AMDGPU::SI_SPILL_V1024_RESTORE;
1838 default:
1839 llvm_unreachable("unknown register size");
1843 static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1844 switch (Size) {
1845 case 4:
1846 return AMDGPU::SI_SPILL_A32_RESTORE;
1847 case 8:
1848 return AMDGPU::SI_SPILL_A64_RESTORE;
1849 case 12:
1850 return AMDGPU::SI_SPILL_A96_RESTORE;
1851 case 16:
1852 return AMDGPU::SI_SPILL_A128_RESTORE;
1853 case 20:
1854 return AMDGPU::SI_SPILL_A160_RESTORE;
1855 case 24:
1856 return AMDGPU::SI_SPILL_A192_RESTORE;
1857 case 28:
1858 return AMDGPU::SI_SPILL_A224_RESTORE;
1859 case 32:
1860 return AMDGPU::SI_SPILL_A256_RESTORE;
1861 case 36:
1862 return AMDGPU::SI_SPILL_A288_RESTORE;
1863 case 40:
1864 return AMDGPU::SI_SPILL_A320_RESTORE;
1865 case 44:
1866 return AMDGPU::SI_SPILL_A352_RESTORE;
1867 case 48:
1868 return AMDGPU::SI_SPILL_A384_RESTORE;
1869 case 64:
1870 return AMDGPU::SI_SPILL_A512_RESTORE;
1871 case 128:
1872 return AMDGPU::SI_SPILL_A1024_RESTORE;
1873 default:
1874 llvm_unreachable("unknown register size");
1878 static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1879 switch (Size) {
1880 case 4:
1881 return AMDGPU::SI_SPILL_AV32_RESTORE;
1882 case 8:
1883 return AMDGPU::SI_SPILL_AV64_RESTORE;
1884 case 12:
1885 return AMDGPU::SI_SPILL_AV96_RESTORE;
1886 case 16:
1887 return AMDGPU::SI_SPILL_AV128_RESTORE;
1888 case 20:
1889 return AMDGPU::SI_SPILL_AV160_RESTORE;
1890 case 24:
1891 return AMDGPU::SI_SPILL_AV192_RESTORE;
1892 case 28:
1893 return AMDGPU::SI_SPILL_AV224_RESTORE;
1894 case 32:
1895 return AMDGPU::SI_SPILL_AV256_RESTORE;
1896 case 36:
1897 return AMDGPU::SI_SPILL_AV288_RESTORE;
1898 case 40:
1899 return AMDGPU::SI_SPILL_AV320_RESTORE;
1900 case 44:
1901 return AMDGPU::SI_SPILL_AV352_RESTORE;
1902 case 48:
1903 return AMDGPU::SI_SPILL_AV384_RESTORE;
1904 case 64:
1905 return AMDGPU::SI_SPILL_AV512_RESTORE;
1906 case 128:
1907 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1908 default:
1909 llvm_unreachable("unknown register size");
1913 static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1914 bool IsVectorSuperClass) {
1915 // Currently, only 32-bit WWM register spills are needed.
1916 if (Size != 4)
1917 llvm_unreachable("unknown wwm register spill size");
1919 if (IsVectorSuperClass)
1920 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1922 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1925 static unsigned
1926 getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1927 unsigned Size, const SIRegisterInfo &TRI,
1928 const SIMachineFunctionInfo &MFI) {
1929 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1931 // Choose the right opcode if restoring a WWM register.
1932 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1933 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1935 if (IsVectorSuperClass)
1936 return getAVSpillRestoreOpcode(Size);
1938 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1939 : getVGPRSpillRestoreOpcode(Size);
1942 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1943 MachineBasicBlock::iterator MI,
1944 Register DestReg, int FrameIndex,
1945 const TargetRegisterClass *RC,
1946 const TargetRegisterInfo *TRI,
1947 Register VReg) const {
1948 MachineFunction *MF = MBB.getParent();
1949 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1950 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1951 const DebugLoc &DL = MBB.findDebugLoc(MI);
1952 unsigned SpillSize = TRI->getSpillSize(*RC);
1954 MachinePointerInfo PtrInfo
1955 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1957 MachineMemOperand *MMO = MF->getMachineMemOperand(
1958 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1959 FrameInfo.getObjectAlign(FrameIndex));
1961 if (RI.isSGPRClass(RC)) {
1962 MFI->setHasSpilledSGPRs();
1963 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1964 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1965 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1967 // FIXME: Maybe this should not include a memoperand because it will be
1968 // lowered to non-memory instructions.
1969 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1970 if (DestReg.isVirtual() && SpillSize == 4) {
1971 MachineRegisterInfo &MRI = MF->getRegInfo();
1972 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1975 if (RI.spillSGPRToVGPR())
1976 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1977 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1978 .addFrameIndex(FrameIndex) // addr
1979 .addMemOperand(MMO)
1980 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1982 return;
1985 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1986 SpillSize, RI, *MFI);
1987 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1988 .addFrameIndex(FrameIndex) // vaddr
1989 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1990 .addImm(0) // offset
1991 .addMemOperand(MMO);
1994 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1995 MachineBasicBlock::iterator MI) const {
1996 insertNoops(MBB, MI, 1);
1999 void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2000 MachineBasicBlock::iterator MI,
2001 unsigned Quantity) const {
2002 DebugLoc DL = MBB.findDebugLoc(MI);
2003 while (Quantity > 0) {
2004 unsigned Arg = std::min(Quantity, 8u);
2005 Quantity -= Arg;
2006 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
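// Example (informal): Quantity == 10 emits "s_nop 7" followed by "s_nop 1",
// since the S_NOP immediate encodes (wait states - 1) and a single S_NOP can
// cover at most 8 wait states.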
2010 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
2011 auto *MF = MBB.getParent();
2012 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2014 assert(Info->isEntryFunction());
2016 if (MBB.succ_empty()) {
2017 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2018 if (HasNoTerminator) {
2019 if (Info->returnsVoid()) {
2020 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2021 } else {
2022 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2023 }
2024 }
2025 }
2026 }
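// Overview (informal sketch of the control flow built below): the current
// block conditionally branches to a trap block, which raises s_trap, reports
// the queue doorbell with the wave-abort bit set via s_sendmsg, and then
// branches into a self-looping halt block (s_sethalt 5). When EXEC is zero
// the trap is skipped and execution continues in ContBB, which is returned
// to the caller.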
2028 MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
2029 MachineBasicBlock &MBB,
2030 MachineInstr &MI,
2031 const DebugLoc &DL) const {
2032 MachineFunction *MF = MBB.getParent();
2033 constexpr unsigned DoorbellIDMask = 0x3ff;
2034 constexpr unsigned ECQueueWaveAbort = 0x400;
2036 MachineBasicBlock *TrapBB = &MBB;
2037 MachineBasicBlock *ContBB = &MBB;
2038 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2040 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2041 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2042 TrapBB = MF->CreateMachineBasicBlock();
2043 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2044 MF->push_back(TrapBB);
2045 MBB.addSuccessor(TrapBB);
2048 // Start with an `s_trap 2`; if we're in PRIV=1 and need the workaround, this
2049 // will be a no-op.
2050 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2051 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2052 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2053 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2054 DoorbellReg)
2055 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2056 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2057 .addUse(AMDGPU::M0);
2058 Register DoorbellRegMasked =
2059 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2060 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2061 .addUse(DoorbellReg)
2062 .addImm(DoorbellIDMask);
2063 Register SetWaveAbortBit =
2064 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2065 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2066 .addUse(DoorbellRegMasked)
2067 .addImm(ECQueueWaveAbort);
2068 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2069 .addUse(SetWaveAbortBit);
2070 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2071 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2072 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2073 .addUse(AMDGPU::TTMP2);
2074 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2075 TrapBB->addSuccessor(HaltLoopBB);
2077 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2078 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2079 .addMBB(HaltLoopBB);
2080 MF->push_back(HaltLoopBB);
2081 HaltLoopBB->addSuccessor(HaltLoopBB);
2083 return ContBB;
2086 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2087 switch (MI.getOpcode()) {
2088 default:
2089 if (MI.isMetaInstruction())
2090 return 0;
2091 return 1; // FIXME: Do wait states equal cycles?
2093 case AMDGPU::S_NOP:
2094 return MI.getOperand(0).getImm() + 1;
2095 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2096 // hazard, even if one exists, won't really be visible. Should we handle it?
2100 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2101 MachineBasicBlock &MBB = *MI.getParent();
2102 DebugLoc DL = MBB.findDebugLoc(MI);
2103 switch (MI.getOpcode()) {
2104 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2105 case AMDGPU::S_MOV_B64_term:
2106 // This is only a terminator to get the correct spill code placement during
2107 // register allocation.
2108 MI.setDesc(get(AMDGPU::S_MOV_B64));
2109 break;
2111 case AMDGPU::S_MOV_B32_term:
2112 // This is only a terminator to get the correct spill code placement during
2113 // register allocation.
2114 MI.setDesc(get(AMDGPU::S_MOV_B32));
2115 break;
2117 case AMDGPU::S_XOR_B64_term:
2118 // This is only a terminator to get the correct spill code placement during
2119 // register allocation.
2120 MI.setDesc(get(AMDGPU::S_XOR_B64));
2121 break;
2123 case AMDGPU::S_XOR_B32_term:
2124 // This is only a terminator to get the correct spill code placement during
2125 // register allocation.
2126 MI.setDesc(get(AMDGPU::S_XOR_B32));
2127 break;
2128 case AMDGPU::S_OR_B64_term:
2129 // This is only a terminator to get the correct spill code placement during
2130 // register allocation.
2131 MI.setDesc(get(AMDGPU::S_OR_B64));
2132 break;
2133 case AMDGPU::S_OR_B32_term:
2134 // This is only a terminator to get the correct spill code placement during
2135 // register allocation.
2136 MI.setDesc(get(AMDGPU::S_OR_B32));
2137 break;
2139 case AMDGPU::S_ANDN2_B64_term:
2140 // This is only a terminator to get the correct spill code placement during
2141 // register allocation.
2142 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2143 break;
2145 case AMDGPU::S_ANDN2_B32_term:
2146 // This is only a terminator to get the correct spill code placement during
2147 // register allocation.
2148 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2149 break;
2151 case AMDGPU::S_AND_B64_term:
2152 // This is only a terminator to get the correct spill code placement during
2153 // register allocation.
2154 MI.setDesc(get(AMDGPU::S_AND_B64));
2155 break;
2157 case AMDGPU::S_AND_B32_term:
2158 // This is only a terminator to get the correct spill code placement during
2159 // register allocation.
2160 MI.setDesc(get(AMDGPU::S_AND_B32));
2161 break;
2163 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2164 // This is only a terminator to get the correct spill code placement during
2165 // register allocation.
2166 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2167 break;
2169 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2170 // This is only a terminator to get the correct spill code placement during
2171 // register allocation.
2172 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2173 break;
2175 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2176 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2177 break;
2179 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2180 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2181 break;
2183 case AMDGPU::V_MOV_B64_PSEUDO: {
2184 Register Dst = MI.getOperand(0).getReg();
2185 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2186 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2188 const MachineOperand &SrcOp = MI.getOperand(1);
2189 // FIXME: Will this work for 64-bit floating point immediates?
2190 assert(!SrcOp.isFPImm());
2191 if (ST.hasMovB64()) {
2192 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2193 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2194 isUInt<32>(SrcOp.getImm()))
2195 break;
2197 if (SrcOp.isImm()) {
2198 APInt Imm(64, SrcOp.getImm());
2199 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2200 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2201 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2202 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2203 .addImm(SISrcMods::OP_SEL_1)
2204 .addImm(Lo.getSExtValue())
2205 .addImm(SISrcMods::OP_SEL_1)
2206 .addImm(Lo.getSExtValue())
2207 .addImm(0) // op_sel_lo
2208 .addImm(0) // op_sel_hi
2209 .addImm(0) // neg_lo
2210 .addImm(0) // neg_hi
2211 .addImm(0); // clamp
2212 } else {
2213 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2214 .addImm(Lo.getSExtValue())
2215 .addReg(Dst, RegState::Implicit | RegState::Define);
2216 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2217 .addImm(Hi.getSExtValue())
2218 .addReg(Dst, RegState::Implicit | RegState::Define);
2220 } else {
2221 assert(SrcOp.isReg());
2222 if (ST.hasPkMovB32() &&
2223 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2224 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2225 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2226 .addReg(SrcOp.getReg())
2227 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
2228 .addReg(SrcOp.getReg())
2229 .addImm(0) // op_sel_lo
2230 .addImm(0) // op_sel_hi
2231 .addImm(0) // neg_lo
2232 .addImm(0) // neg_hi
2233 .addImm(0); // clamp
2234 } else {
2235 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2236 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2237 .addReg(Dst, RegState::Implicit | RegState::Define);
2238 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2239 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2240 .addReg(Dst, RegState::Implicit | RegState::Define);
2243 MI.eraseFromParent();
2244 break;
2246 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2247 expandMovDPP64(MI);
2248 break;
2250 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2251 const MachineOperand &SrcOp = MI.getOperand(1);
2252 assert(!SrcOp.isFPImm());
2253 APInt Imm(64, SrcOp.getImm());
2254 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2255 MI.setDesc(get(AMDGPU::S_MOV_B64));
2256 break;
2259 Register Dst = MI.getOperand(0).getReg();
2260 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2261 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2263 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2264 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2265 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2266 .addImm(Lo.getSExtValue())
2267 .addReg(Dst, RegState::Implicit | RegState::Define);
2268 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2269 .addImm(Hi.getSExtValue())
2270 .addReg(Dst, RegState::Implicit | RegState::Define);
2271 MI.eraseFromParent();
2272 break;
2274 case AMDGPU::V_SET_INACTIVE_B32: {
2275 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2276 Register DstReg = MI.getOperand(0).getReg();
2277 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2278 .add(MI.getOperand(3))
2279 .add(MI.getOperand(4))
2280 .add(MI.getOperand(1))
2281 .add(MI.getOperand(2))
2282 .add(MI.getOperand(5));
2283 MI.eraseFromParent();
2284 break;
2286 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2287 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2288 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2289 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2290 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2291 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2292 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2293 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2294 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2295 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2296 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2297 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2298 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2299 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2300 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2301 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2302 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2303 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2304 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2305 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2306 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2313 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2314 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2315 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2317 unsigned Opc;
2318 if (RI.hasVGPRs(EltRC)) {
2319 Opc = AMDGPU::V_MOVRELD_B32_e32;
2320 } else {
2321 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2322 : AMDGPU::S_MOVRELD_B32;
2325 const MCInstrDesc &OpDesc = get(Opc);
2326 Register VecReg = MI.getOperand(0).getReg();
2327 bool IsUndef = MI.getOperand(1).isUndef();
2328 unsigned SubReg = MI.getOperand(3).getImm();
2329 assert(VecReg == MI.getOperand(1).getReg());
2331 MachineInstrBuilder MIB =
2332 BuildMI(MBB, MI, DL, OpDesc)
2333 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2334 .add(MI.getOperand(2))
2335 .addReg(VecReg, RegState::ImplicitDefine)
2336 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2338 const int ImpDefIdx =
2339 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2340 const int ImpUseIdx = ImpDefIdx + 1;
2341 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2342 MI.eraseFromParent();
2343 break;
2345 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2346 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2347 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2348 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2349 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2350 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2351 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2352 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2353 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2354 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2357 assert(ST.useVGPRIndexMode());
2358 Register VecReg = MI.getOperand(0).getReg();
2359 bool IsUndef = MI.getOperand(1).isUndef();
2360 Register Idx = MI.getOperand(3).getReg();
2361 Register SubReg = MI.getOperand(4).getImm();
2363 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2364 .addReg(Idx)
2365 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2366 SetOn->getOperand(3).setIsUndef();
2368 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2369 MachineInstrBuilder MIB =
2370 BuildMI(MBB, MI, DL, OpDesc)
2371 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2372 .add(MI.getOperand(2))
2373 .addReg(VecReg, RegState::ImplicitDefine)
2374 .addReg(VecReg,
2375 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2377 const int ImpDefIdx =
2378 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2379 const int ImpUseIdx = ImpDefIdx + 1;
2380 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2382 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2384 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2386 MI.eraseFromParent();
2387 break;
2389 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2390 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2391 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2392 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2393 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2394 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2395 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2396 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2397 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2398 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2399 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2401 assert(ST.useVGPRIndexMode());
2402 Register Dst = MI.getOperand(0).getReg();
2403 Register VecReg = MI.getOperand(1).getReg();
2404 bool IsUndef = MI.getOperand(1).isUndef();
2405 Register Idx = MI.getOperand(2).getReg();
2406 Register SubReg = MI.getOperand(3).getImm();
2408 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2409 .addReg(Idx)
2410 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2411 SetOn->getOperand(3).setIsUndef();
2413 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2414 .addDef(Dst)
2415 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2416 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2418 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2420 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2422 MI.eraseFromParent();
2423 break;
2425 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2426 MachineFunction &MF = *MBB.getParent();
2427 Register Reg = MI.getOperand(0).getReg();
2428 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2429 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2430 MachineOperand OpLo = MI.getOperand(1);
2431 MachineOperand OpHi = MI.getOperand(2);
2433 // Create a bundle so these instructions won't be re-ordered by the
2434 // post-RA scheduler.
2435 MIBundleBuilder Bundler(MBB, MI);
2436 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2438 // What we want here is an offset from the value returned by s_getpc (which
2439 // is the address of the s_add_u32 instruction) to the global variable, but
2440 // since the encoding of $symbol starts 4 bytes after the start of the
2441 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2442 // small. This requires us to add 4 to the global variable offset in order
2443 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2444 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2445 // instruction.
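// Worked example (informal): if the global lies GOffset bytes past the value
// s_getpc_b64 returns, the PC-relative fixup on s_add_u32 would resolve to
// GOffset - 4 (its $symbol field sits 4 bytes into the instruction), so the
// +4 below restores GOffset; the s_addc_u32's $symbol field sits 12 bytes
// after that same point, hence the +12.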
2447 int64_t Adjust = 0;
2448 if (ST.hasGetPCZeroExtension()) {
2449 // Fix up hardware that does not sign-extend the 48-bit PC value by
2450 // inserting: s_sext_i32_i16 reghi, reghi
2451 Bundler.append(
2452 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2453 Adjust += 4;
2456 if (OpLo.isGlobal())
2457 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2458 Bundler.append(
2459 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2461 if (OpHi.isGlobal())
2462 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2463 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2464 .addReg(RegHi)
2465 .add(OpHi));
2467 finalizeBundle(MBB, Bundler.begin());
2469 MI.eraseFromParent();
2470 break;
2472 case AMDGPU::ENTER_STRICT_WWM: {
2473 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2474 // Whole Wave Mode is entered.
2475 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2476 : AMDGPU::S_OR_SAVEEXEC_B64));
2477 break;
2479 case AMDGPU::ENTER_STRICT_WQM: {
2480 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2481 // STRICT_WQM is entered.
2482 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2483 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2484 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2485 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2486 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2488 MI.eraseFromParent();
2489 break;
2491 case AMDGPU::EXIT_STRICT_WWM:
2492 case AMDGPU::EXIT_STRICT_WQM: {
2493 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2494 // WWM/STRICT_WQM is exited.
2495 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2496 break;
2498 case AMDGPU::SI_RETURN: {
2499 const MachineFunction *MF = MBB.getParent();
2500 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2501 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2502 // Hiding the return address use with SI_RETURN may lead to extra kills in
2503 // the function and missing live-ins. We are fine in practice because callee
2504 // saved register handling ensures the register value is restored before
2505 // RET, but we need the undef flag here to appease the MachineVerifier
2506 // liveness checks.
2507 MachineInstrBuilder MIB =
2508 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2509 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2511 MIB.copyImplicitOps(MI);
2512 MI.eraseFromParent();
2513 break;
2516 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2517 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2518 MI.setDesc(get(AMDGPU::S_MUL_U64));
2519 break;
2521 case AMDGPU::S_GETPC_B64_pseudo:
2522 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2523 if (ST.hasGetPCZeroExtension()) {
2524 Register Dst = MI.getOperand(0).getReg();
2525 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2526 // Fix up hardware that does not sign-extend the 48-bit PC value by
2527 // inserting: s_sext_i32_i16 dsthi, dsthi
2528 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2529 DstHi)
2530 .addReg(DstHi);
2532 break;
2534 return true;
2537 void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2538 MachineBasicBlock::iterator I, Register DestReg,
2539 unsigned SubIdx, const MachineInstr &Orig,
2540 const TargetRegisterInfo &RI) const {
2542 // Try shrinking the instruction to remat only the part needed for the
2543 // current context.
2544 // TODO: Handle more cases.
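// For example (a sketch): if the single use of an S_LOAD_DWORDX16_IMM result
// reads only a 128-bit subregister, the code below instead rematerializes an
// S_LOAD_DWORDX4_IMM of that quarter, bumping the immediate offset by the
// subregister's byte offset and shrinking the memory operand to match.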
2545 unsigned Opcode = Orig.getOpcode();
2546 switch (Opcode) {
2547 case AMDGPU::S_LOAD_DWORDX16_IMM:
2548 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2549 if (SubIdx != 0)
2550 break;
2552 if (I == MBB.end())
2553 break;
2555 if (I->isBundled())
2556 break;
2558 // Look for a single use of the register that is also a subreg.
2559 Register RegToFind = Orig.getOperand(0).getReg();
2560 MachineOperand *UseMO = nullptr;
2561 for (auto &CandMO : I->operands()) {
2562 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2563 continue;
2564 if (UseMO) {
2565 UseMO = nullptr;
2566 break;
2568 UseMO = &CandMO;
2570 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2571 break;
2573 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2574 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2576 MachineFunction *MF = MBB.getParent();
2577 MachineRegisterInfo &MRI = MF->getRegInfo();
2578 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2580 unsigned NewOpcode = -1;
2581 if (SubregSize == 256)
2582 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2583 else if (SubregSize == 128)
2584 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2585 else
2586 break;
2588 const MCInstrDesc &TID = get(NewOpcode);
2589 const TargetRegisterClass *NewRC =
2590 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2591 MRI.setRegClass(DestReg, NewRC);
2593 UseMO->setReg(DestReg);
2594 UseMO->setSubReg(AMDGPU::NoSubRegister);
2596 // Use a smaller load with the desired size, possibly with an updated offset.
2597 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2598 MI->setDesc(TID);
2599 MI->getOperand(0).setReg(DestReg);
2600 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2601 if (Offset) {
2602 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2603 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2604 OffsetMO->setImm(FinalOffset);
2606 SmallVector<MachineMemOperand *> NewMMOs;
2607 for (const MachineMemOperand *MemOp : Orig.memoperands())
2608 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2609 SubregSize / 8));
2610 MI->setMemRefs(*MF, NewMMOs);
2612 MBB.insert(I, MI);
2613 return;
2616 default:
2617 break;
2620 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2621 }
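// Roughly: a 64-bit DPP move pseudo is either rewritten in place to
// V_MOV_B64_dpp (when the subtarget has it and the DPP control is legal for
// 64-bit DPALU) or split into two V_MOV_B32_dpp halves whose results are
// recombined with a REG_SEQUENCE; the new instructions are returned as a
// pair, with the second element null when no split was needed.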
2623 std::pair<MachineInstr*, MachineInstr*>
2624 SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2625 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2627 if (ST.hasMovB64() &&
2628 AMDGPU::isLegalDPALU_DPPControl(
2629 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2630 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2631 return std::pair(&MI, nullptr);
2634 MachineBasicBlock &MBB = *MI.getParent();
2635 DebugLoc DL = MBB.findDebugLoc(MI);
2636 MachineFunction *MF = MBB.getParent();
2637 MachineRegisterInfo &MRI = MF->getRegInfo();
2638 Register Dst = MI.getOperand(0).getReg();
2639 unsigned Part = 0;
2640 MachineInstr *Split[2];
2642 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2643 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2644 if (Dst.isPhysical()) {
2645 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2646 } else {
2647 assert(MRI.isSSA());
2648 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2649 MovDPP.addDef(Tmp);
2652 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2653 const MachineOperand &SrcOp = MI.getOperand(I);
2654 assert(!SrcOp.isFPImm());
2655 if (SrcOp.isImm()) {
2656 APInt Imm(64, SrcOp.getImm());
2657 Imm.ashrInPlace(Part * 32);
2658 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2659 } else {
2660 assert(SrcOp.isReg());
2661 Register Src = SrcOp.getReg();
2662 if (Src.isPhysical())
2663 MovDPP.addReg(RI.getSubReg(Src, Sub));
2664 else
2665 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2669 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2670 MovDPP.addImm(MO.getImm());
2672 Split[Part] = MovDPP;
2673 ++Part;
2676 if (Dst.isVirtual())
2677 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2678 .addReg(Split[0]->getOperand(0).getReg())
2679 .addImm(AMDGPU::sub0)
2680 .addReg(Split[1]->getOperand(0).getReg())
2681 .addImm(AMDGPU::sub1);
2683 MI.eraseFromParent();
2684 return std::pair(Split[0], Split[1]);
2687 std::optional<DestSourcePair>
2688 SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2689 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2690 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2692 return std::nullopt;
2695 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
2696 MachineOperand &Src0,
2697 unsigned Src0OpName,
2698 MachineOperand &Src1,
2699 unsigned Src1OpName) const {
2700 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2701 if (!Src0Mods)
2702 return false;
2704 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2705 assert(Src1Mods &&
2706 "All commutable instructions have both src0 and src1 modifiers");
2708 int Src0ModsVal = Src0Mods->getImm();
2709 int Src1ModsVal = Src1Mods->getImm();
2711 Src1Mods->setImm(Src0ModsVal);
2712 Src0Mods->setImm(Src1ModsVal);
2713 return true;
2716 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2717 MachineOperand &RegOp,
2718 MachineOperand &NonRegOp) {
2719 Register Reg = RegOp.getReg();
2720 unsigned SubReg = RegOp.getSubReg();
2721 bool IsKill = RegOp.isKill();
2722 bool IsDead = RegOp.isDead();
2723 bool IsUndef = RegOp.isUndef();
2724 bool IsDebug = RegOp.isDebug();
2726 if (NonRegOp.isImm())
2727 RegOp.ChangeToImmediate(NonRegOp.getImm());
2728 else if (NonRegOp.isFI())
2729 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2730 else if (NonRegOp.isGlobal()) {
2731 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2732 NonRegOp.getTargetFlags());
2733 } else
2734 return nullptr;
2736 // Make sure we don't reinterpret a subreg index in the target flags.
2737 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2739 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2740 NonRegOp.setSubReg(SubReg);
2742 return &MI;
2745 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2746 unsigned Src0Idx,
2747 unsigned Src1Idx) const {
2748 assert(!NewMI && "this should never be used");
2750 unsigned Opc = MI.getOpcode();
2751 int CommutedOpcode = commuteOpcode(Opc);
2752 if (CommutedOpcode == -1)
2753 return nullptr;
2755 if (Src0Idx > Src1Idx)
2756 std::swap(Src0Idx, Src1Idx);
2758 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2759 static_cast<int>(Src0Idx) &&
2760 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2761 static_cast<int>(Src1Idx) &&
2762 "inconsistency with findCommutedOpIndices");
2764 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2765 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2767 MachineInstr *CommutedMI = nullptr;
2768 if (Src0.isReg() && Src1.isReg()) {
2769 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2770 // Be sure to copy the source modifiers to the right place.
2771 CommutedMI
2772 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2775 } else if (Src0.isReg() && !Src1.isReg()) {
2776 if (isOperandLegal(MI, Src1Idx, &Src0))
2777 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2778 } else if (!Src0.isReg() && Src1.isReg()) {
2779 if (isOperandLegal(MI, Src1Idx, &Src0))
2780 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2781 } else {
2782 // FIXME: Found two non-register operands to commute. This does happen.
2783 return nullptr;
2786 if (CommutedMI) {
2787 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2788 Src1, AMDGPU::OpName::src1_modifiers);
2790 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2791 AMDGPU::OpName::src1_sel);
2793 CommutedMI->setDesc(get(CommutedOpcode));
2796 return CommutedMI;
2799 // This needs to be implemented because the source modifiers may be inserted
2800 // between the true commutable operands, and the base
2801 // TargetInstrInfo::commuteInstruction uses it.
2802 bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2803 unsigned &SrcOpIdx0,
2804 unsigned &SrcOpIdx1) const {
2805 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2808 bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2809 unsigned &SrcOpIdx0,
2810 unsigned &SrcOpIdx1) const {
2811 if (!Desc.isCommutable())
2812 return false;
2814 unsigned Opc = Desc.getOpcode();
2815 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2816 if (Src0Idx == -1)
2817 return false;
2819 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2820 if (Src1Idx == -1)
2821 return false;
2823 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2826 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2827 int64_t BrOffset) const {
2828 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2829 // block is unanalyzable.
2830 assert(BranchOp != AMDGPU::S_SETPC_B64);
2832 // Convert to dwords.
2833 BrOffset /= 4;
2835 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2836 // from the next instruction.
2837 BrOffset -= 1;
2839 return isIntN(BranchOffsetBits, BrOffset);
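// Example (informal): a forward branch whose target is 64 bytes away gives
// BrOffset = 64/4 - 1 = 15 dwords, which easily fits the signed 16-bit SIMM16
// range checked here (when the branch-offset width is left at 16 bits).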
2842 MachineBasicBlock *
2843 SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2844 return MI.getOperand(0).getMBB();
2847 bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
2848 for (const MachineInstr &MI : MBB->terminators()) {
2849 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2850 MI.getOpcode() == AMDGPU::SI_LOOP)
2851 return true;
2853 return false;
2856 void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2857 MachineBasicBlock &DestBB,
2858 MachineBasicBlock &RestoreBB,
2859 const DebugLoc &DL, int64_t BrOffset,
2860 RegScavenger *RS) const {
2861 assert(RS && "RegScavenger required for long branching");
2862 assert(MBB.empty() &&
2863 "new block should be inserted for expanding unconditional branch");
2864 assert(MBB.pred_size() == 1);
2865 assert(RestoreBB.empty() &&
2866 "restore block should be inserted for restoring clobbered registers");
2868 MachineFunction *MF = MBB.getParent();
2869 MachineRegisterInfo &MRI = MF->getRegInfo();
2870 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2872 // FIXME: Virtual register workaround for RegScavenger not working with empty
2873 // blocks.
2874 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2876 auto I = MBB.end();
2878 // Note: as this is used after the hazard recognizer, we need to apply some
2879 // hazard workarounds directly.
2880 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2881 ST.hasVALUReadSGPRHazard();
2882 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2883 if (FlushSGPRWrites)
2884 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2885 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2888 // We need to compute the offset relative to the instruction immediately after
2889 // s_getpc_b64. Insert the PC arithmetic code before the last terminator.
2890 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2891 ApplyHazardWorkarounds();
2893 auto &MCCtx = MF->getContext();
2894 MCSymbol *PostGetPCLabel =
2895 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2896 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2898 MCSymbol *OffsetLo =
2899 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2900 MCSymbol *OffsetHi =
2901 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2902 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2903 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2904 .addReg(PCReg, 0, AMDGPU::sub0)
2905 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2906 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2907 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2908 .addReg(PCReg, 0, AMDGPU::sub1)
2909 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2910 ApplyHazardWorkarounds();
2912 // Insert the indirect branch after the other terminator.
2913 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2914 .addReg(PCReg);
2916 // If a spill is needed for the pc register pair, we need to insert a spill
2917 // restore block right before the destination block, and insert a short branch
2918 // into the old destination block's fallthrough predecessor.
2919 // e.g.:
2921 // s_cbranch_scc0 skip_long_branch:
2923 // long_branch_bb:
2924 // spill s[8:9]
2925 // s_getpc_b64 s[8:9]
2926 // s_add_u32 s8, s8, restore_bb
2927 // s_addc_u32 s9, s9, 0
2928 // s_setpc_b64 s[8:9]
2930 // skip_long_branch:
2931 // foo;
2933 // .....
2935 // dest_bb_fallthrough_predecessor:
2936 // bar;
2937 // s_branch dest_bb
2939 // restore_bb:
2940 // restore s[8:9]
2941 // fallthrough dest_bb
2943 // dest_bb:
2944 // buzz;
2946 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2947 Register Scav;
2949 // If we've previously reserved a register for long branches, avoid running
2950 // the scavenger and just use that register.
2951 if (LongBranchReservedReg) {
2952 RS->enterBasicBlock(MBB);
2953 Scav = LongBranchReservedReg;
2954 } else {
2955 RS->enterBasicBlockEnd(MBB);
2956 Scav = RS->scavengeRegisterBackwards(
2957 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2958 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2960 if (Scav) {
2961 RS->setRegUsed(Scav);
2962 MRI.replaceRegWith(PCReg, Scav);
2963 MRI.clearVirtRegs();
2964 } else {
2965 // Since spilling an SGPR requires a VGPR, we reuse the temporary VGPR's
2966 // spill slot for the SGPR spill.
2967 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2968 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2969 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2970 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2971 MRI.clearVirtRegs();
2974 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2975 // Now that both labels exist, the distance can be defined.
2976 auto *Offset = MCBinaryExpr::createSub(
2977 MCSymbolRefExpr::create(DestLabel, MCCtx),
2978 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2979 // Add offset assignments.
2980 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2981 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2982 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2983 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
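// In other words (informal): offset_lo/offset_hi resolve at emission time to
// (DestLabel - post_getpc) & 0xffffffff and (DestLabel - post_getpc) >> 32
// (arithmetic shift), so the s_add_u32/s_addc_u32 pair adds the full 64-bit
// PC-relative distance to the value produced by s_getpc_b64.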
2986 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2987 switch (Cond) {
2988 case SIInstrInfo::SCC_TRUE:
2989 return AMDGPU::S_CBRANCH_SCC1;
2990 case SIInstrInfo::SCC_FALSE:
2991 return AMDGPU::S_CBRANCH_SCC0;
2992 case SIInstrInfo::VCCNZ:
2993 return AMDGPU::S_CBRANCH_VCCNZ;
2994 case SIInstrInfo::VCCZ:
2995 return AMDGPU::S_CBRANCH_VCCZ;
2996 case SIInstrInfo::EXECNZ:
2997 return AMDGPU::S_CBRANCH_EXECNZ;
2998 case SIInstrInfo::EXECZ:
2999 return AMDGPU::S_CBRANCH_EXECZ;
3000 default:
3001 llvm_unreachable("invalid branch predicate");
3005 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3006 switch (Opcode) {
3007 case AMDGPU::S_CBRANCH_SCC0:
3008 return SCC_FALSE;
3009 case AMDGPU::S_CBRANCH_SCC1:
3010 return SCC_TRUE;
3011 case AMDGPU::S_CBRANCH_VCCNZ:
3012 return VCCNZ;
3013 case AMDGPU::S_CBRANCH_VCCZ:
3014 return VCCZ;
3015 case AMDGPU::S_CBRANCH_EXECNZ:
3016 return EXECNZ;
3017 case AMDGPU::S_CBRANCH_EXECZ:
3018 return EXECZ;
3019 default:
3020 return INVALID_BR;
3024 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3025 MachineBasicBlock::iterator I,
3026 MachineBasicBlock *&TBB,
3027 MachineBasicBlock *&FBB,
3028 SmallVectorImpl<MachineOperand> &Cond,
3029 bool AllowModify) const {
3030 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3031 // Unconditional Branch
3032 TBB = I->getOperand(0).getMBB();
3033 return false;
3036 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3037 if (Pred == INVALID_BR)
3038 return true;
3040 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3041 Cond.push_back(MachineOperand::CreateImm(Pred));
3042 Cond.push_back(I->getOperand(1)); // Save the branch register.
3044 ++I;
3046 if (I == MBB.end()) {
3047 // Conditional branch followed by fall-through.
3048 TBB = CondBB;
3049 return false;
3052 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3053 TBB = CondBB;
3054 FBB = I->getOperand(0).getMBB();
3055 return false;
3058 return true;
3061 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3062 MachineBasicBlock *&FBB,
3063 SmallVectorImpl<MachineOperand> &Cond,
3064 bool AllowModify) const {
3065 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3066 auto E = MBB.end();
3067 if (I == E)
3068 return false;
3070 // Skip over instructions that are artificial terminators for special exec
3071 // management.
3072 while (I != E && !I->isBranch() && !I->isReturn()) {
3073 switch (I->getOpcode()) {
3074 case AMDGPU::S_MOV_B64_term:
3075 case AMDGPU::S_XOR_B64_term:
3076 case AMDGPU::S_OR_B64_term:
3077 case AMDGPU::S_ANDN2_B64_term:
3078 case AMDGPU::S_AND_B64_term:
3079 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3080 case AMDGPU::S_MOV_B32_term:
3081 case AMDGPU::S_XOR_B32_term:
3082 case AMDGPU::S_OR_B32_term:
3083 case AMDGPU::S_ANDN2_B32_term:
3084 case AMDGPU::S_AND_B32_term:
3085 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3086 break;
3087 case AMDGPU::SI_IF:
3088 case AMDGPU::SI_ELSE:
3089 case AMDGPU::SI_KILL_I1_TERMINATOR:
3090 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3091 // FIXME: It's messy that these need to be considered here at all.
3092 return true;
3093 default:
3094 llvm_unreachable("unexpected non-branch terminator inst");
3097 ++I;
3100 if (I == E)
3101 return false;
3103 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3106 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3107 int *BytesRemoved) const {
3108 unsigned Count = 0;
3109 unsigned RemovedSize = 0;
3110 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3111 // Skip over artificial terminators when removing instructions.
3112 if (MI.isBranch() || MI.isReturn()) {
3113 RemovedSize += getInstSizeInBytes(MI);
3114 MI.eraseFromParent();
3115 ++Count;
3119 if (BytesRemoved)
3120 *BytesRemoved = RemovedSize;
3122 return Count;
3125 // Copy the flags onto the implicit condition register operand.
3126 static void preserveCondRegFlags(MachineOperand &CondReg,
3127 const MachineOperand &OrigCond) {
3128 CondReg.setIsUndef(OrigCond.isUndef());
3129 CondReg.setIsKill(OrigCond.isKill());
3132 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3133 MachineBasicBlock *TBB,
3134 MachineBasicBlock *FBB,
3135 ArrayRef<MachineOperand> Cond,
3136 const DebugLoc &DL,
3137 int *BytesAdded) const {
3138 if (!FBB && Cond.empty()) {
3139 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3140 .addMBB(TBB);
3141 if (BytesAdded)
3142 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3143 return 1;
3146 assert(TBB && Cond[0].isImm());
3148 unsigned Opcode
3149 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3151 if (!FBB) {
3152 MachineInstr *CondBr =
3153 BuildMI(&MBB, DL, get(Opcode))
3154 .addMBB(TBB);
3156 // Copy the flags onto the implicit condition register operand.
3157 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3158 fixImplicitOperands(*CondBr);
3160 if (BytesAdded)
3161 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3162 return 1;
3165 assert(TBB && FBB);
3167 MachineInstr *CondBr =
3168 BuildMI(&MBB, DL, get(Opcode))
3169 .addMBB(TBB);
3170 fixImplicitOperands(*CondBr);
3171 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3172 .addMBB(FBB);
3174 MachineOperand &CondReg = CondBr->getOperand(1);
3175 CondReg.setIsUndef(Cond[1].isUndef());
3176 CondReg.setIsKill(Cond[1].isKill());
3178 if (BytesAdded)
3179 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3181 return 2;
3184 bool SIInstrInfo::reverseBranchCondition(
3185 SmallVectorImpl<MachineOperand> &Cond) const {
3186 if (Cond.size() != 2) {
3187 return true;
3190 if (Cond[0].isImm()) {
3191 Cond[0].setImm(-Cond[0].getImm());
3192 return false;
3195 return true;
3198 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3199 ArrayRef<MachineOperand> Cond,
3200 Register DstReg, Register TrueReg,
3201 Register FalseReg, int &CondCycles,
3202 int &TrueCycles, int &FalseCycles) const {
3203 switch (Cond[0].getImm()) {
3204 case VCCNZ:
3205 case VCCZ: {
3206 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3207 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3208 if (MRI.getRegClass(FalseReg) != RC)
3209 return false;
3211 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3212 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3214 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3215 return RI.hasVGPRs(RC) && NumInsts <= 6;
3217 case SCC_TRUE:
3218 case SCC_FALSE: {
3219 // FIXME: We could insert for VGPRs if we could replace the original compare
3220 // with a vector one.
3221 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3222 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3223 if (MRI.getRegClass(FalseReg) != RC)
3224 return false;
3226 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3228 // Sizes that are a multiple of 8 bytes (64 bits) can use s_cselect_b64.
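// e.g. a 128-bit SGPR select is 2 x s_cselect_b64 instead of 4 x s_cselect_b32.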
3229 if (NumInsts % 2 == 0)
3230 NumInsts /= 2;
3232 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3233 return RI.isSGPRClass(RC);
3235 default:
3236 return false;
3240 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3241 MachineBasicBlock::iterator I, const DebugLoc &DL,
3242 Register DstReg, ArrayRef<MachineOperand> Cond,
3243 Register TrueReg, Register FalseReg) const {
3244 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3245 if (Pred == VCCZ || Pred == SCC_FALSE) {
3246 Pred = static_cast<BranchPredicate>(-Pred);
3247 std::swap(TrueReg, FalseReg);
3250 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3251 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3252 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3254 if (DstSize == 32) {
3255 MachineInstr *Select;
3256 if (Pred == SCC_TRUE) {
3257 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3258 .addReg(TrueReg)
3259 .addReg(FalseReg);
3260 } else {
3261 // The instruction's operands are backwards from what is expected.
3262 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3263 .addReg(FalseReg)
3264 .addReg(TrueReg);
3267 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3268 return;
3271 if (DstSize == 64 && Pred == SCC_TRUE) {
3272 MachineInstr *Select =
3273 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3274 .addReg(TrueReg)
3275 .addReg(FalseReg);
3277 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3278 return;
3281 static const int16_t Sub0_15[] = {
3282 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3283 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3284 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3285 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3288 static const int16_t Sub0_15_64[] = {
3289 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3290 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3291 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3292 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3295 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3296 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3297 const int16_t *SubIndices = Sub0_15;
3298 int NElts = DstSize / 32;
3300 // 64-bit select is only available for SALU.
3301 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3302 if (Pred == SCC_TRUE) {
3303 if (NElts % 2) {
3304 SelOp = AMDGPU::S_CSELECT_B32;
3305 EltRC = &AMDGPU::SGPR_32RegClass;
3306 } else {
3307 SelOp = AMDGPU::S_CSELECT_B64;
3308 EltRC = &AMDGPU::SGPR_64RegClass;
3309 SubIndices = Sub0_15_64;
3310 NElts /= 2;
3314 MachineInstrBuilder MIB = BuildMI(
3315 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3317 I = MIB->getIterator();
3319 SmallVector<Register, 8> Regs;
3320 for (int Idx = 0; Idx != NElts; ++Idx) {
3321 Register DstElt = MRI.createVirtualRegister(EltRC);
3322 Regs.push_back(DstElt);
3324 unsigned SubIdx = SubIndices[Idx];
3326 MachineInstr *Select;
3327 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3328 Select =
3329 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3330 .addReg(FalseReg, 0, SubIdx)
3331 .addReg(TrueReg, 0, SubIdx);
3332 } else {
3333 Select =
3334 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3335 .addReg(TrueReg, 0, SubIdx)
3336 .addReg(FalseReg, 0, SubIdx);
3339 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3340 fixImplicitOperands(*Select);
3342 MIB.addReg(DstElt)
3343 .addImm(SubIdx);
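// For illustration (informal): a divergent (VCC) select of a 64-bit VGPR
// value becomes two V_CNDMASK_B32_e32 instructions, one per 32-bit half,
// feeding a REG_SEQUENCE, while a uniform (SCC) 64-bit select was already
// handled above as a single S_CSELECT_B64.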
3347 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3348 switch (MI.getOpcode()) {
3349 case AMDGPU::V_MOV_B16_t16_e32:
3350 case AMDGPU::V_MOV_B16_t16_e64:
3351 case AMDGPU::V_MOV_B32_e32:
3352 case AMDGPU::V_MOV_B32_e64:
3353 case AMDGPU::V_MOV_B64_PSEUDO:
3354 case AMDGPU::V_MOV_B64_e32:
3355 case AMDGPU::V_MOV_B64_e64:
3356 case AMDGPU::S_MOV_B32:
3357 case AMDGPU::S_MOV_B64:
3358 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3359 case AMDGPU::COPY:
3360 case AMDGPU::WWM_COPY:
3361 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3362 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3363 case AMDGPU::V_ACCVGPR_MOV_B32:
3364 return true;
3365 default:
3366 return false;
3370 static constexpr unsigned ModifierOpNames[] = {
3371 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3372 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3373 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3375 void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3376 unsigned Opc = MI.getOpcode();
3377 for (unsigned Name : reverse(ModifierOpNames)) {
3378 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3379 if (Idx >= 0)
3380 MI.removeOperand(Idx);
3384 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3385 Register Reg, MachineRegisterInfo *MRI) const {
3386 if (!MRI->hasOneNonDBGUse(Reg))
3387 return false;
3389 switch (DefMI.getOpcode()) {
3390 default:
3391 return false;
3392 case AMDGPU::V_MOV_B64_e32:
3393 case AMDGPU::S_MOV_B64:
3394 case AMDGPU::V_MOV_B64_PSEUDO:
3395 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3396 case AMDGPU::V_MOV_B32_e32:
3397 case AMDGPU::S_MOV_B32:
3398 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3399 break;
3402 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3403 assert(ImmOp);
3404 // FIXME: We could handle FrameIndex values here.
3405 if (!ImmOp->isImm())
3406 return false;
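// getImmFor returns the piece of the 64-bit source immediate selected by the
// use's subregister index, sign-extending the 16-bit pieces. For example,
// with Imm = 0x1122334455667788: sub0 -> 0x55667788, sub1 -> 0x11223344,
// lo16 -> 0x7788, sub1_hi16 -> 0x1122.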
3408 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3409 int64_t Imm = ImmOp->getImm();
3410 switch (UseOp.getSubReg()) {
3411 default:
3412 return Imm;
3413 case AMDGPU::sub0:
3414 return Lo_32(Imm);
3415 case AMDGPU::sub1:
3416 return Hi_32(Imm);
3417 case AMDGPU::lo16:
3418 return SignExtend64<16>(Imm);
3419 case AMDGPU::hi16:
3420 return SignExtend64<16>(Imm >> 16);
3421 case AMDGPU::sub1_lo16:
3422 return SignExtend64<16>(Imm >> 32);
3423 case AMDGPU::sub1_hi16:
3424 return SignExtend64<16>(Imm >> 48);
3428 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3430 unsigned Opc = UseMI.getOpcode();
3431 if (Opc == AMDGPU::COPY) {
3432 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3434 Register DstReg = UseMI.getOperand(0).getReg();
3435 unsigned OpSize = getOpSize(UseMI, 0);
3436 bool Is16Bit = OpSize == 2;
3437 bool Is64Bit = OpSize == 8;
3438 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3439 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3440 : AMDGPU::V_MOV_B32_e32
3441 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3442 : AMDGPU::S_MOV_B32;
3443 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)),
3444 /*isSigned=*/true, /*implicitTrunc=*/true);
3446 if (RI.isAGPR(*MRI, DstReg)) {
3447 if (Is64Bit || !isInlineConstant(Imm))
3448 return false;
3449 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3452 if (Is16Bit) {
3453 if (isVGPRCopy)
3454 return false; // Do not clobber vgpr_hi16
3456 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3457 return false;
3459 UseMI.getOperand(0).setSubReg(0);
3460 if (DstReg.isPhysical()) {
3461 DstReg = RI.get32BitRegister(DstReg);
3462 UseMI.getOperand(0).setReg(DstReg);
3464 assert(UseMI.getOperand(1).getReg().isVirtual());
3467 const MCInstrDesc &NewMCID = get(NewOpc);
3468 if (DstReg.isPhysical() &&
3469 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3470 return false;
3472 UseMI.setDesc(NewMCID);
3473 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3474 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3475 return true;
3478 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3479 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3480 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3481 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3482 Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
3483 // Don't fold if we are using source or output modifiers. The new VOP2
3484 // instructions don't have them.
3485 if (hasAnyModifiersSet(UseMI))
3486 return false;
3488 // If this is a free constant, there's no reason to do this.
3489 // TODO: We could fold this here instead of letting SIFoldOperands do it
3490 // later.
3491 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3493 // Any src operand can be used for the legality check.
3494 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3495 return false;
3497 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3498 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3499 bool IsFMA =
3500 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3501 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3502 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
3503 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3504 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3506 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
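// For example:
//   v_mov_b32 v1, 0x42280000  ; arbitrary non-inline literal
//   v_mad_f32 v0, v1, v2, v3
// becomes
//   v_madmk_f32 v0, v2, 0x42280000, v3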
3507 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3508 (Src1->isReg() && Src1->getReg() == Reg)) {
3509 MachineOperand *RegSrc =
3510 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3511 if (!RegSrc->isReg())
3512 return false;
3513 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3514 ST.getConstantBusLimit(Opc) < 2)
3515 return false;
3517 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3518 return false;
3520 // If src2 is also a literal constant then we have to choose which one to
3521 // fold. In general it is better to choose madak so that the other literal
3522 // can be materialized in an sgpr instead of a vgpr:
3523 // s_mov_b32 s0, literal
3524 // v_madak_f32 v0, s0, v0, literal
3525 // Instead of:
3526 // v_mov_b32 v1, literal
3527 // v_madmk_f32 v0, v0, literal, v1
3528 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3529 if (Def && Def->isMoveImmediate() &&
3530 !isInlineConstant(Def->getOperand(1)))
3531 return false;
3533 unsigned NewOpc =
3534 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3535 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
3536 : AMDGPU::V_FMAMK_F16)
3537 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3538 if (pseudoToMCOpcode(NewOpc) == -1)
3539 return false;
3541 // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3542 // would also require restricting their register classes. For now
3543 // just bail out.
3544 if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3545 return false;
3547 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3549 // FIXME: This would be a lot easier if we could return a new instruction
3550 // instead of having to modify in place.
3552 Register SrcReg = RegSrc->getReg();
3553 unsigned SrcSubReg = RegSrc->getSubReg();
3554 Src0->setReg(SrcReg);
3555 Src0->setSubReg(SrcSubReg);
3556 Src0->setIsKill(RegSrc->isKill());
3558 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3559 Opc == AMDGPU::V_FMAC_F32_e64 ||
3560 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
3561 UseMI.untieRegOperand(
3562 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3564 Src1->ChangeToImmediate(Imm);
3566 removeModOperands(UseMI);
3567 UseMI.setDesc(get(NewOpc));
3569 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3570 if (DeleteDef)
3571 DefMI.eraseFromParent();
3573 return true;
3576 // Added part is the constant: Use v_madak_{f16, f32}.
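// For example:
//   v_mov_b32 v3, 0x42280000  ; arbitrary non-inline literal
//   v_mad_f32 v0, v1, v2, v3
// becomes
//   v_madak_f32 v0, v1, v2, 0x42280000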
3577 if (Src2->isReg() && Src2->getReg() == Reg) {
3578 if (ST.getConstantBusLimit(Opc) < 2) {
3579 // Not allowed to use constant bus for another operand.
3580 // We can however allow an inline immediate as src0.
3581 bool Src0Inlined = false;
3582 if (Src0->isReg()) {
3583 // Try to inline constant if possible.
3584 // If the def is a move-immediate and this is its only use,
3585 // folding it here saves a VGPR.
3586 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3587 if (Def && Def->isMoveImmediate() &&
3588 isInlineConstant(Def->getOperand(1)) &&
3589 MRI->hasOneUse(Src0->getReg())) {
3590 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3591 Src0Inlined = true;
3592 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3593 RI.isSGPRReg(*MRI, Src0->getReg())) {
3594 return false;
3596 // VGPR is okay as Src0 - fallthrough
3599 if (Src1->isReg() && !Src0Inlined) {
3600 // We still have one slot for an inlinable constant - try to fill it.
3601 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3602 if (Def && Def->isMoveImmediate() &&
3603 isInlineConstant(Def->getOperand(1)) &&
3604 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3605 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3606 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3607 return false;
3608 // VGPR is okay as Src1 - fallthrough
3612 unsigned NewOpc =
3613 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3614 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
3615 : AMDGPU::V_FMAAK_F16)
3616 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3617 if (pseudoToMCOpcode(NewOpc) == -1)
3618 return false;
3620 // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3621 // would also require restricting their register classes. For now
3622 // just bail out.
3623 if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3624 return false;
3626 // FIXME: This would be a lot easier if we could return a new instruction
3627 // instead of having to modify in place.
3629 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3630 Opc == AMDGPU::V_FMAC_F32_e64 ||
3631 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
3632 UseMI.untieRegOperand(
3633 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3635 // ChangeToImmediate adds Src2 back to the instruction.
3636 Src2->ChangeToImmediate(getImmFor(*Src2));
3638 // These come before src2.
3639 removeModOperands(UseMI);
3640 UseMI.setDesc(get(NewOpc));
3641 // UseMI might have been commuted, leaving an SGPR as src1. In that case
3642 // the folded literal constant together with the SGPR would violate the
3643 // constant bus restriction, so legalize the operands.
3644 legalizeOperands(UseMI);
3646 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3647 if (DeleteDef)
3648 DefMI.eraseFromParent();
3650 return true;
3654 return false;
3657 static bool
3658 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3659 ArrayRef<const MachineOperand *> BaseOps2) {
3660 if (BaseOps1.size() != BaseOps2.size())
3661 return false;
3662 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3663 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3664 return false;
3666 return true;
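/// Returns true if the access [OffsetA, OffsetA + WidthA) provably does not
/// overlap [OffsetB, OffsetB + WidthB). For example, a 4-byte access at
/// offset 0 and an 8-byte access at offset 4 do not overlap, since 0 + 4 <= 4.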
3669 static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3670 LocationSize WidthB, int OffsetB) {
3671 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3672 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3673 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3674 return LowWidth.hasValue() &&
3675 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3678 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3679 const MachineInstr &MIb) const {
3680 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3681 int64_t Offset0, Offset1;
3682 LocationSize Dummy0 = 0, Dummy1 = 0;
3683 bool Offset0IsScalable, Offset1IsScalable;
3684 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3685 Dummy0, &RI) ||
3686 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3687 Dummy1, &RI))
3688 return false;
3690 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3691 return false;
3693 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3694 // FIXME: Handle ds_read2 / ds_write2.
3695 return false;
3697 LocationSize Width0 = MIa.memoperands().front()->getSize();
3698 LocationSize Width1 = MIb.memoperands().front()->getSize();
3699 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3702 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3703 const MachineInstr &MIb) const {
3704 assert(MIa.mayLoadOrStore() &&
3705 "MIa must load from or modify a memory location");
3706 assert(MIb.mayLoadOrStore() &&
3707 "MIb must load from or modify a memory location");
3709 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3710 return false;
3712 // XXX - Can we relax this between address spaces?
3713 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3714 return false;
3716 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3717 return false;
3719 // TODO: Should we check the address space from the MachineMemOperand? That
3720 // would allow us to distinguish objects we know don't alias based on the
3721 // underlying address space, even if it was lowered to a different one,
3722 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3723 // buffer.
3724 if (isDS(MIa)) {
3725 if (isDS(MIb))
3726 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3728 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3731 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3732 if (isMUBUF(MIb) || isMTBUF(MIb))
3733 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3735 if (isFLAT(MIb))
3736 return isFLATScratch(MIb);
3738 return !isSMRD(MIb);
3741 if (isSMRD(MIa)) {
3742 if (isSMRD(MIb))
3743 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3745 if (isFLAT(MIb))
3746 return isFLATScratch(MIb);
3748 return !isMUBUF(MIb) && !isMTBUF(MIb);
3751 if (isFLAT(MIa)) {
3752 if (isFLAT(MIb)) {
3753 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3754 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3755 return true;
3757 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3760 return false;
3763 return false;
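/// If \p Reg is a virtual register defined by a foldable move of an
/// immediate, return that immediate in \p Imm and, if requested, the defining
/// instruction in \p DefMI.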
3766 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3767 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3768 if (Reg.isPhysical())
3769 return false;
3770 auto *Def = MRI.getUniqueVRegDef(Reg);
3771 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3772 Imm = Def->getOperand(1).getImm();
3773 if (DefMI)
3774 *DefMI = Def;
3775 return true;
3777 return false;
3780 static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3781 MachineInstr **DefMI = nullptr) {
3782 if (!MO->isReg())
3783 return false;
3784 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3785 const MachineRegisterInfo &MRI = MF->getRegInfo();
3786 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
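/// If LiveVariables is tracking kills, move the kill records for \p MI's
/// register operands over to \p NewMI, which is about to replace \p MI.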
3789 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3790 MachineInstr &NewMI) {
3791 if (LV) {
3792 unsigned NumOps = MI.getNumOperands();
3793 for (unsigned I = 1; I < NumOps; ++I) {
3794 MachineOperand &Op = MI.getOperand(I);
3795 if (Op.isReg() && Op.isKill())
3796 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3801 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3802 LiveVariables *LV,
3803 LiveIntervals *LIS) const {
3804 MachineBasicBlock &MBB = *MI.getParent();
3805 unsigned Opc = MI.getOpcode();
3807 // Handle MFMA.
3808 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3809 if (NewMFMAOpc != -1) {
3810 MachineInstrBuilder MIB =
3811 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3812 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3813 MIB.add(MI.getOperand(I));
3814 updateLiveVariables(LV, MI, *MIB);
3815 if (LIS) {
3816 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3817 // The SlotIndex of the defs needs to be updated when converting to early-clobber.
3818 MachineOperand &Def = MIB->getOperand(0);
3819 if (Def.isEarlyClobber() && Def.isReg() &&
3820 LIS->hasInterval(Def.getReg())) {
3821 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3822 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3823 auto &LI = LIS->getInterval(Def.getReg());
3824 auto UpdateDefIndex = [&](LiveRange &LR) {
3825 auto *S = LR.find(OldIndex);
3826 if (S != LR.end() && S->start == OldIndex) {
3827 assert(S->valno && S->valno->def == OldIndex);
3828 S->start = NewIndex;
3829 S->valno->def = NewIndex;
3832 UpdateDefIndex(LI);
3833 for (auto &SR : LI.subranges())
3834 UpdateDefIndex(SR);
3837 return MIB;
3840 if (SIInstrInfo::isWMMA(MI)) {
3841 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3842 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3843 .setMIFlags(MI.getFlags());
3844 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3845 MIB->addOperand(MI.getOperand(I));
3847 updateLiveVariables(LV, MI, *MIB);
3848 if (LIS)
3849 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3851 return MIB;
3854 assert(
3855 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3856 "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
3857 "pre-RA");
3859 // Handle MAC/FMAC.
3860 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3861 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3862 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
3863 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3864 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3865 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3866 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3867 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3868 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3869 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3870 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3871 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3872 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3873 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3874 bool Src0Literal = false;
3876 switch (Opc) {
3877 default:
3878 return nullptr;
3879 case AMDGPU::V_MAC_F16_e64:
3880 case AMDGPU::V_FMAC_F16_e64:
3881 case AMDGPU::V_FMAC_F16_fake16_e64:
3882 case AMDGPU::V_MAC_F32_e64:
3883 case AMDGPU::V_MAC_LEGACY_F32_e64:
3884 case AMDGPU::V_FMAC_F32_e64:
3885 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3886 case AMDGPU::V_FMAC_F64_e64:
3887 break;
3888 case AMDGPU::V_MAC_F16_e32:
3889 case AMDGPU::V_FMAC_F16_e32:
3890 case AMDGPU::V_MAC_F32_e32:
3891 case AMDGPU::V_MAC_LEGACY_F32_e32:
3892 case AMDGPU::V_FMAC_F32_e32:
3893 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3894 case AMDGPU::V_FMAC_F64_e32: {
3895 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3896 AMDGPU::OpName::src0);
3897 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3898 if (!Src0->isReg() && !Src0->isImm())
3899 return nullptr;
3901 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3902 Src0Literal = true;
3904 break;
3908 MachineInstrBuilder MIB;
3909 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3910 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3911 const MachineOperand *Src0Mods =
3912 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3913 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3914 const MachineOperand *Src1Mods =
3915 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3916 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3917 const MachineOperand *Src2Mods =
3918 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3919 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3920 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3921 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3923 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3924 !IsLegacy &&
3925 // If we have an SGPR input, we will violate the constant bus restriction.
3926 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3927 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3928 MachineInstr *DefMI;
3929 const auto killDef = [&]() -> void {
3930 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3931 // The only user is the instruction which will be killed.
3932 Register DefReg = DefMI->getOperand(0).getReg();
3934 if (MRI.hasOneNonDBGUse(DefReg)) {
3935 // We cannot just remove DefMI here; the calling pass would crash.
3936 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3937 DefMI->getOperand(0).setIsDead(true);
3938 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3939 DefMI->removeOperand(I);
3940 if (LV)
3941 LV->getVarInfo(DefReg).AliveBlocks.clear();
3944 if (LIS) {
3945 LiveInterval &DefLI = LIS->getInterval(DefReg);
3947 // We cannot delete the original instruction here, so hack out the use
3948 // in the original instruction with a dummy register so we can use
3949 // shrinkToUses to deal with any multi-use edge cases. Other targets do
3950 // not have the complexity of deleting a use to consider here.
3951 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
3952 for (MachineOperand &MIOp : MI.uses()) {
3953 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
3954 MIOp.setIsUndef(true);
3955 MIOp.setReg(DummyReg);
3959 LIS->shrinkToUses(&DefLI);
3963 int64_t Imm;
3964 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3965 unsigned NewOpc =
3966 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
3967 : AMDGPU::V_FMAAK_F16)
3968 : AMDGPU::V_FMAAK_F32)
3969 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3970 if (pseudoToMCOpcode(NewOpc) != -1) {
3971 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3972 .add(*Dst)
3973 .add(*Src0)
3974 .add(*Src1)
3975 .addImm(Imm)
3976 .setMIFlags(MI.getFlags());
3977 updateLiveVariables(LV, MI, *MIB);
3978 if (LIS)
3979 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3980 killDef();
3981 return MIB;
3984 unsigned NewOpc =
3985 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
3986 : AMDGPU::V_FMAMK_F16)
3987 : AMDGPU::V_FMAMK_F32)
3988 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3989 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3990 if (pseudoToMCOpcode(NewOpc) != -1) {
3991 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3992 .add(*Dst)
3993 .add(*Src0)
3994 .addImm(Imm)
3995 .add(*Src2)
3996 .setMIFlags(MI.getFlags());
3997 updateLiveVariables(LV, MI, *MIB);
3999 if (LIS)
4000 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4001 killDef();
4002 return MIB;
4005 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4006 if (Src0Literal) {
4007 Imm = Src0->getImm();
4008 DefMI = nullptr;
4010 if (pseudoToMCOpcode(NewOpc) != -1 &&
4011 isOperandLegal(
4012 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4013 Src1)) {
4014 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4015 .add(*Dst)
4016 .add(*Src1)
4017 .addImm(Imm)
4018 .add(*Src2)
4019 .setMIFlags(MI.getFlags());
4020 updateLiveVariables(LV, MI, *MIB);
4022 if (LIS)
4023 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4024 if (DefMI)
4025 killDef();
4026 return MIB;
4031 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4032 // if VOP3 does not allow a literal operand.
4033 if (Src0Literal && !ST.hasVOP3Literal())
4034 return nullptr;
4036 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
4037 : IsF64 ? AMDGPU::V_FMA_F64_e64
4038 : IsLegacy
4039 ? AMDGPU::V_FMA_LEGACY_F32_e64
4040 : AMDGPU::V_FMA_F32_e64
4041 : IsF16 ? AMDGPU::V_MAD_F16_e64
4042 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
4043 : AMDGPU::V_MAD_F32_e64;
4044 if (pseudoToMCOpcode(NewOpc) == -1)
4045 return nullptr;
4047 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4048 .add(*Dst)
4049 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4050 .add(*Src0)
4051 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4052 .add(*Src1)
4053 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4054 .add(*Src2)
4055 .addImm(Clamp ? Clamp->getImm() : 0)
4056 .addImm(Omod ? Omod->getImm() : 0)
4057 .setMIFlags(MI.getFlags());
4058 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4059 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4060 updateLiveVariables(LV, MI, *MIB);
4061 if (LIS)
4062 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4063 return MIB;
4066 // It's not generally safe to move VALU instructions across these since it will
4067 // start using the register as a base index rather than directly.
4068 // XXX - Why isn't hasSideEffects sufficient for these?
4069 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4070 switch (MI.getOpcode()) {
4071 case AMDGPU::S_SET_GPR_IDX_ON:
4072 case AMDGPU::S_SET_GPR_IDX_MODE:
4073 case AMDGPU::S_SET_GPR_IDX_OFF:
4074 return true;
4075 default:
4076 return false;
4080 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4081 const MachineBasicBlock *MBB,
4082 const MachineFunction &MF) const {
4083 // Skipping the check for SP writes in the base implementation; it was
4084 // apparently added there due to compile-time concerns.
4086 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4087 // but is probably avoidable.
4089 // Copied from base implementation.
4090 // Terminators and labels can't be scheduled around.
4091 if (MI.isTerminator() || MI.isPosition())
4092 return true;
4094 // INLINEASM_BR can jump to another block
4095 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4096 return true;
4098 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4099 return true;
4101 // Target-independent instructions do not have an implicit-use of EXEC, even
4102 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4103 // boundaries prevents incorrect movements of such instructions.
4104 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4105 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4106 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4107 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4108 changesVGPRIndexingMode(MI);
4111 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4112 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4115 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4116 // Skip the full operand and register alias search that modifiesRegister
4117 // does. Only a handful of instructions touch MODE, it is only an
4118 // implicit def, and it doesn't alias any other registers.
4119 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4122 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4123 unsigned Opcode = MI.getOpcode();
4125 if (MI.mayStore() && isSMRD(MI))
4126 return true; // scalar store or atomic
4128 // This will terminate the function when other lanes may need to continue.
4129 if (MI.isReturn())
4130 return true;
4132 // These instructions cause shader I/O that may cause hardware lockups
4133 // when executed with an empty EXEC mask.
4135 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4136 // EXEC = 0, but checking for that case here seems not worth it
4137 // given the typical code patterns.
4138 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4139 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4140 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4141 return true;
4143 if (MI.isCall() || MI.isInlineAsm())
4144 return true; // conservative assumption
4146 // Assume that barrier interactions are only intended with active lanes.
4147 if (isBarrier(Opcode))
4148 return true;
4150 // A mode change is a scalar operation that influences vector instructions.
4151 if (modifiesModeRegister(MI))
4152 return true;
4154 // These are like SALU instructions in terms of effects, so it's questionable
4155 // whether we should return true for those.
4157 // However, executing them with EXEC = 0 causes them to operate on undefined
4158 // data, which we avoid by returning true here.
4159 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4160 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4161 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4162 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4163 return true;
4165 return false;
4168 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4169 const MachineInstr &MI) const {
4170 if (MI.isMetaInstruction())
4171 return false;
4173 // This won't read exec if this is an SGPR->SGPR copy.
4174 if (MI.isCopyLike()) {
4175 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4176 return true;
4178 // Make sure this isn't copying exec as a normal operand
4179 return MI.readsRegister(AMDGPU::EXEC, &RI);
4182 // Make a conservative assumption about the callee.
4183 if (MI.isCall())
4184 return true;
4186 // Be conservative with any unhandled generic opcodes.
4187 if (!isTargetSpecificOpcode(MI.getOpcode()))
4188 return true;
4190 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4193 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4194 switch (Imm.getBitWidth()) {
4195 case 1: // This likely will be a condition code mask.
4196 return true;
4198 case 32:
4199 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4200 ST.hasInv2PiInlineImm());
4201 case 64:
4202 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4203 ST.hasInv2PiInlineImm());
4204 case 16:
4205 return ST.has16BitInsts() &&
4206 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4207 ST.hasInv2PiInlineImm());
4208 default:
4209 llvm_unreachable("invalid bitwidth");
4213 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4214 APInt IntImm = Imm.bitcastToAPInt();
4215 int64_t IntImmVal = IntImm.getSExtValue();
4216 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4217 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4218 default:
4219 llvm_unreachable("invalid fltSemantics");
4220 case APFloatBase::S_IEEEsingle:
4221 case APFloatBase::S_IEEEdouble:
4222 return isInlineConstant(IntImm);
4223 case APFloatBase::S_BFloat:
4224 return ST.has16BitInsts() &&
4225 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4226 case APFloatBase::S_IEEEhalf:
4227 return ST.has16BitInsts() &&
4228 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4232 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4233 uint8_t OperandType) const {
4234 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4235 if (!MO.isImm())
4236 return false;
4238 // MachineOperand provides no way to tell the true operand size, since it only
4239 // records a 64-bit value. We need to know the size to determine if a 32-bit
4240 // floating point immediate bit pattern is legal for an integer immediate. It
4241 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4243 int64_t Imm = MO.getImm();
4244 switch (OperandType) {
4245 case AMDGPU::OPERAND_REG_IMM_INT32:
4246 case AMDGPU::OPERAND_REG_IMM_FP32:
4247 case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
4248 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4249 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4250 case AMDGPU::OPERAND_REG_IMM_V2FP32:
4251 case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
4252 case AMDGPU::OPERAND_REG_IMM_V2INT32:
4253 case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
4254 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4255 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4256 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
4257 int32_t Trunc = static_cast<int32_t>(Imm);
4258 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4260 case AMDGPU::OPERAND_REG_IMM_INT64:
4261 case AMDGPU::OPERAND_REG_IMM_FP64:
4262 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4263 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4264 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
4265 return AMDGPU::isInlinableLiteral64(MO.getImm(),
4266 ST.hasInv2PiInlineImm());
4267 case AMDGPU::OPERAND_REG_IMM_INT16:
4268 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4269 case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
4270 // We would expect inline immediates to not be concerned with an integer/fp
4271 // distinction. However, in the case of 16-bit integer operations, the
4272 // "floating point" values appear to not work. It seems read the low 16-bits
4273 // of 32-bit immediates, which happens to always work for the integer
4274 // values.
4276 // See llvm bugzilla 46302.
4278 // TODO: Theoretically we could use op-sel to use the high bits of the
4279 // 32-bit FP values.
4280 return AMDGPU::isInlinableIntLiteral(Imm);
4281 case AMDGPU::OPERAND_REG_IMM_V2INT16:
4282 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
4283 case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
4284 return AMDGPU::isInlinableLiteralV2I16(Imm);
4285 case AMDGPU::OPERAND_REG_IMM_V2FP16:
4286 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
4287 case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
4288 return AMDGPU::isInlinableLiteralV2F16(Imm);
4289 case AMDGPU::OPERAND_REG_IMM_V2BF16:
4290 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
4291 case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16:
4292 return AMDGPU::isInlinableLiteralV2BF16(Imm);
4293 case AMDGPU::OPERAND_REG_IMM_FP16:
4294 case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
4295 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
4296 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
4297 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4298 // A few special case instructions have 16-bit operands on subtargets
4299 // where 16-bit instructions are not legal.
4300 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4301 // constants in these cases
4302 int16_t Trunc = static_cast<int16_t>(Imm);
4303 return ST.has16BitInsts() &&
4304 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4307 return false;
4309 case AMDGPU::OPERAND_REG_IMM_BF16:
4310 case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED:
4311 case AMDGPU::OPERAND_REG_INLINE_C_BF16:
4312 case AMDGPU::OPERAND_REG_INLINE_AC_BF16: {
4313 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4314 int16_t Trunc = static_cast<int16_t>(Imm);
4315 return ST.has16BitInsts() &&
4316 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4318 return false;
4320 case AMDGPU::OPERAND_KIMM32:
4321 case AMDGPU::OPERAND_KIMM16:
4322 return false;
4323 case AMDGPU::OPERAND_INPUT_MODS:
4324 case MCOI::OPERAND_IMMEDIATE:
4325 // Always embedded in the instruction for free.
4326 return true;
4327 case MCOI::OPERAND_UNKNOWN:
4328 case MCOI::OPERAND_REGISTER:
4329 case MCOI::OPERAND_PCREL:
4330 case MCOI::OPERAND_GENERIC_0:
4331 case MCOI::OPERAND_GENERIC_1:
4332 case MCOI::OPERAND_GENERIC_2:
4333 case MCOI::OPERAND_GENERIC_3:
4334 case MCOI::OPERAND_GENERIC_4:
4335 case MCOI::OPERAND_GENERIC_5:
4336 // Just ignore anything else.
4337 return true;
4338 default:
4339 llvm_unreachable("invalid operand type");
4343 static bool compareMachineOp(const MachineOperand &Op0,
4344 const MachineOperand &Op1) {
4345 if (Op0.getType() != Op1.getType())
4346 return false;
4348 switch (Op0.getType()) {
4349 case MachineOperand::MO_Register:
4350 return Op0.getReg() == Op1.getReg();
4351 case MachineOperand::MO_Immediate:
4352 return Op0.getImm() == Op1.getImm();
4353 default:
4354 llvm_unreachable("Didn't expect to be comparing these operand types");
4358 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4359 const MachineOperand &MO) const {
4360 const MCInstrDesc &InstDesc = MI.getDesc();
4361 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4363 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4365 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4366 return true;
4368 if (OpInfo.RegClass < 0)
4369 return false;
4371 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4372 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4373 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4374 AMDGPU::OpName::src2))
4375 return false;
4376 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4379 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4380 return false;
4382 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4383 return true;
4385 return ST.hasVOP3Literal();
4388 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4389 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4390 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4391 return false;
4393 int Op32 = AMDGPU::getVOPe32(Opcode);
4394 if (Op32 == -1)
4395 return false;
4397 return pseudoToMCOpcode(Op32) != -1;
4400 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4401 // The src0_modifier operand is present on all instructions
4402 // that have modifiers.
4404 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4407 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4408 unsigned OpName) const {
4409 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4410 return Mods && Mods->getImm();
4413 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4414 return any_of(ModifierOpNames,
4415 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4418 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4419 const MachineRegisterInfo &MRI) const {
4420 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4421 // Can't shrink instruction with three operands.
4422 if (Src2) {
4423 switch (MI.getOpcode()) {
4424 default: return false;
4426 case AMDGPU::V_ADDC_U32_e64:
4427 case AMDGPU::V_SUBB_U32_e64:
4428 case AMDGPU::V_SUBBREV_U32_e64: {
4429 const MachineOperand *Src1
4430 = getNamedOperand(MI, AMDGPU::OpName::src1);
4431 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4432 return false;
4433 // Additional verification is needed for sdst/src2.
4434 return true;
4436 case AMDGPU::V_MAC_F16_e64:
4437 case AMDGPU::V_MAC_F32_e64:
4438 case AMDGPU::V_MAC_LEGACY_F32_e64:
4439 case AMDGPU::V_FMAC_F16_e64:
4440 case AMDGPU::V_FMAC_F16_fake16_e64:
4441 case AMDGPU::V_FMAC_F32_e64:
4442 case AMDGPU::V_FMAC_F64_e64:
4443 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4444 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4445 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4446 return false;
4447 break;
4449 case AMDGPU::V_CNDMASK_B32_e64:
4450 break;
4454 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4455 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4456 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4457 return false;
4459 // We don't need to check src0, all input types are legal, so just make sure
4460 // src0 isn't using any modifiers.
4461 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4462 return false;
4464 // Can it be shrunk to a valid 32 bit opcode?
4465 if (!hasVALU32BitEncoding(MI.getOpcode()))
4466 return false;
4468 // Check output modifiers
4469 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4470 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4471 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4472 // TODO: Can we avoid checking bound_ctrl/fi here?
4473 // They are only used by permlane*_swap special case.
4474 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4475 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4478 // Set VCC operand with all flags from \p Orig, except for setting it as
4479 // implicit.
4480 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4481 const MachineOperand &Orig) {
4483 for (MachineOperand &Use : MI.implicit_operands()) {
4484 if (Use.isUse() &&
4485 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4486 Use.setIsUndef(Orig.isUndef());
4487 Use.setIsKill(Orig.isKill());
4488 return;
4493 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4494 unsigned Op32) const {
4495 MachineBasicBlock *MBB = MI.getParent();
4497 const MCInstrDesc &Op32Desc = get(Op32);
4498 MachineInstrBuilder Inst32 =
4499 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4500 .setMIFlags(MI.getFlags());
4502 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4503 // For VOPC instructions, this is replaced by an implicit def of vcc.
4505 // We assume the defs of the shrunk opcode are in the same order, and the
4506 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4507 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4508 Inst32.add(MI.getOperand(I));
4510 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4512 int Idx = MI.getNumExplicitDefs();
4513 for (const MachineOperand &Use : MI.explicit_uses()) {
4514 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4515 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4516 continue;
4518 if (&Use == Src2) {
4519 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4520 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4521 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4522 // of vcc was already added during the initial BuildMI, but we
4523 // 1) may need to change vcc to vcc_lo to preserve the original register
4524 // 2) have to preserve the original flags.
4525 copyFlagsToImplicitVCC(*Inst32, *Src2);
4526 continue;
4530 Inst32.add(Use);
4533 // FIXME: Losing implicit operands
4534 fixImplicitOperands(*Inst32);
4535 return Inst32;
4538 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4539 const MachineOperand &MO,
4540 const MCOperandInfo &OpInfo) const {
4541 // Literal constants use the constant bus.
4542 if (!MO.isReg())
4543 return !isInlineConstant(MO, OpInfo);
4545 if (!MO.isUse())
4546 return false;
4548 if (MO.getReg().isVirtual())
4549 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4551 // Null is free
4552 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4553 return false;
4555 // SGPRs use the constant bus
4556 if (MO.isImplicit()) {
4557 return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
4558 MO.getReg() == AMDGPU::VCC_LO;
4560 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4561 AMDGPU::SReg_64RegClass.contains(MO.getReg());
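/// Return the first implicitly read SGPR (VCC, VCC_LO, VCC_HI, M0 or
/// FLAT_SCR) of \p MI, or an invalid Register if there is none.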
4564 static Register findImplicitSGPRRead(const MachineInstr &MI) {
4565 for (const MachineOperand &MO : MI.implicit_operands()) {
4566 // We only care about reads.
4567 if (MO.isDef())
4568 continue;
4570 switch (MO.getReg()) {
4571 case AMDGPU::VCC:
4572 case AMDGPU::VCC_LO:
4573 case AMDGPU::VCC_HI:
4574 case AMDGPU::M0:
4575 case AMDGPU::FLAT_SCR:
4576 return MO.getReg();
4578 default:
4579 break;
4583 return Register();
4586 static bool shouldReadExec(const MachineInstr &MI) {
4587 if (SIInstrInfo::isVALU(MI)) {
4588 switch (MI.getOpcode()) {
4589 case AMDGPU::V_READLANE_B32:
4590 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4591 case AMDGPU::V_WRITELANE_B32:
4592 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4593 return false;
4596 return true;
4599 if (MI.isPreISelOpcode() ||
4600 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4601 SIInstrInfo::isSALU(MI) ||
4602 SIInstrInfo::isSMRD(MI))
4603 return false;
4605 return true;
4608 static bool isRegOrFI(const MachineOperand &MO) {
4609 return MO.isReg() || MO.isFI();
4612 static bool isSubRegOf(const SIRegisterInfo &TRI,
4613 const MachineOperand &SuperVec,
4614 const MachineOperand &SubReg) {
4615 if (SubReg.getReg().isPhysical())
4616 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4618 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4619 SubReg.getReg() == SuperVec.getReg();
4622 // Verify the illegal copy from vector register to SGPR for generic opcode COPY
4623 bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4624 const MachineRegisterInfo &MRI,
4625 StringRef &ErrInfo) const {
4626 Register DstReg = MI.getOperand(0).getReg();
4627 Register SrcReg = MI.getOperand(1).getReg();
4628 // This is a check for copy from vector register to SGPR
4629 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4630 ErrInfo = "illegal copy from vector register to SGPR";
4631 return false;
4633 return true;
4636 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4637 StringRef &ErrInfo) const {
4638 uint16_t Opcode = MI.getOpcode();
4639 const MachineFunction *MF = MI.getParent()->getParent();
4640 const MachineRegisterInfo &MRI = MF->getRegInfo();
4642 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4643 // Find a better property to recognize the point where instruction selection
4644 // is just done.
4645 // We can only enforce this check after SIFixSGPRCopies pass so that the
4646 // illegal copies are legalized and thereafter we don't expect a pass
4647 // inserting similar copies.
4648 if (!MRI.isSSA() && MI.isCopy())
4649 return verifyCopy(MI, MRI, ErrInfo);
4651 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4652 return true;
4654 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4655 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4656 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4657 int Src3Idx = -1;
4658 if (Src0Idx == -1) {
4659 // VOPD V_DUAL_* instructions use different operand names.
4660 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4661 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4662 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4663 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4666 // Make sure the number of operands is correct.
4667 const MCInstrDesc &Desc = get(Opcode);
4668 if (!Desc.isVariadic() &&
4669 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4670 ErrInfo = "Instruction has wrong number of operands.";
4671 return false;
4674 if (MI.isInlineAsm()) {
4675 // Verify register classes for inlineasm constraints.
4676 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4677 I != E; ++I) {
4678 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4679 if (!RC)
4680 continue;
4682 const MachineOperand &Op = MI.getOperand(I);
4683 if (!Op.isReg())
4684 continue;
4686 Register Reg = Op.getReg();
4687 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4688 ErrInfo = "inlineasm operand has incorrect register class.";
4689 return false;
4693 return true;
4696 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4697 ErrInfo = "missing memory operand from image instruction.";
4698 return false;
4701 // Make sure the register classes are correct.
4702 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4703 const MachineOperand &MO = MI.getOperand(i);
4704 if (MO.isFPImm()) {
4705 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4706 "all fp values to integers.";
4707 return false;
4710 int RegClass = Desc.operands()[i].RegClass;
4712 switch (Desc.operands()[i].OperandType) {
4713 case MCOI::OPERAND_REGISTER:
4714 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4715 ErrInfo = "Illegal immediate value for operand.";
4716 return false;
4718 break;
4719 case AMDGPU::OPERAND_REG_IMM_INT32:
4720 case AMDGPU::OPERAND_REG_IMM_FP32:
4721 case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
4722 case AMDGPU::OPERAND_REG_IMM_V2FP32:
4723 break;
4724 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4725 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4726 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4727 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4728 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4729 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
4730 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4731 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4732 case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
4733 case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
4734 case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
4735 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4736 ErrInfo = "Illegal immediate value for operand.";
4737 return false;
4739 break;
4741 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
4742 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4743 ErrInfo = "Expected inline constant for operand.";
4744 return false;
4746 break;
4747 case MCOI::OPERAND_IMMEDIATE:
4748 case AMDGPU::OPERAND_KIMM32:
4749 // Check if this operand is an immediate.
4750 // FrameIndex operands will be replaced by immediates, so they are
4751 // allowed.
4752 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4753 ErrInfo = "Expected immediate, but got non-immediate";
4754 return false;
4756 [[fallthrough]];
4757 default:
4758 continue;
4761 if (!MO.isReg())
4762 continue;
4763 Register Reg = MO.getReg();
4764 if (!Reg)
4765 continue;
4767 // FIXME: Ideally we would have separate instruction definitions with the
4768 // aligned register constraint.
4769 // FIXME: We do not verify inline asm operands, but custom inline asm
4770 // verification is broken anyway
4771 if (ST.needsAlignedVGPRs()) {
4772 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4773 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4774 const TargetRegisterClass *SubRC =
4775 RI.getSubRegisterClass(RC, MO.getSubReg());
4776 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4777 if (RC)
4778 RC = SubRC;
4781 // Check that this is the aligned version of the class.
4782 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4783 ErrInfo = "Subtarget requires even aligned vector registers";
4784 return false;
4788 if (RegClass != -1) {
4789 if (Reg.isVirtual())
4790 continue;
4792 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4793 if (!RC->contains(Reg)) {
4794 ErrInfo = "Operand has incorrect register class.";
4795 return false;
4800 // Verify SDWA
4801 if (isSDWA(MI)) {
4802 if (!ST.hasSDWA()) {
4803 ErrInfo = "SDWA is not supported on this target";
4804 return false;
4807 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4809 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4810 if (OpIdx == -1)
4811 continue;
4812 const MachineOperand &MO = MI.getOperand(OpIdx);
4814 if (!ST.hasSDWAScalar()) {
4815 // Only VGPRs on VI
4816 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4817 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4818 return false;
4820 } else {
4821 // No immediates on GFX9
4822 if (!MO.isReg()) {
4823 ErrInfo =
4824 "Only reg allowed as operands in SDWA instructions on GFX9+";
4825 return false;
4830 if (!ST.hasSDWAOmod()) {
4831 // No omod allowed on VI
4832 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4833 if (OMod != nullptr &&
4834 (!OMod->isImm() || OMod->getImm() != 0)) {
4835 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4836 return false;
4840 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4841 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4842 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4843 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4844 const MachineOperand *Src0ModsMO =
4845 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4846 unsigned Mods = Src0ModsMO->getImm();
4847 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4848 Mods & SISrcMods::SEXT) {
4849 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4850 return false;
4854 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4855 if (isVOPC(BasicOpcode)) {
4856 if (!ST.hasSDWASdst() && DstIdx != -1) {
4857 // Only vcc allowed as dst on VI for VOPC
4858 const MachineOperand &Dst = MI.getOperand(DstIdx);
4859 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4860 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4861 return false;
4863 } else if (!ST.hasSDWAOutModsVOPC()) {
4864 // No clamp allowed on GFX9 for VOPC
4865 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4866 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4867 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4868 return false;
4871 // No omod allowed on GFX9 for VOPC
4872 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4873 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4874 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4875 return false;
4880 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4881 if (DstUnused && DstUnused->isImm() &&
4882 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4883 const MachineOperand &Dst = MI.getOperand(DstIdx);
4884 if (!Dst.isReg() || !Dst.isTied()) {
4885 ErrInfo = "Dst register should have tied register";
4886 return false;
4889 const MachineOperand &TiedMO =
4890 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4891 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4892 ErrInfo =
4893 "Dst register should be tied to implicit use of preserved register";
4894 return false;
4896 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
4897 ErrInfo = "Dst register should use same physical register as preserved";
4898 return false;
4903 // Verify MIMG / VIMAGE / VSAMPLE
4904 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4905 // Ensure that the return type used is large enough for all the options
4906 // being used; TFE/LWE require an extra result register.
4907 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4908 if (DMask) {
4909 uint64_t DMaskImm = DMask->getImm();
4910 uint32_t RegCount =
4911 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
4912 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4913 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4914 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4916 // Adjust for packed 16 bit values
4917 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4918 RegCount = divideCeil(RegCount, 2);
4920 // Adjust if using LWE or TFE
4921 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4922 RegCount += 1;
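// For example, a non-gather load with dmask = 0b0111, packed D16, and TFE
// gives RegCount = ceil(3 / 2) + 1 = 3.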
4924 const uint32_t DstIdx =
4925 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4926 const MachineOperand &Dst = MI.getOperand(DstIdx);
4927 if (Dst.isReg()) {
4928 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4929 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4930 if (RegCount > DstSize) {
4931 ErrInfo = "Image instruction returns too many registers for dst "
4932 "register class";
4933 return false;
4939 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4940 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
4941 unsigned ConstantBusCount = 0;
4942 bool UsesLiteral = false;
4943 const MachineOperand *LiteralVal = nullptr;
4945 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
4946 if (ImmIdx != -1) {
4947 ++ConstantBusCount;
4948 UsesLiteral = true;
4949 LiteralVal = &MI.getOperand(ImmIdx);
4952 SmallVector<Register, 2> SGPRsUsed;
4953 Register SGPRUsed;
4955 // Only look at the true operands. Only a real operand can use the constant
4956 // bus, and we don't want to check pseudo-operands like the source modifier
4957 // flags.
4958 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
4959 if (OpIdx == -1)
4960 continue;
4961 const MachineOperand &MO = MI.getOperand(OpIdx);
4962 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4963 if (MO.isReg()) {
4964 SGPRUsed = MO.getReg();
4965 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
4966 ++ConstantBusCount;
4967 SGPRsUsed.push_back(SGPRUsed);
4969 } else if (!MO.isFI()) { // Treat FI like a register.
4970 if (!UsesLiteral) {
4971 ++ConstantBusCount;
4972 UsesLiteral = true;
4973 LiteralVal = &MO;
4974 } else if (!MO.isIdenticalTo(*LiteralVal)) {
4975 assert(isVOP2(MI) || isVOP3(MI));
4976 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
4977 return false;
4983 SGPRUsed = findImplicitSGPRRead(MI);
4984 if (SGPRUsed) {
4985 // Implicit uses may safely overlap true operands
4986 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4987 return !RI.regsOverlap(SGPRUsed, SGPR);
4988 })) {
4989 ++ConstantBusCount;
4990 SGPRsUsed.push_back(SGPRUsed);
4994 // v_writelane_b32 is an exception to the constant bus restriction: vsrc0 can
4995 // be an SGPR, a constant, or m0, and the lane select an SGPR, m0, or an inline constant.
4996 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4997 Opcode != AMDGPU::V_WRITELANE_B32) {
4998 ErrInfo = "VOP* instruction violates constant bus restriction";
4999 return false;
5002 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5003 ErrInfo = "VOP3 instruction uses literal";
5004 return false;
5008 // Special case for writelane - this can break the multiple constant bus rule,
5009 // but still can't use more than one SGPR register
5010 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5011 unsigned SGPRCount = 0;
5012 Register SGPRUsed;
5014 for (int OpIdx : {Src0Idx, Src1Idx}) {
5015 if (OpIdx == -1)
5016 break;
5018 const MachineOperand &MO = MI.getOperand(OpIdx);
5020 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5021 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5022 if (MO.getReg() != SGPRUsed)
5023 ++SGPRCount;
5024 SGPRUsed = MO.getReg();
5027 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5028 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5029 return false;
5034 // Verify misc. restrictions on specific instructions.
5035 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5036 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5037 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5038 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5039 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5040 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5041 if (!compareMachineOp(Src0, Src1) &&
5042 !compareMachineOp(Src0, Src2)) {
5043 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5044 return false;
5047 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5048 SISrcMods::ABS) ||
5049 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5050 SISrcMods::ABS) ||
5051 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5052 SISrcMods::ABS)) {
5053 ErrInfo = "ABS not allowed in VOP3B instructions";
5054 return false;
5058 if (isSOP2(MI) || isSOPC(MI)) {
5059 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5060 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5062 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5063 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5064 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5065 !Src0.isIdenticalTo(Src1)) {
5066 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5067 return false;
5071 if (isSOPK(MI)) {
5072 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5073 if (Desc.isBranch()) {
5074 if (!Op->isMBB()) {
5075 ErrInfo = "invalid branch target for SOPK instruction";
5076 return false;
5078 } else {
5079 uint64_t Imm = Op->getImm();
5080 if (sopkIsZext(Opcode)) {
5081 if (!isUInt<16>(Imm)) {
5082 ErrInfo = "invalid immediate for SOPK instruction";
5083 return false;
5085 } else {
5086 if (!isInt<16>(Imm)) {
5087 ErrInfo = "invalid immediate for SOPK instruction";
5088 return false;
5094 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5095 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5096 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5097 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5098 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5099 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5101 const unsigned StaticNumOps =
5102 Desc.getNumOperands() + Desc.implicit_uses().size();
5103 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5105 // Allow additional implicit operands. This allows a fixup done by the post
5106 // RA scheduler where the main implicit operand is killed and implicit-defs
5107 // are added for sub-registers that remain live after this instruction.
5108 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5109 ErrInfo = "missing implicit register operands";
5110 return false;
5113 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5114 if (IsDst) {
5115 if (!Dst->isUse()) {
5116 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5117 return false;
5120 unsigned UseOpIdx;
5121 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5122 UseOpIdx != StaticNumOps + 1) {
5123 ErrInfo = "movrel implicit operands should be tied";
5124 return false;
5128 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5129 const MachineOperand &ImpUse
5130 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5131 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5132 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5133 ErrInfo = "src0 should be subreg of implicit vector use";
5134 return false;
5138   // Make sure we aren't losing exec uses in the .td files. This mostly requires
5139   // being careful when using 'let Uses' to add other use registers.
5140 if (shouldReadExec(MI)) {
5141 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5142 ErrInfo = "VALU instruction does not implicitly read exec mask";
5143 return false;
5147 if (isSMRD(MI)) {
5148 if (MI.mayStore() &&
5149 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5150 // The register offset form of scalar stores may only use m0 as the
5151 // soffset register.
5152 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5153 if (Soff && Soff->getReg() != AMDGPU::M0) {
5154 ErrInfo = "scalar stores must use m0 as offset register";
5155 return false;
5160 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5161 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5162 if (Offset->getImm() != 0) {
5163 ErrInfo = "subtarget does not support offsets in flat instructions";
5164 return false;
5168 if (isDS(MI) && !ST.hasGDS()) {
5169 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5170 if (GDSOp && GDSOp->getImm() != 0) {
5171 ErrInfo = "GDS is not supported on this subtarget";
5172 return false;
5176 if (isImage(MI)) {
5177 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5178 if (DimOp) {
5179 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5180 AMDGPU::OpName::vaddr0);
5181 int RSrcOpName =
5182 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5183 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5184 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5185 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5186 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5187 const AMDGPU::MIMGDimInfo *Dim =
5188 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5190 if (!Dim) {
5191 ErrInfo = "dim is out of range";
5192 return false;
5195 bool IsA16 = false;
5196 if (ST.hasR128A16()) {
5197 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5198 IsA16 = R128A16->getImm() != 0;
5199 } else if (ST.hasA16()) {
5200 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5201 IsA16 = A16->getImm() != 0;
5204 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5206 unsigned AddrWords =
5207 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5209 unsigned VAddrWords;
5210 if (IsNSA) {
5211 VAddrWords = RsrcIdx - VAddr0Idx;
5212 if (ST.hasPartialNSAEncoding() &&
5213 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5214 unsigned LastVAddrIdx = RsrcIdx - 1;
5215 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5217 } else {
5218 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5219 if (AddrWords > 12)
5220 AddrWords = 16;
5223 if (VAddrWords != AddrWords) {
5224 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5225 << " but got " << VAddrWords << "\n");
5226 ErrInfo = "bad vaddr size";
5227 return false;
5232 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5233 if (DppCt) {
5234 using namespace AMDGPU::DPP;
5236 unsigned DC = DppCt->getImm();
5237 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5238 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5239 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5240 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5241 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5242 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5243 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5244 ErrInfo = "Invalid dpp_ctrl value";
5245 return false;
5247 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5248 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5249 ErrInfo = "Invalid dpp_ctrl value: "
5250 "wavefront shifts are not supported on GFX10+";
5251 return false;
5253 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5254 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5255 ErrInfo = "Invalid dpp_ctrl value: "
5256 "broadcasts are not supported on GFX10+";
5257 return false;
5259 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5260 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5261 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5262 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5263 !ST.hasGFX90AInsts()) {
5264 ErrInfo = "Invalid dpp_ctrl value: "
5265 "row_newbroadcast/row_share is not supported before "
5266 "GFX90A/GFX10";
5267 return false;
5269 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5270 ErrInfo = "Invalid dpp_ctrl value: "
5271 "row_share and row_xmask are not supported before GFX10";
5272 return false;
5276 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5277 !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) {
5278 ErrInfo = "Invalid dpp_ctrl value: "
5279 "DP ALU dpp only support row_newbcast";
5280 return false;
5284 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5285 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5286 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5287 : AMDGPU::OpName::vdata;
5288 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5289 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5290 if (Data && !Data->isReg())
5291 Data = nullptr;
5293 if (ST.hasGFX90AInsts()) {
5294 if (Dst && Data &&
5295 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5296 ErrInfo = "Invalid register class: "
5297 "vdata and vdst should be both VGPR or AGPR";
5298 return false;
5300 if (Data && Data2 &&
5301 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5302 ErrInfo = "Invalid register class: "
5303 "both data operands should be VGPR or AGPR";
5304 return false;
5306 } else {
5307 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5308 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5309 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5310 ErrInfo = "Invalid register class: "
5311 "agpr loads and stores not supported on this GPU";
5312 return false;
5317 if (ST.needsAlignedVGPRs()) {
5318 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5319 const MachineOperand *Op = getNamedOperand(MI, OpName);
5320 if (!Op)
5321 return true;
5322 Register Reg = Op->getReg();
5323 if (Reg.isPhysical())
5324 return !(RI.getHWRegIndex(Reg) & 1);
5325 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5326 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5327 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5330 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5331 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5332 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5334 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5335 ErrInfo = "Subtarget requires even aligned vector registers "
5336 "for DS_GWS instructions";
5337 return false;
5341 if (isMIMG(MI)) {
5342 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5343 ErrInfo = "Subtarget requires even aligned vector registers "
5344 "for vaddr operand of image instructions";
5345 return false;
5350 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5351 !ST.hasGFX90AInsts()) {
5352 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5353 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5354 ErrInfo = "Invalid register class: "
5355 "v_accvgpr_write with an SGPR is not supported on this GPU";
5356 return false;
5360 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5361 const MachineOperand &SrcOp = MI.getOperand(1);
5362 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5363 ErrInfo = "pseudo expects only physical SGPRs";
5364 return false;
5368 return true;
5371 // It is more readable to list mapped opcodes on the same line.
5372 // clang-format off
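// Map a scalar (SALU) opcode to the VALU opcode or pseudo used when moving the
// instruction to the VALU; returns INSTRUCTION_LIST_END if there is no
// equivalent.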
5374 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5375 switch (MI.getOpcode()) {
5376 default: return AMDGPU::INSTRUCTION_LIST_END;
5377 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5378 case AMDGPU::COPY: return AMDGPU::COPY;
5379 case AMDGPU::PHI: return AMDGPU::PHI;
5380 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5381 case AMDGPU::WQM: return AMDGPU::WQM;
5382 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5383 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5384 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5385 case AMDGPU::S_MOV_B32: {
5386 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5387 return MI.getOperand(1).isReg() ||
5388 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5389 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5391 case AMDGPU::S_ADD_I32:
5392 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5393 case AMDGPU::S_ADDC_U32:
5394 return AMDGPU::V_ADDC_U32_e32;
5395 case AMDGPU::S_SUB_I32:
5396 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5397 // FIXME: These are not consistently handled, and selected when the carry is
5398 // used.
5399 case AMDGPU::S_ADD_U32:
5400 return AMDGPU::V_ADD_CO_U32_e32;
5401 case AMDGPU::S_SUB_U32:
5402 return AMDGPU::V_SUB_CO_U32_e32;
5403 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5404 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5405 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5406 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5407 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5408 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5409 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5410 case AMDGPU::S_XNOR_B32:
5411 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5412 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5413 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5414 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5415 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5416 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5417 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5418 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5419 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5420 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5421 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5422 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5423 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5424 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5425 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5426 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5427 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5428 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5429 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5430 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5431 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5432 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5433 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5434 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5435 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5436 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5437 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5438 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5439 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5440 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5441 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5442 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5443 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5444 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5445 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5446 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5447 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5448 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5449 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5450 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5451 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5452 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5453 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5454 case AMDGPU::S_CVT_F32_F16:
5455 case AMDGPU::S_CVT_HI_F32_F16:
5456 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5457 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5458 case AMDGPU::S_CVT_F16_F32:
5459 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5460 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5461 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5462 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5463 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5464 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5465 case AMDGPU::S_CEIL_F16:
5466 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5467 : AMDGPU::V_CEIL_F16_fake16_e64;
5468 case AMDGPU::S_FLOOR_F16:
5469 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5470 : AMDGPU::V_FLOOR_F16_fake16_e64;
5471 case AMDGPU::S_TRUNC_F16:
5472 return AMDGPU::V_TRUNC_F16_fake16_e64;
5473 case AMDGPU::S_RNDNE_F16:
5474 return AMDGPU::V_RNDNE_F16_fake16_e64;
5475 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5476 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5477 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5478 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5479 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5480 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5481 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5482 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5483 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5484 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5485 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5486 case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5487 case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5488 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5489 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5490 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5491 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
5492 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5493 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5494 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5495 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5496 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5497 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5498 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5499 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5500 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5501 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5502 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5503 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5504 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5505 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5506 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5507 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5508 case AMDGPU::S_CMP_LT_F16:
5509 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5510 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5511 case AMDGPU::S_CMP_EQ_F16:
5512 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5513 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5514 case AMDGPU::S_CMP_LE_F16:
5515 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5516 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5517 case AMDGPU::S_CMP_GT_F16:
5518 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5519 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5520 case AMDGPU::S_CMP_LG_F16:
5521 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5522 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5523 case AMDGPU::S_CMP_GE_F16:
5524 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5525 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5526 case AMDGPU::S_CMP_O_F16:
5527 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5528 : AMDGPU::V_CMP_O_F16_fake16_e64;
5529 case AMDGPU::S_CMP_U_F16:
5530 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5531 : AMDGPU::V_CMP_U_F16_fake16_e64;
5532 case AMDGPU::S_CMP_NGE_F16:
5533 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5534 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5535 case AMDGPU::S_CMP_NLG_F16:
5536 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5537 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5538 case AMDGPU::S_CMP_NGT_F16:
5539 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5540 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5541 case AMDGPU::S_CMP_NLE_F16:
5542 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5543 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5544 case AMDGPU::S_CMP_NEQ_F16:
5545 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5546 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5547 case AMDGPU::S_CMP_NLT_F16:
5548 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5549 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5550 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5551 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5552 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5553 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5554 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5555 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5556 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5557 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5558 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5559 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5561 llvm_unreachable(
5562 "Unexpected scalar opcode without corresponding vector one!");
5565 // clang-format on
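// Save the live EXEC mask into \p Reg and set every bit of EXEC. When SCC must
// stay live this uses two S_MOVs; otherwise a single S_OR_SAVEEXEC (which
// clobbers SCC) is emitted. New instructions are optionally recorded in
// \p Indexes.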
5567 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5568 MachineBasicBlock &MBB,
5569 MachineBasicBlock::iterator MBBI,
5570 const DebugLoc &DL, Register Reg,
5571 bool IsSCCLive,
5572 SlotIndexes *Indexes) const {
5573 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5574 const SIInstrInfo *TII = ST.getInstrInfo();
5575 bool IsWave32 = ST.isWave32();
5576 if (IsSCCLive) {
5577 // Insert two move instructions, one to save the original value of EXEC and
5578 // the other to turn on all bits in EXEC. This is required as we can't use
5579 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5580 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5581 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5582 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5583 .addReg(Exec, RegState::Kill);
5584 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5585 if (Indexes) {
5586 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5587 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5589 } else {
5590 const unsigned OrSaveExec =
5591 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5592 auto SaveExec =
5593 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5594 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5595 if (Indexes)
5596 Indexes->insertMachineInstrInMaps(*SaveExec);
5600 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5601 MachineBasicBlock::iterator MBBI,
5602 const DebugLoc &DL, Register Reg,
5603 SlotIndexes *Indexes) const {
5604 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5605 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5606 auto ExecRestoreMI =
5607 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5608 if (Indexes)
5609 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
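// Possibly narrow a combined AV_* (VGPR or AGPR) register class to its
// VGPR-only counterpart for load/store, DS and MIMG operands, then return the
// properly aligned register class for this subtarget.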
5612 static const TargetRegisterClass *
5613 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5614 const MachineRegisterInfo &MRI,
5615 const MCInstrDesc &TID, unsigned RCID,
5616 bool IsAllocatable) {
5617 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5618 (((TID.mayLoad() || TID.mayStore()) &&
5619 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5620 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
5621 switch (RCID) {
5622 case AMDGPU::AV_32RegClassID:
5623 RCID = AMDGPU::VGPR_32RegClassID;
5624 break;
5625 case AMDGPU::AV_64RegClassID:
5626 RCID = AMDGPU::VReg_64RegClassID;
5627 break;
5628 case AMDGPU::AV_96RegClassID:
5629 RCID = AMDGPU::VReg_96RegClassID;
5630 break;
5631 case AMDGPU::AV_128RegClassID:
5632 RCID = AMDGPU::VReg_128RegClassID;
5633 break;
5634 case AMDGPU::AV_160RegClassID:
5635 RCID = AMDGPU::VReg_160RegClassID;
5636 break;
5637 case AMDGPU::AV_512RegClassID:
5638 RCID = AMDGPU::VReg_512RegClassID;
5639 break;
5640 default:
5641 break;
5645 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
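// TargetInstrInfo hook: return the register class expected for operand
// \p OpNum of \p TID, after subtarget-specific adjustment of combined AV_*
// classes.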
5648 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
5649 unsigned OpNum, const TargetRegisterInfo *TRI,
5650 const MachineFunction &MF)
5651 const {
5652 if (OpNum >= TID.getNumOperands())
5653 return nullptr;
5654 auto RegClass = TID.operands()[OpNum].RegClass;
5655 bool IsAllocatable = false;
5656 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
5657 // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
5658     // with two data operands. Request a register class constrained to VGPR only
5659     // if both operands are present, as Machine Copy Propagation cannot check this
5660     // constraint (and possibly other passes cannot either).
5662 // The check is limited to FLAT and DS because atomics in non-flat encoding
5663 // have their vdst and vdata tied to be the same register.
5664 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5665 AMDGPU::OpName::vdst);
5666 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5667 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5668 : AMDGPU::OpName::vdata);
5669 if (DataIdx != -1) {
5670 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5671 TID.Opcode, AMDGPU::OpName::data1);
5674 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5675 IsAllocatable);
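// Return the register class of operand \p OpNo of \p MI. For variadic
// instructions or untyped operands this falls back to the class of the
// register actually used.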
5678 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
5679 unsigned OpNo) const {
5680 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5681 const MCInstrDesc &Desc = get(MI.getOpcode());
5682 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5683 Desc.operands()[OpNo].RegClass == -1) {
5684 Register Reg = MI.getOperand(OpNo).getReg();
5686 if (Reg.isVirtual())
5687 return MRI.getRegClass(Reg);
5688 return RI.getPhysRegBaseClass(Reg);
5691 unsigned RCID = Desc.operands()[OpNo].RegClass;
5692 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
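// Legalize operand \p OpIdx by copying or materializing its current value into
// a new virtual register of the equivalent VGPR class and rewriting the
// operand to use that register.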
5695 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
5696 MachineBasicBlock::iterator I = MI;
5697 MachineBasicBlock *MBB = MI.getParent();
5698 MachineOperand &MO = MI.getOperand(OpIdx);
5699 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
5700 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5701 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5702 unsigned Size = RI.getRegSizeInBits(*RC);
5703 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
5704 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
5705 : AMDGPU::V_MOV_B32_e32;
5706 if (MO.isReg())
5707 Opcode = AMDGPU::COPY;
5708 else if (RI.isSGPRClass(RC))
5709 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5711 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5712 Register Reg = MRI.createVirtualRegister(VRC);
5713 DebugLoc DL = MBB->findDebugLoc(I);
5714 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5715 MO.ChangeToRegister(Reg, false);
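// Extract subregister \p SubIdx of \p SuperReg. Physical registers are handled
// directly; for virtual registers a COPY into a new register of class \p SubRC
// is emitted and that register is returned.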
5718 unsigned SIInstrInfo::buildExtractSubReg(
5719 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
5720 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5721 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5722 if (!SuperReg.getReg().isVirtual())
5723 return RI.getSubReg(SuperReg.getReg(), SubIdx);
5725 MachineBasicBlock *MBB = MI->getParent();
5726 DebugLoc DL = MI->getDebugLoc();
5727 Register SubReg = MRI.createVirtualRegister(SubRC);
5729 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
5730 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5731 .addReg(SuperReg.getReg(), 0, NewSubIdx);
5732 return SubReg;
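// Like buildExtractSubReg, but an immediate operand is handled by returning
// the requested 32-bit half of the 64-bit immediate.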
5735 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
5736 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
5737 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5738 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5739 if (Op.isImm()) {
5740 if (SubIdx == AMDGPU::sub0)
5741 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5742 if (SubIdx == AMDGPU::sub1)
5743 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5745 llvm_unreachable("Unhandled register index for immediate");
5748 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5749 SubIdx, SubRC);
5750 return MachineOperand::CreateReg(SubReg, false);
5753 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
5754 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5755 assert(Inst.getNumExplicitOperands() == 3);
5756 MachineOperand Op1 = Inst.getOperand(1);
5757 Inst.removeOperand(1);
5758 Inst.addOperand(Op1);
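// Return true if register operand \p MO is compatible with the register class
// required by \p OpInfo, taking any subregister index into account.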
5761 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
5762 const MCOperandInfo &OpInfo,
5763 const MachineOperand &MO) const {
5764 if (!MO.isReg())
5765 return false;
5767 Register Reg = MO.getReg();
5769 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5770 if (Reg.isPhysical())
5771 return DRC->contains(Reg);
5773 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5775 if (MO.getSubReg()) {
5776 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5777 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5778 if (!SuperRC)
5779 return false;
5781 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5782 if (!DRC)
5783 return false;
5785 return RC->hasSuperClassEq(DRC);
5788 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
5789 const MCOperandInfo &OpInfo,
5790 const MachineOperand &MO) const {
5791 if (MO.isReg())
5792 return isLegalRegOperand(MRI, OpInfo, MO);
5794 // Handle non-register types that are treated like immediates.
5795 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5796 return true;
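// Return true if \p MO (or, when null, the operand already at \p OpIdx) would
// be legal at operand index \p OpIdx of \p MI, checking constant bus and
// literal limits as well as register class and AGPR restrictions.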
5799 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5800 const MachineOperand *MO) const {
5801 const MachineFunction &MF = *MI.getParent()->getParent();
5802 const MachineRegisterInfo &MRI = MF.getRegInfo();
5803 const MCInstrDesc &InstDesc = MI.getDesc();
5804 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5805 const TargetRegisterClass *DefinedRC =
5806 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5807 if (!MO)
5808 MO = &MI.getOperand(OpIdx);
5810 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5811 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5812 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5813 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5814 return false;
5816 SmallDenseSet<RegSubRegPair> SGPRsUsed;
5817 if (MO->isReg())
5818 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5820 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5821 if (i == OpIdx)
5822 continue;
5823 const MachineOperand &Op = MI.getOperand(i);
5824 if (Op.isReg()) {
5825 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5826 if (!SGPRsUsed.count(SGPR) &&
5827 // FIXME: This can access off the end of the operands() array.
5828 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5829 if (--ConstantBusLimit <= 0)
5830 return false;
5831 SGPRsUsed.insert(SGPR);
5833 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5834 !isInlineConstant(Op, InstDesc.operands()[i])) {
5835 if (!LiteralLimit--)
5836 return false;
5837 if (--ConstantBusLimit <= 0)
5838 return false;
5841 } else if (ST.hasNoF16PseudoScalarTransInlineConstants() && !MO->isReg() &&
5842 isF16PseudoScalarTrans(MI.getOpcode()) &&
5843 isInlineConstant(*MO, OpInfo)) {
5844 return false;
5847 if (MO->isReg()) {
5848 if (!DefinedRC)
5849 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5850 if (!isLegalRegOperand(MRI, OpInfo, *MO))
5851 return false;
5852 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5853 if (IsAGPR && !ST.hasMAIInsts())
5854 return false;
5855 unsigned Opc = MI.getOpcode();
5856 if (IsAGPR &&
5857 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5858 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5859 return false;
5860 // Atomics should have both vdst and vdata either vgpr or agpr.
5861 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5862 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
5863 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5864 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5865 MI.getOperand(DataIdx).isReg() &&
5866 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5867 return false;
5868 if ((int)OpIdx == DataIdx) {
5869 if (VDstIdx != -1 &&
5870 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5871 return false;
5872 // DS instructions with 2 src operands also must have tied RC.
5873 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5874 AMDGPU::OpName::data1);
5875 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5876 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5877 return false;
5879 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5880 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5881 RI.isSGPRReg(MRI, MO->getReg()))
5882 return false;
5883 return true;
5886 if (MO->isImm()) {
5887 uint64_t Imm = MO->getImm();
5888 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5889 bool Is64BitOp = Is64BitFPOp ||
5890 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
5891 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
5892 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
5893 if (Is64BitOp &&
5894 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
5895 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
5896 return false;
5898 // FIXME: We can use sign extended 64-bit literals, but only for signed
5899 // operands. At the moment we do not know if an operand is signed.
5900       // Such an operand will be encoded as its low 32 bits and then either
5901 // correctly sign extended or incorrectly zero extended by HW.
5902 if (!Is64BitFPOp && (int32_t)Imm < 0)
5903 return false;
5907 // Handle non-register types that are treated like immediates.
5908 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5910 if (!DefinedRC) {
5911 // This operand expects an immediate.
5912 return true;
5915 return isImmOperandLegal(MI, OpIdx, *MO);
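// Legalize the operands of a VOP2 instruction: handle the writelane/readlane
// special cases with V_READFIRSTLANE, keep AGPRs out of src0/src1, and make
// src1 legal either by commuting the instruction or by inserting a move.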
5918 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
5919 MachineInstr &MI) const {
5920 unsigned Opc = MI.getOpcode();
5921 const MCInstrDesc &InstrDesc = get(Opc);
5923 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5924 MachineOperand &Src0 = MI.getOperand(Src0Idx);
5926 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5927 MachineOperand &Src1 = MI.getOperand(Src1Idx);
5929   // If there is an implicit SGPR use such as the VCC use for v_addc_u32/v_subb_u32,
5930   // we are limited to a single constant bus use before GFX10.
5931 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
5932 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
5933 RI.isSGPRReg(MRI, Src0.getReg()))
5934 legalizeOpWithMove(MI, Src0Idx);
5936 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
5937 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5938 // src0/src1 with V_READFIRSTLANE.
5939 if (Opc == AMDGPU::V_WRITELANE_B32) {
5940 const DebugLoc &DL = MI.getDebugLoc();
5941 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
5942 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5943 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5944 .add(Src0);
5945 Src0.ChangeToRegister(Reg, false);
5947 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
5948 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5949 const DebugLoc &DL = MI.getDebugLoc();
5950 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5951 .add(Src1);
5952 Src1.ChangeToRegister(Reg, false);
5954 return;
5957 // No VOP2 instructions support AGPRs.
5958 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
5959 legalizeOpWithMove(MI, Src0Idx);
5961 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
5962 legalizeOpWithMove(MI, Src1Idx);
5964 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
5965 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
5966 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5967 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
5968 legalizeOpWithMove(MI, Src2Idx);
5971   // The src0 operand of VOP2 instructions supports all operand types, so we
5972   // don't need to check its legality. If src1 is already legal, we are done.
5973 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
5974 return;
5976 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
5977 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
5978 // select is uniform.
5979 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
5980 RI.isVGPR(MRI, Src1.getReg())) {
5981 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5982 const DebugLoc &DL = MI.getDebugLoc();
5983 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5984 .add(Src1);
5985 Src1.ChangeToRegister(Reg, false);
5986 return;
5989 // We do not use commuteInstruction here because it is too aggressive and will
5990 // commute if it is possible. We only want to commute here if it improves
5991 // legality. This can be called a fairly large number of times so don't waste
5992 // compile time pointlessly swapping and checking legality again.
5993 if (HasImplicitSGPR || !MI.isCommutable()) {
5994 legalizeOpWithMove(MI, Src1Idx);
5995 return;
5998 // If src0 can be used as src1, commuting will make the operands legal.
5999 // Otherwise we have to give up and insert a move.
6001 // TODO: Other immediate-like operand kinds could be commuted if there was a
6002 // MachineOperand::ChangeTo* for them.
6003 if ((!Src1.isImm() && !Src1.isReg()) ||
6004 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6005 legalizeOpWithMove(MI, Src1Idx);
6006 return;
6009 int CommutedOpc = commuteOpcode(MI);
6010 if (CommutedOpc == -1) {
6011 legalizeOpWithMove(MI, Src1Idx);
6012 return;
6015 MI.setDesc(get(CommutedOpc));
6017 Register Src0Reg = Src0.getReg();
6018 unsigned Src0SubReg = Src0.getSubReg();
6019 bool Src0Kill = Src0.isKill();
6021 if (Src1.isImm())
6022 Src0.ChangeToImmediate(Src1.getImm());
6023 else if (Src1.isReg()) {
6024 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6025 Src0.setSubReg(Src1.getSubReg());
6026 } else
6027 llvm_unreachable("Should only have register or immediate operands");
6029 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6030 Src1.setSubReg(Src0SubReg);
6031 fixImplicitOperands(MI);
6034 // Legalize VOP3 operands. All operand types are supported for any operand,
6035 // but only one literal constant is allowed, and only starting from GFX10.
6036 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6037 MachineInstr &MI) const {
6038 unsigned Opc = MI.getOpcode();
6040 int VOP3Idx[3] = {
6041 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6042 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6043 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6046 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6047 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
6048 // src1 and src2 must be scalar
6049 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6050 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6051 const DebugLoc &DL = MI.getDebugLoc();
6052 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6053 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6054 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6055 .add(Src1);
6056 Src1.ChangeToRegister(Reg, false);
6058 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6059 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6060 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6061 .add(Src2);
6062 Src2.ChangeToRegister(Reg, false);
6066 // Find the one SGPR operand we are allowed to use.
6067 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6068 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6069 SmallDenseSet<unsigned> SGPRsUsed;
6070 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6071 if (SGPRReg) {
6072 SGPRsUsed.insert(SGPRReg);
6073 --ConstantBusLimit;
6076 for (int Idx : VOP3Idx) {
6077 if (Idx == -1)
6078 break;
6079 MachineOperand &MO = MI.getOperand(Idx);
6081 if (!MO.isReg()) {
6082 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6083 continue;
6085 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6086 --LiteralLimit;
6087 --ConstantBusLimit;
6088 continue;
6091 --LiteralLimit;
6092 --ConstantBusLimit;
6093 legalizeOpWithMove(MI, Idx);
6094 continue;
6097 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6098 !isOperandLegal(MI, Idx, &MO)) {
6099 legalizeOpWithMove(MI, Idx);
6100 continue;
6103 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6104 continue; // VGPRs are legal
6106 // We can use one SGPR in each VOP3 instruction prior to GFX10
6107 // and two starting from GFX10.
6108 if (SGPRsUsed.count(MO.getReg()))
6109 continue;
6110 if (ConstantBusLimit > 0) {
6111 SGPRsUsed.insert(MO.getReg());
6112 --ConstantBusLimit;
6113 continue;
6116 // If we make it this far, then the operand is not legal and we must
6117 // legalize it.
6118 legalizeOpWithMove(MI, Idx);
6121 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6122 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6123 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6124 legalizeOpWithMove(MI, VOP3Idx[2]);
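// Copy the (assumed uniform) value of VGPR \p SrcReg into a new SGPR virtual
// register using V_READFIRSTLANE, reading wide registers 32 bits at a time and
// reassembling the pieces with REG_SEQUENCE.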
6127 Register SIInstrInfo::readlaneVGPRToSGPR(
6128 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6129 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6130 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6131 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6132 if (DstRC)
6133 SRC = RI.getCommonSubClass(SRC, DstRC);
6135 Register DstReg = MRI.createVirtualRegister(SRC);
6136 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6138 if (RI.hasAGPRs(VRC)) {
6139 VRC = RI.getEquivalentVGPRClass(VRC);
6140 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6141 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6142 get(TargetOpcode::COPY), NewSrcReg)
6143 .addReg(SrcReg);
6144 SrcReg = NewSrcReg;
6147 if (SubRegs == 1) {
6148 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6149 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6150 .addReg(SrcReg);
6151 return DstReg;
6154 SmallVector<Register, 8> SRegs;
6155 for (unsigned i = 0; i < SubRegs; ++i) {
6156 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6157 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6158 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6159 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6160 SRegs.push_back(SGPR);
6163 MachineInstrBuilder MIB =
6164 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6165 get(AMDGPU::REG_SEQUENCE), DstReg);
6166 for (unsigned i = 0; i < SubRegs; ++i) {
6167 MIB.addReg(SRegs[i]);
6168 MIB.addImm(RI.getSubRegFromChannel(i));
6170 return DstReg;
6173 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6174 MachineInstr &MI) const {
6176   // If the pointer is stored in VGPRs, then we need to move it to
6177   // SGPRs using v_readfirstlane. This is safe because we only select
6178   // loads with uniform pointers to SMRD instructions, so we know the
6179   // pointer value is uniform.
6180 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6181 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6182 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6183 SBase->setReg(SGPR);
6185 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6186 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6187 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6188 SOff->setReg(SGPR);
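// If a segment-specific FLAT instruction holds a VGPR in its saddr operand,
// rewrite it to the corresponding vaddr (global) or SV (scratch) form.
// Returns true if the instruction was changed.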
6192 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6193 unsigned Opc = Inst.getOpcode();
6194 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6195 if (OldSAddrIdx < 0)
6196 return false;
6198 assert(isSegmentSpecificFLAT(Inst));
6200 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6201 if (NewOpc < 0)
6202 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6203 if (NewOpc < 0)
6204 return false;
6206 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6207 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6208 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6209 return false;
6211 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6212 if (NewVAddrIdx < 0)
6213 return false;
6215 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6217   // Check vaddr: it must be zero or absent.
6218 MachineInstr *VAddrDef = nullptr;
6219 if (OldVAddrIdx >= 0) {
6220 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6221 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6222 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6223 !VAddrDef->getOperand(1).isImm() ||
6224 VAddrDef->getOperand(1).getImm() != 0)
6225 return false;
6228 const MCInstrDesc &NewDesc = get(NewOpc);
6229 Inst.setDesc(NewDesc);
6231 // Callers expect iterator to be valid after this call, so modify the
6232 // instruction in place.
6233 if (OldVAddrIdx == NewVAddrIdx) {
6234 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6235 // Clear use list from the old vaddr holding a zero register.
6236 MRI.removeRegOperandFromUseList(&NewVAddr);
6237 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6238 Inst.removeOperand(OldSAddrIdx);
6239 // Update the use list with the pointer we have just moved from vaddr to
6240 // saddr position. Otherwise new vaddr will be missing from the use list.
6241 MRI.removeRegOperandFromUseList(&NewVAddr);
6242 MRI.addRegOperandToUseList(&NewVAddr);
6243 } else {
6244 assert(OldSAddrIdx == NewVAddrIdx);
6246 if (OldVAddrIdx >= 0) {
6247 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6248 AMDGPU::OpName::vdst_in);
6250       // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6251       // it asserts. Untie the operands for now and retie them afterwards.
6252 if (NewVDstIn != -1) {
6253 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6254 Inst.untieRegOperand(OldVDstIn);
6257 Inst.removeOperand(OldVAddrIdx);
6259 if (NewVDstIn != -1) {
6260 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6261 Inst.tieOperands(NewVDst, NewVDstIn);
6266 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6267 VAddrDef->eraseFromParent();
6269 return true;
6272 // FIXME: Remove this when SelectionDAG is obsoleted.
6273 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6274 MachineInstr &MI) const {
6275 if (!isSegmentSpecificFLAT(MI))
6276 return;
6278   // Fix up SGPR operands held in VGPRs. We only select these when the DAG
6279   // divergence analysis thinks they are uniform, so a readfirstlane should be valid.
6280 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6281 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6282 return;
6284 if (moveFlatAddrToVGPR(MI))
6285 return;
6287 const TargetRegisterClass *DeclaredRC = getRegClass(
6288 MI.getDesc(), SAddr->getOperandNo(), &RI, *MI.getParent()->getParent());
6290 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6291 SAddr->setReg(ToSGPR);
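// Ensure \p Op is in register class \p DstRC by inserting a COPY into a new
// virtual register, folding a defining move-immediate into the copy when
// possible and adding an implicit EXEC use to VGPR copies when required.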
6294 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6295 MachineBasicBlock::iterator I,
6296 const TargetRegisterClass *DstRC,
6297 MachineOperand &Op,
6298 MachineRegisterInfo &MRI,
6299 const DebugLoc &DL) const {
6300 Register OpReg = Op.getReg();
6301 unsigned OpSubReg = Op.getSubReg();
6303 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6304 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6306 // Check if operand is already the correct register class.
6307 if (DstRC == OpRC)
6308 return;
6310 Register DstReg = MRI.createVirtualRegister(DstRC);
6311 auto Copy =
6312 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6313 Op.setReg(DstReg);
6315 MachineInstr *Def = MRI.getVRegDef(OpReg);
6316 if (!Def)
6317 return;
6319 // Try to eliminate the copy if it is copying an immediate value.
6320 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6321 foldImmediate(*Copy, *Def, OpReg, &MRI);
6323 bool ImpDef = Def->isImplicitDef();
6324 while (!ImpDef && Def && Def->isCopy()) {
6325 if (Def->getOperand(1).getReg().isPhysical())
6326 break;
6327 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6328 ImpDef = Def && Def->isImplicitDef();
6330 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6331 !ImpDef)
6332 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6335 // Emit the actual waterfall loop, executing the wrapped instruction for each
6336 // unique value of \p ScalarOps across all lanes. In the best case we execute 1
6337 // iteration, in the worst case we execute once per lane (up to 64 iterations).
6338 static void
6339 emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
6340 MachineRegisterInfo &MRI,
6341 MachineBasicBlock &LoopBB,
6342 MachineBasicBlock &BodyBB,
6343 const DebugLoc &DL,
6344 ArrayRef<MachineOperand *> ScalarOps) {
6345 MachineFunction &MF = *LoopBB.getParent();
6346 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6347 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6348 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6349 unsigned SaveExecOpc =
6350 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6351 unsigned XorTermOpc =
6352 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6353 unsigned AndOpc =
6354 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6355 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6357 MachineBasicBlock::iterator I = LoopBB.begin();
6358 Register CondReg;
6360 for (MachineOperand *ScalarOp : ScalarOps) {
6361 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6362 unsigned NumSubRegs = RegSize / 32;
6363 Register VScalarOp = ScalarOp->getReg();
6365 if (NumSubRegs == 1) {
6366 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6368 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6369 .addReg(VScalarOp);
6371 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6373 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6374 .addReg(CurReg)
6375 .addReg(VScalarOp);
6377 // Combine the comparison results with AND.
6378 if (!CondReg) // First.
6379 CondReg = NewCondReg;
6380 else { // If not the first, we create an AND.
6381 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6382 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6383 .addReg(CondReg)
6384 .addReg(NewCondReg);
6385 CondReg = AndReg;
6388 // Update ScalarOp operand to use the SGPR ScalarOp.
6389 ScalarOp->setReg(CurReg);
6390 ScalarOp->setIsKill();
6391 } else {
6392 SmallVector<Register, 8> ReadlanePieces;
6393 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6394 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6395 "Unhandled register size");
6397 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6398 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6399 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6401 // Read the next variant <- also loop target.
6402 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6403 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6405 // Read the next variant <- also loop target.
6406 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6407 .addReg(VScalarOp, VScalarOpUndef,
6408 TRI->getSubRegFromChannel(Idx + 1));
6410 ReadlanePieces.push_back(CurRegLo);
6411 ReadlanePieces.push_back(CurRegHi);
6413 // Comparison is to be done as 64-bit.
6414 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6415 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6416 .addReg(CurRegLo)
6417 .addImm(AMDGPU::sub0)
6418 .addReg(CurRegHi)
6419 .addImm(AMDGPU::sub1);
6421 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6422 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6423 NewCondReg)
6424 .addReg(CurReg);
6425 if (NumSubRegs <= 2)
6426 Cmp.addReg(VScalarOp);
6427 else
6428 Cmp.addReg(VScalarOp, VScalarOpUndef,
6429 TRI->getSubRegFromChannel(Idx, 2));
6431 // Combine the comparison results with AND.
6432 if (!CondReg) // First.
6433 CondReg = NewCondReg;
6434 else { // If not the first, we create an AND.
6435 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6436 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6437 .addReg(CondReg)
6438 .addReg(NewCondReg);
6439 CondReg = AndReg;
6441 } // End for loop.
6443 const auto *SScalarOpRC =
6444 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6445 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6447 // Build scalar ScalarOp.
6448 auto Merge =
6449 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6450 unsigned Channel = 0;
6451 for (Register Piece : ReadlanePieces) {
6452 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6455 // Update ScalarOp operand to use the SGPR ScalarOp.
6456 ScalarOp->setReg(SScalarOp);
6457 ScalarOp->setIsKill();
6461 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6462 MRI.setSimpleHint(SaveExec, CondReg);
6464 // Update EXEC to matching lanes, saving original to SaveExec.
6465 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6466 .addReg(CondReg, RegState::Kill);
6468 // The original instruction is here; we insert the terminators after it.
6469 I = BodyBB.end();
6471 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6472 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6473 .addReg(Exec)
6474 .addReg(SaveExec);
6476 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6479 // Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6480 // with SGPRs by iterating over all unique values across all lanes.
6481 // Returns the loop basic block that now contains \p MI.
6482 static MachineBasicBlock *
6483 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6484 ArrayRef<MachineOperand *> ScalarOps,
6485 MachineDominatorTree *MDT,
6486 MachineBasicBlock::iterator Begin = nullptr,
6487 MachineBasicBlock::iterator End = nullptr) {
6488 MachineBasicBlock &MBB = *MI.getParent();
6489 MachineFunction &MF = *MBB.getParent();
6490 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6491 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6492 MachineRegisterInfo &MRI = MF.getRegInfo();
6493 if (!Begin.isValid())
6494 Begin = &MI;
6495 if (!End.isValid()) {
6496 End = &MI;
6497 ++End;
6499 const DebugLoc &DL = MI.getDebugLoc();
6500 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6501 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6502 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6504 // Save SCC. Waterfall Loop may overwrite SCC.
6505 Register SaveSCCReg;
6507   // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6508   // rather than doing an unlimited scan everywhere.
6509 bool SCCNotDead =
6510 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6511 std::numeric_limits<unsigned>::max()) !=
6512 MachineBasicBlock::LQR_Dead;
6513 if (SCCNotDead) {
6514 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6515 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6516 .addImm(1)
6517 .addImm(0);
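// S_CSELECT_B32 1, 0 materializes the current SCC bit into SaveSCCReg; it is
// re-established after the loop with S_CMP_LG_U32 SaveSCCReg, 0.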
6520 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6522 // Save the EXEC mask
6523 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6525 // Killed uses in the instruction we are waterfalling around will be
6526 // incorrect due to the added control-flow.
6527 MachineBasicBlock::iterator AfterMI = MI;
6528 ++AfterMI;
6529 for (auto I = Begin; I != AfterMI; I++) {
6530 for (auto &MO : I->all_uses())
6531 MRI.clearKillFlags(MO.getReg());
6534 // To insert the loop we need to split the block. Move everything after this
6535 // point to a new block, and insert a new empty block between the two.
6536 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
6537 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
6538 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6539 MachineFunction::iterator MBBI(MBB);
6540 ++MBBI;
6542 MF.insert(MBBI, LoopBB);
6543 MF.insert(MBBI, BodyBB);
6544 MF.insert(MBBI, RemainderBB);
6546 LoopBB->addSuccessor(BodyBB);
6547 BodyBB->addSuccessor(LoopBB);
6548 BodyBB->addSuccessor(RemainderBB);
6550 // Move the instructions from Begin to MI into BodyBB, and the remainder of
6551 // the block into RemainderBB.
6552 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6553 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6554 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6556 MBB.addSuccessor(LoopBB);
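// The resulting CFG is:
//   MBB -> LoopBB -> BodyBB -> RemainderBB
//            ^__________/
// with BodyBB branching back to LoopBB until every unique scalar value across
// the active lanes has been processed.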
6558 // Update dominators. We know that MBB immediately dominates LoopBB, that
6559 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6560 // RemainderBB. RemainderBB immediately dominates all of the successors
6561 // transferred to it from MBB that MBB used to properly dominate.
6562 if (MDT) {
6563 MDT->addNewBlock(LoopBB, &MBB);
6564 MDT->addNewBlock(BodyBB, LoopBB);
6565 MDT->addNewBlock(RemainderBB, BodyBB);
6566 for (auto &Succ : RemainderBB->successors()) {
6567 if (MDT->properlyDominates(&MBB, Succ)) {
6568 MDT->changeImmediateDominator(Succ, RemainderBB);
6573 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
6575 MachineBasicBlock::iterator First = RemainderBB->begin();
6576 // Restore SCC
6577 if (SCCNotDead) {
6578 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6579 .addReg(SaveSCCReg, RegState::Kill)
6580 .addImm(0);
6583 // Restore the EXEC mask
6584 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6585 return BodyBB;
6588 // Extract the pointer from Rsrc and return a replacement Rsrc with a zero base address and the default data format.
6589 static std::tuple<unsigned, unsigned>
6590 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
6591 MachineBasicBlock &MBB = *MI.getParent();
6592 MachineFunction &MF = *MBB.getParent();
6593 MachineRegisterInfo &MRI = MF.getRegInfo();
6595 // Extract the ptr from the resource descriptor.
6596 unsigned RsrcPtr =
6597 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6598 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6600 // Create an empty resource descriptor
6601 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6602 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6603 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6604 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6605 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6607 // Zero64 = 0
6608 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6609 .addImm(0);
6611 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6612 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6613 .addImm(Lo_32(RsrcDataFormat));
6615 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6616 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6617 .addImm(Hi_32(RsrcDataFormat));
6619 // NewSRsrc = {Zero64, SRsrcFormat}
6620 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6621 .addReg(Zero64)
6622 .addImm(AMDGPU::sub0_sub1)
6623 .addReg(SRsrcFormatLo)
6624 .addImm(AMDGPU::sub2)
6625 .addReg(SRsrcFormatHi)
6626 .addImm(AMDGPU::sub3);
6628 return std::tuple(RsrcPtr, NewSRsrc);
6631 MachineBasicBlock *
6632 SIInstrInfo::legalizeOperands(MachineInstr &MI,
6633 MachineDominatorTree *MDT) const {
6634 MachineFunction &MF = *MI.getParent()->getParent();
6635 MachineRegisterInfo &MRI = MF.getRegInfo();
6636 MachineBasicBlock *CreatedBB = nullptr;
6638 // Legalize VOP2
6639 if (isVOP2(MI) || isVOPC(MI)) {
6640 legalizeOperandsVOP2(MRI, MI);
6641 return CreatedBB;
6644 // Legalize VOP3
6645 if (isVOP3(MI)) {
6646 legalizeOperandsVOP3(MRI, MI);
6647 return CreatedBB;
6650 // Legalize SMRD
6651 if (isSMRD(MI)) {
6652 legalizeOperandsSMRD(MRI, MI);
6653 return CreatedBB;
6656 // Legalize FLAT
6657 if (isFLAT(MI)) {
6658 legalizeOperandsFLAT(MRI, MI);
6659 return CreatedBB;
6662 // Legalize REG_SEQUENCE and PHI
6663 // The register class of the operands must be the same type as the register
6664 // class of the output.
6665 if (MI.getOpcode() == AMDGPU::PHI) {
6666 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6667 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6668 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6669 continue;
6670 const TargetRegisterClass *OpRC =
6671 MRI.getRegClass(MI.getOperand(i).getReg());
6672 if (RI.hasVectorRegisters(OpRC)) {
6673 VRC = OpRC;
6674 } else {
6675 SRC = OpRC;
6679 // If any of the operands are VGPR registers, then they all must be VGPRs,
6680 // otherwise we will create illegal VGPR->SGPR copies when legalizing
6681 // them.
6682 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6683 if (!VRC) {
6684 assert(SRC);
6685 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6686 VRC = &AMDGPU::VReg_1RegClass;
6687 } else
6688 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6689 ? RI.getEquivalentAGPRClass(SRC)
6690 : RI.getEquivalentVGPRClass(SRC);
6691 } else {
6692 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6693 ? RI.getEquivalentAGPRClass(VRC)
6694 : RI.getEquivalentVGPRClass(VRC);
6696 RC = VRC;
6697 } else {
6698 RC = SRC;
6701 // Update all the operands so they have the same type.
6702 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6703 MachineOperand &Op = MI.getOperand(I);
6704 if (!Op.isReg() || !Op.getReg().isVirtual())
6705 continue;
6707 // MI is a PHI instruction.
6708 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6709 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
6711 // Avoid creating no-op copies with the same src and dst reg class. These
6712 // confuse some of the machine passes.
6713 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6717 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6718 // VGPR dest type and SGPR sources, insert copies so all operands are
6719 // VGPRs. This seems to help operand folding / the register coalescer.
6720 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6721 MachineBasicBlock *MBB = MI.getParent();
6722 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6723 if (RI.hasVGPRs(DstRC)) {
6724 // Update all the operands so they are VGPR register classes. These may
6725 // not be the same register class because REG_SEQUENCE supports mixing
6726 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6727 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6728 MachineOperand &Op = MI.getOperand(I);
6729 if (!Op.isReg() || !Op.getReg().isVirtual())
6730 continue;
6732 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6733 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6734 if (VRC == OpRC)
6735 continue;
6737 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6738 Op.setIsKill();
6742 return CreatedBB;
6745 // Legalize INSERT_SUBREG
6746 // src0 must have the same register class as dst
6747 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6748 Register Dst = MI.getOperand(0).getReg();
6749 Register Src0 = MI.getOperand(1).getReg();
6750 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6751 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6752 if (DstRC != Src0RC) {
6753 MachineBasicBlock *MBB = MI.getParent();
6754 MachineOperand &Op = MI.getOperand(1);
6755 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6757 return CreatedBB;
6760 // Legalize SI_INIT_M0
6761 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6762 MachineOperand &Src = MI.getOperand(0);
6763 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6764 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6765 return CreatedBB;
6768 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6769 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6770 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6771 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6772 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6773 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
6774 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
6775 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
6776 MachineOperand &Src = MI.getOperand(1);
6777 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6778 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6779 return CreatedBB;
6782 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6784 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6785 // scratch memory access. In both cases, the legalization never involves
6786 // conversion to the addr64 form.
6787 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
6788 (isMUBUF(MI) || isMTBUF(MI)))) {
6789 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6790 : AMDGPU::OpName::srsrc;
6791 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6792 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6793 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6795 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6796 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6797 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6798 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6800 return CreatedBB;
6803 // Legalize SI_CALL
6804 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6805 MachineOperand *Dest = &MI.getOperand(0);
6806 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6807 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, as well as the
6808 // following copies, into the loop block; this includes the copies from and to
6809 // physical registers.
6810 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6811 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6813 // Also move the copies to physical registers into the loop block
6814 MachineBasicBlock &MBB = *MI.getParent();
6815 MachineBasicBlock::iterator Start(&MI);
6816 while (Start->getOpcode() != FrameSetupOpcode)
6817 --Start;
6818 MachineBasicBlock::iterator End(&MI);
6819 while (End->getOpcode() != FrameDestroyOpcode)
6820 ++End;
6821 // Also include following copies of the return value
6822 ++End;
6823 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6824 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
6825 ++End;
6826 CreatedBB =
6827 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6831 // Legalize s_sleep_var.
6832 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6833 const DebugLoc &DL = MI.getDebugLoc();
6834 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6835 int Src0Idx =
6836 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6837 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6838 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6839 .add(Src0);
6840 Src0.ChangeToRegister(Reg, false);
6841 return nullptr;
6844 // Legalize MUBUF instructions.
6845 bool isSoffsetLegal = true;
6846 int SoffsetIdx =
6847 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6848 if (SoffsetIdx != -1) {
6849 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6850 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6851 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6852 isSoffsetLegal = false;
6856 bool isRsrcLegal = true;
6857 int RsrcIdx =
6858 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6859 if (RsrcIdx != -1) {
6860 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6861 if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
6862 isRsrcLegal = false;
6866 // If both operands are already legal, there is nothing to do.
6867 if (isRsrcLegal && isSoffsetLegal)
6868 return CreatedBB;
6870 if (!isRsrcLegal) {
6871 // Legalize a VGPR Rsrc
6873 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6874 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6875 // a zero-value SRsrc.
6877 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6878 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6879 // above.
6881 // Otherwise we are on non-ADDR64 hardware, and/or we have
6882 // idxen/offen/bothen and we fall back to a waterfall loop.
6884 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6885 MachineBasicBlock &MBB = *MI.getParent();
6887 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6888 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6889 // This is already an ADDR64 instruction so we need to add the pointer
6890 // extracted from the resource descriptor to the current value of VAddr.
6891 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6892 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6893 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6895 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
6896 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
6897 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
6899 unsigned RsrcPtr, NewSRsrc;
6900 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6902 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
6903 const DebugLoc &DL = MI.getDebugLoc();
6904 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
6905 .addDef(CondReg0)
6906 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6907 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
6908 .addImm(0);
6910 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
6911 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
6912 .addDef(CondReg1, RegState::Dead)
6913 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6914 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
6915 .addReg(CondReg0, RegState::Kill)
6916 .addImm(0);
6918 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6919 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
6920 .addReg(NewVAddrLo)
6921 .addImm(AMDGPU::sub0)
6922 .addReg(NewVAddrHi)
6923 .addImm(AMDGPU::sub1);
6925 VAddr->setReg(NewVAddr);
6926 Rsrc->setReg(NewSRsrc);
6927 } else if (!VAddr && ST.hasAddr64()) {
6928 // This instruction is the _OFFSET variant, so we need to convert it to
6929 // ADDR64.
6930 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
6931 "FIXME: Need to emit flat atomics here");
6933 unsigned RsrcPtr, NewSRsrc;
6934 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6936 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6937 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
6938 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
6939 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6940 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
6942 // Atomics with return have an additional tied operand and are
6943 // missing some of the special bits.
6944 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
6945 MachineInstr *Addr64;
6947 if (!VDataIn) {
6948 // Regular buffer load / store.
6949 MachineInstrBuilder MIB =
6950 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6951 .add(*VData)
6952 .addReg(NewVAddr)
6953 .addReg(NewSRsrc)
6954 .add(*SOffset)
6955 .add(*Offset);
6957 if (const MachineOperand *CPol =
6958 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6959 MIB.addImm(CPol->getImm());
6962 if (const MachineOperand *TFE =
6963 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
6964 MIB.addImm(TFE->getImm());
6967 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
6969 MIB.cloneMemRefs(MI);
6970 Addr64 = MIB;
6971 } else {
6972 // Atomics with return.
6973 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6974 .add(*VData)
6975 .add(*VDataIn)
6976 .addReg(NewVAddr)
6977 .addReg(NewSRsrc)
6978 .add(*SOffset)
6979 .add(*Offset)
6980 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
6981 .cloneMemRefs(MI);
6984 MI.removeFromParent();
6986 // NewVAddr = RsrcPtr, i.e. the pointer extracted from the resource descriptor.
6987 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
6988 NewVAddr)
6989 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6990 .addImm(AMDGPU::sub0)
6991 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6992 .addImm(AMDGPU::sub1);
6993 } else {
6994 // Legalize a VGPR Rsrc and soffset together.
6995 if (!isSoffsetLegal) {
6996 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6997 CreatedBB =
6998 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
6999 return CreatedBB;
7001 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7002 return CreatedBB;
7006 // Legalize a VGPR soffset.
7007 if (!isSoffsetLegal) {
7008 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7009 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7010 return CreatedBB;
7012 return CreatedBB;
7015 void SIInstrWorklist::insert(MachineInstr *MI) {
7016 InstrList.insert(MI);
7017 // Add MBUF instructions to the deferred list.
7018 int RsrcIdx =
7019 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7020 if (RsrcIdx != -1) {
7021 DeferredList.insert(MI);
7025 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7026 return DeferredList.contains(MI);
7029 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7030 MachineDominatorTree *MDT) const {
7032 while (!Worklist.empty()) {
7033 MachineInstr &Inst = *Worklist.top();
7034 Worklist.erase_top();
7035 // Skip MachineInstrs in the deferred list.
7036 if (Worklist.isDeferred(&Inst))
7037 continue;
7038 moveToVALUImpl(Worklist, MDT, Inst);
7041 // The deferred list of instructions will be processed once
7042 // all the MachineInstrs in the worklist are done.
7043 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7044 moveToVALUImpl(Worklist, MDT, *Inst);
7045 assert(Worklist.empty() &&
7046 "Deferred MachineInstr are not supposed to re-populate worklist");
7050 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7051 MachineDominatorTree *MDT,
7052 MachineInstr &Inst) const {
7054 MachineBasicBlock *MBB = Inst.getParent();
7055 if (!MBB)
7056 return;
7057 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7058 unsigned Opcode = Inst.getOpcode();
7059 unsigned NewOpcode = getVALUOp(Inst);
7060 // Handle some special cases
7061 switch (Opcode) {
7062 default:
7063 break;
7064 case AMDGPU::S_ADD_U64_PSEUDO:
7065 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
7066 break;
7067 case AMDGPU::S_SUB_U64_PSEUDO:
7068 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
7069 break;
7070 case AMDGPU::S_ADD_I32:
7071 case AMDGPU::S_SUB_I32: {
7072 // FIXME: The u32 versions currently selected use the carry.
7073 bool Changed;
7074 MachineBasicBlock *CreatedBBTmp = nullptr;
7075 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7076 if (Changed)
7077 return;
7079 // Default handling
7080 break;
7083 case AMDGPU::S_MUL_U64:
7084 // Split s_mul_u64 into 32-bit vector multiplications.
7085 splitScalarSMulU64(Worklist, Inst, MDT);
7086 Inst.eraseFromParent();
7087 return;
7089 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7090 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7091 // This is a special case of s_mul_u64 where all the operands are either
7092 // zero extended or sign extended.
7093 splitScalarSMulPseudo(Worklist, Inst, MDT);
7094 Inst.eraseFromParent();
7095 return;
7097 case AMDGPU::S_AND_B64:
7098 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7099 Inst.eraseFromParent();
7100 return;
7102 case AMDGPU::S_OR_B64:
7103 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7104 Inst.eraseFromParent();
7105 return;
7107 case AMDGPU::S_XOR_B64:
7108 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7109 Inst.eraseFromParent();
7110 return;
7112 case AMDGPU::S_NAND_B64:
7113 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7114 Inst.eraseFromParent();
7115 return;
7117 case AMDGPU::S_NOR_B64:
7118 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7119 Inst.eraseFromParent();
7120 return;
7122 case AMDGPU::S_XNOR_B64:
7123 if (ST.hasDLInsts())
7124 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7125 else
7126 splitScalar64BitXnor(Worklist, Inst, MDT);
7127 Inst.eraseFromParent();
7128 return;
7130 case AMDGPU::S_ANDN2_B64:
7131 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7132 Inst.eraseFromParent();
7133 return;
7135 case AMDGPU::S_ORN2_B64:
7136 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7137 Inst.eraseFromParent();
7138 return;
7140 case AMDGPU::S_BREV_B64:
7141 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7142 Inst.eraseFromParent();
7143 return;
7145 case AMDGPU::S_NOT_B64:
7146 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7147 Inst.eraseFromParent();
7148 return;
7150 case AMDGPU::S_BCNT1_I32_B64:
7151 splitScalar64BitBCNT(Worklist, Inst);
7152 Inst.eraseFromParent();
7153 return;
7155 case AMDGPU::S_BFE_I64:
7156 splitScalar64BitBFE(Worklist, Inst);
7157 Inst.eraseFromParent();
7158 return;
7160 case AMDGPU::S_FLBIT_I32_B64:
7161 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7162 Inst.eraseFromParent();
7163 return;
7164 case AMDGPU::S_FF1_I32_B64:
7165 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7166 Inst.eraseFromParent();
7167 return;
7169 case AMDGPU::S_LSHL_B32:
7170 if (ST.hasOnlyRevVALUShifts()) {
7171 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7172 swapOperands(Inst);
7174 break;
7175 case AMDGPU::S_ASHR_I32:
7176 if (ST.hasOnlyRevVALUShifts()) {
7177 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7178 swapOperands(Inst);
7180 break;
7181 case AMDGPU::S_LSHR_B32:
7182 if (ST.hasOnlyRevVALUShifts()) {
7183 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7184 swapOperands(Inst);
7186 break;
7187 case AMDGPU::S_LSHL_B64:
7188 if (ST.hasOnlyRevVALUShifts()) {
7189 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7190 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7191 : AMDGPU::V_LSHLREV_B64_e64;
7192 swapOperands(Inst);
7194 break;
7195 case AMDGPU::S_ASHR_I64:
7196 if (ST.hasOnlyRevVALUShifts()) {
7197 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7198 swapOperands(Inst);
7200 break;
7201 case AMDGPU::S_LSHR_B64:
7202 if (ST.hasOnlyRevVALUShifts()) {
7203 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7204 swapOperands(Inst);
7206 break;
7208 case AMDGPU::S_ABS_I32:
7209 lowerScalarAbs(Worklist, Inst);
7210 Inst.eraseFromParent();
7211 return;
7213 case AMDGPU::S_CBRANCH_SCC0:
7214 case AMDGPU::S_CBRANCH_SCC1: {
7215 // Clear unused bits of vcc
7216 Register CondReg = Inst.getOperand(1).getReg();
7217 bool IsSCC = CondReg == AMDGPU::SCC;
7218 Register VCC = RI.getVCC();
7219 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7220 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7221 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7222 .addReg(EXEC)
7223 .addReg(IsSCC ? VCC : CondReg);
7224 Inst.removeOperand(1);
7225 } break;
7227 case AMDGPU::S_BFE_U64:
7228 case AMDGPU::S_BFM_B64:
7229 llvm_unreachable("Moving this op to VALU not implemented");
7231 case AMDGPU::S_PACK_LL_B32_B16:
7232 case AMDGPU::S_PACK_LH_B32_B16:
7233 case AMDGPU::S_PACK_HL_B32_B16:
7234 case AMDGPU::S_PACK_HH_B32_B16:
7235 movePackToVALU(Worklist, MRI, Inst);
7236 Inst.eraseFromParent();
7237 return;
7239 case AMDGPU::S_XNOR_B32:
7240 lowerScalarXnor(Worklist, Inst);
7241 Inst.eraseFromParent();
7242 return;
7244 case AMDGPU::S_NAND_B32:
7245 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7246 Inst.eraseFromParent();
7247 return;
7249 case AMDGPU::S_NOR_B32:
7250 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7251 Inst.eraseFromParent();
7252 return;
7254 case AMDGPU::S_ANDN2_B32:
7255 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7256 Inst.eraseFromParent();
7257 return;
7259 case AMDGPU::S_ORN2_B32:
7260 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7261 Inst.eraseFromParent();
7262 return;
7264 // TODO: Remove this as soon as everything is ready to replace VGPR-to-SGPR
7265 // copies with V_READFIRSTLANEs.
7266 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7267 // can only be selected from a uniform SDNode.
7268 case AMDGPU::S_ADD_CO_PSEUDO:
7269 case AMDGPU::S_SUB_CO_PSEUDO: {
7270 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7271 ? AMDGPU::V_ADDC_U32_e64
7272 : AMDGPU::V_SUBB_U32_e64;
7273 const auto *CarryRC = RI.getWaveMaskRegClass();
7275 Register CarryInReg = Inst.getOperand(4).getReg();
7276 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7277 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7278 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7279 .addReg(CarryInReg);
7282 Register CarryOutReg = Inst.getOperand(1).getReg();
7284 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7285 MRI.getRegClass(Inst.getOperand(0).getReg())));
7286 MachineInstr *CarryOp =
7287 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7288 .addReg(CarryOutReg, RegState::Define)
7289 .add(Inst.getOperand(2))
7290 .add(Inst.getOperand(3))
7291 .addReg(CarryInReg)
7292 .addImm(0);
7293 legalizeOperands(*CarryOp);
7294 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7295 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7296 Inst.eraseFromParent();
7298 return;
7299 case AMDGPU::S_UADDO_PSEUDO:
7300 case AMDGPU::S_USUBO_PSEUDO: {
7301 const DebugLoc &DL = Inst.getDebugLoc();
7302 MachineOperand &Dest0 = Inst.getOperand(0);
7303 MachineOperand &Dest1 = Inst.getOperand(1);
7304 MachineOperand &Src0 = Inst.getOperand(2);
7305 MachineOperand &Src1 = Inst.getOperand(3);
7307 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7308 ? AMDGPU::V_ADD_CO_U32_e64
7309 : AMDGPU::V_SUB_CO_U32_e64;
7310 const TargetRegisterClass *NewRC =
7311 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7312 Register DestReg = MRI.createVirtualRegister(NewRC);
7313 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7314 .addReg(Dest1.getReg(), RegState::Define)
7315 .add(Src0)
7316 .add(Src1)
7317 .addImm(0); // clamp bit
7319 legalizeOperands(*NewInstr, MDT);
7320 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7321 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7322 Worklist);
7323 Inst.eraseFromParent();
7325 return;
7327 case AMDGPU::S_CSELECT_B32:
7328 case AMDGPU::S_CSELECT_B64:
7329 lowerSelect(Worklist, Inst, MDT);
7330 Inst.eraseFromParent();
7331 return;
7332 case AMDGPU::S_CMP_EQ_I32:
7333 case AMDGPU::S_CMP_LG_I32:
7334 case AMDGPU::S_CMP_GT_I32:
7335 case AMDGPU::S_CMP_GE_I32:
7336 case AMDGPU::S_CMP_LT_I32:
7337 case AMDGPU::S_CMP_LE_I32:
7338 case AMDGPU::S_CMP_EQ_U32:
7339 case AMDGPU::S_CMP_LG_U32:
7340 case AMDGPU::S_CMP_GT_U32:
7341 case AMDGPU::S_CMP_GE_U32:
7342 case AMDGPU::S_CMP_LT_U32:
7343 case AMDGPU::S_CMP_LE_U32:
7344 case AMDGPU::S_CMP_EQ_U64:
7345 case AMDGPU::S_CMP_LG_U64:
7346 case AMDGPU::S_CMP_LT_F32:
7347 case AMDGPU::S_CMP_EQ_F32:
7348 case AMDGPU::S_CMP_LE_F32:
7349 case AMDGPU::S_CMP_GT_F32:
7350 case AMDGPU::S_CMP_LG_F32:
7351 case AMDGPU::S_CMP_GE_F32:
7352 case AMDGPU::S_CMP_O_F32:
7353 case AMDGPU::S_CMP_U_F32:
7354 case AMDGPU::S_CMP_NGE_F32:
7355 case AMDGPU::S_CMP_NLG_F32:
7356 case AMDGPU::S_CMP_NGT_F32:
7357 case AMDGPU::S_CMP_NLE_F32:
7358 case AMDGPU::S_CMP_NEQ_F32:
7359 case AMDGPU::S_CMP_NLT_F32: {
7360 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7361 auto NewInstr =
7362 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7363 .setMIFlags(Inst.getFlags());
7364 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7365 0) {
7366 NewInstr
7367 .addImm(0) // src0_modifiers
7368 .add(Inst.getOperand(0)) // src0
7369 .addImm(0) // src1_modifiers
7370 .add(Inst.getOperand(1)) // src1
7371 .addImm(0); // clamp
7372 } else {
7373 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7375 legalizeOperands(*NewInstr, MDT);
7376 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7377 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7378 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7379 Inst.eraseFromParent();
7380 return;
7382 case AMDGPU::S_CMP_LT_F16:
7383 case AMDGPU::S_CMP_EQ_F16:
7384 case AMDGPU::S_CMP_LE_F16:
7385 case AMDGPU::S_CMP_GT_F16:
7386 case AMDGPU::S_CMP_LG_F16:
7387 case AMDGPU::S_CMP_GE_F16:
7388 case AMDGPU::S_CMP_O_F16:
7389 case AMDGPU::S_CMP_U_F16:
7390 case AMDGPU::S_CMP_NGE_F16:
7391 case AMDGPU::S_CMP_NLG_F16:
7392 case AMDGPU::S_CMP_NGT_F16:
7393 case AMDGPU::S_CMP_NLE_F16:
7394 case AMDGPU::S_CMP_NEQ_F16:
7395 case AMDGPU::S_CMP_NLT_F16: {
7396 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7397 auto NewInstr =
7398 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7399 .setMIFlags(Inst.getFlags());
7400 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7401 NewInstr
7402 .addImm(0) // src0_modifiers
7403 .add(Inst.getOperand(0)) // src0
7404 .addImm(0) // src1_modifiers
7405 .add(Inst.getOperand(1)) // src1
7406 .addImm(0); // clamp
7407 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7408 NewInstr.addImm(0); // op_sel0
7409 } else {
7410 NewInstr
7411 .add(Inst.getOperand(0))
7412 .add(Inst.getOperand(1));
7414 legalizeOperands(*NewInstr, MDT);
7415 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7416 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7417 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7418 Inst.eraseFromParent();
7419 return;
7421 case AMDGPU::S_CVT_HI_F32_F16: {
7422 const DebugLoc &DL = Inst.getDebugLoc();
7423 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7424 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7425 if (ST.useRealTrue16Insts()) {
7426 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7427 .add(Inst.getOperand(1));
7428 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7429 .addImm(0) // src0_modifiers
7430 .addReg(TmpReg, 0, AMDGPU::hi16)
7431 .addImm(0) // clamp
7432 .addImm(0) // omod
7433 .addImm(0); // op_sel0
7434 } else {
7435 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7436 .addImm(16)
7437 .add(Inst.getOperand(1));
7438 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7439 .addImm(0) // src0_modifiers
7440 .addReg(TmpReg)
7441 .addImm(0) // clamp
7442 .addImm(0); // omod
7445 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7446 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7447 Inst.eraseFromParent();
7448 return;
7450 case AMDGPU::S_MINIMUM_F32:
7451 case AMDGPU::S_MAXIMUM_F32:
7452 case AMDGPU::S_MINIMUM_F16:
7453 case AMDGPU::S_MAXIMUM_F16: {
7454 const DebugLoc &DL = Inst.getDebugLoc();
7455 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7456 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7457 .addImm(0) // src0_modifiers
7458 .add(Inst.getOperand(1))
7459 .addImm(0) // src1_modifiers
7460 .add(Inst.getOperand(2))
7461 .addImm(0) // clamp
7462 .addImm(0); // omod
7463 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7465 legalizeOperands(*NewInstr, MDT);
7466 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7467 Inst.eraseFromParent();
7468 return;
7472 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7473 // We cannot move this instruction to the VALU, so we should try to
7474 // legalize its operands instead.
7475 legalizeOperands(Inst, MDT);
7476 return;
7478 // Handle converting generic instructions like COPY-to-SGPR into
7479 // COPY-to-VGPR.
7480 if (NewOpcode == Opcode) {
7481 Register DstReg = Inst.getOperand(0).getReg();
7482 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7484 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7485 // hope for the best.
7486 if (Inst.isCopy() && DstReg.isPhysical() &&
7487 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7488 // TODO: Only works for 32 bit registers.
7489 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7490 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7491 .add(Inst.getOperand(1));
7492 Inst.eraseFromParent();
7493 return;
7496 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7497 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7498 // Instead of creating a copy where src and dst are the same register
7499 // class, we just replace all uses of dst with src. These kinds of
7500 // copies interfere with the heuristics MachineSink uses to decide
7501 // whether or not to split a critical edge, since the pass assumes
7502 // that copies will end up as machine instructions and not be
7503 // eliminated.
7504 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7505 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7506 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7507 Inst.getOperand(0).setReg(DstReg);
7508 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7509 // these are deleted later, but at -O0 it would leave a suspicious-looking
7510 // illegal copy of an undef register.
7511 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7512 Inst.removeOperand(I);
7513 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7514 return;
7516 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7517 MRI.replaceRegWith(DstReg, NewDstReg);
7518 legalizeOperands(Inst, MDT);
7519 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7520 return;
7523 // Use the new VALU Opcode.
7524 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7525 .setMIFlags(Inst.getFlags());
7526 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7527 // Intersperse VOP3 modifiers among the SALU operands.
7528 NewInstr->addOperand(Inst.getOperand(0));
7529 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7530 AMDGPU::OpName::src0_modifiers) >= 0)
7531 NewInstr.addImm(0);
7532 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7533 MachineOperand Src = Inst.getOperand(1);
7534 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7535 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7536 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7537 else
7538 NewInstr->addOperand(Src);
7541 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7542 // We are converting these to a BFE, so we need to add the missing
7543 // operands for the size and offset.
7544 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7545 NewInstr.addImm(0);
7546 NewInstr.addImm(Size);
7547 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7548 // The VALU version adds the second operand to the result, so insert an
7549 // extra 0 operand.
7550 NewInstr.addImm(0);
7551 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7552 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7553 // If we need to move this to VGPRs, we need to unpack the second
7554 // operand back into the 2 separate ones for bit offset and width.
7555 assert(OffsetWidthOp.isImm() &&
7556 "Scalar BFE is only implemented for constant width and offset");
7557 uint32_t Imm = OffsetWidthOp.getImm();
7559 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7560 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
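// For example, an encoded immediate of 0x100008 yields Offset = 8 and
// BitWidth = 16.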
7561 NewInstr.addImm(Offset);
7562 NewInstr.addImm(BitWidth);
7563 } else {
7564 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7565 AMDGPU::OpName::src1_modifiers) >= 0)
7566 NewInstr.addImm(0);
7567 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7568 NewInstr->addOperand(Inst.getOperand(2));
7569 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7570 AMDGPU::OpName::src2_modifiers) >= 0)
7571 NewInstr.addImm(0);
7572 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7573 NewInstr->addOperand(Inst.getOperand(3));
7574 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7575 NewInstr.addImm(0);
7576 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7577 NewInstr.addImm(0);
7578 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7579 NewInstr.addImm(0);
7581 } else {
7582 // Just copy the SALU operands.
7583 for (const MachineOperand &Op : Inst.explicit_operands())
7584 NewInstr->addOperand(Op);
7587 // Remove any references to SCC. Vector instructions can't read from it, and
7588 // we're just about to add the implicit use / defs of VCC, and we don't want
7589 // both.
7590 for (MachineOperand &Op : Inst.implicit_operands()) {
7591 if (Op.getReg() == AMDGPU::SCC) {
7592 // Only propagate through live-def of SCC.
7593 if (Op.isDef() && !Op.isDead())
7594 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7595 if (Op.isUse())
7596 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7599 Inst.eraseFromParent();
7600 Register NewDstReg;
7601 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7602 Register DstReg = NewInstr->getOperand(0).getReg();
7603 assert(DstReg.isVirtual());
7604 // Update the destination register class.
7605 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7606 assert(NewDstRC);
7607 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7608 MRI.replaceRegWith(DstReg, NewDstReg);
7610 fixImplicitOperands(*NewInstr);
7611 // Legalize the operands
7612 legalizeOperands(*NewInstr, MDT);
7613 if (NewDstReg)
7614 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7617 // Add/sub require special handling to deal with carry outs.
7618 std::pair<bool, MachineBasicBlock *>
7619 SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7620 MachineDominatorTree *MDT) const {
7621 if (ST.hasAddNoCarry()) {
7622 // Assume there is no user of scc since we don't select this in that case.
7623 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7624 // is used.
7626 MachineBasicBlock &MBB = *Inst.getParent();
7627 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7629 Register OldDstReg = Inst.getOperand(0).getReg();
7630 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7632 unsigned Opc = Inst.getOpcode();
7633 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7635 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7636 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7638 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7639 Inst.removeOperand(3);
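// The SCC def (operand 3) has no VALU counterpart; dropping it is fine here
// because, as noted above, SCC is assumed to have no users.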
7641 Inst.setDesc(get(NewOpc));
7642 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7643 Inst.addImplicitDefUseOperands(*MBB.getParent());
7644 MRI.replaceRegWith(OldDstReg, ResultReg);
7645 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7647 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7648 return std::pair(true, NewBB);
7651 return std::pair(false, nullptr);
7654 void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7655 MachineDominatorTree *MDT) const {
7657 MachineBasicBlock &MBB = *Inst.getParent();
7658 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7659 MachineBasicBlock::iterator MII = Inst;
7660 DebugLoc DL = Inst.getDebugLoc();
7662 MachineOperand &Dest = Inst.getOperand(0);
7663 MachineOperand &Src0 = Inst.getOperand(1);
7664 MachineOperand &Src1 = Inst.getOperand(2);
7665 MachineOperand &Cond = Inst.getOperand(3);
7667 Register CondReg = Cond.getReg();
7668 bool IsSCC = (CondReg == AMDGPU::SCC);
7670 // If this is a trivial select where the condition is effectively not SCC
7671 // (CondReg is a source of a copy to SCC), then the select is semantically
7672 // equivalent to copying CondReg. Hence, there is no need to create a
7673 // V_CNDMASK; we can just use CondReg and bail out.
7674 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7675 (Src1.getImm() == 0)) {
7676 MRI.replaceRegWith(Dest.getReg(), CondReg);
7677 return;
7680 Register NewCondReg = CondReg;
7681 if (IsSCC) {
7682 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
7683 NewCondReg = MRI.createVirtualRegister(TC);
7685 // Now look for the closest SCC def; if it is a copy, replace CondReg
7686 // with the COPY's source register.
7687 bool CopyFound = false;
7688 for (MachineInstr &CandI :
7689 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7690 Inst.getParent()->rend())) {
7691 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
7692 -1) {
7693 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7694 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7695 .addReg(CandI.getOperand(1).getReg());
7696 CopyFound = true;
7698 break;
7701 if (!CopyFound) {
7702 // SCC def is not a copy
7703 // Insert a trivial select instead of creating a copy, because a copy from
7704 // SCC would semantically mean just copying a single bit, but we may need
7705 // the result to be a vector condition mask that needs preserving.
7706 unsigned Opcode =
7707 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
7708 auto NewSelect =
7709 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7710 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
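// S_CSELECT_Bxx NewCondReg, -1, 0 expands the single SCC bit into an all-ones
// or all-zeros wave mask, which the V_CNDMASK built below then uses as its
// per-lane condition.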
7714 Register NewDestReg = MRI.createVirtualRegister(
7715 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7716 MachineInstr *NewInst;
7717 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7718 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7719 .addImm(0)
7720 .add(Src1) // False
7721 .addImm(0)
7722 .add(Src0) // True
7723 .addReg(NewCondReg);
7724 } else {
7725 NewInst =
7726 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7727 .add(Src1) // False
7728 .add(Src0) // True
7729 .addReg(NewCondReg);
7731 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7732 legalizeOperands(*NewInst, MDT);
7733 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7736 void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7737 MachineInstr &Inst) const {
7738 MachineBasicBlock &MBB = *Inst.getParent();
7739 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7740 MachineBasicBlock::iterator MII = Inst;
7741 DebugLoc DL = Inst.getDebugLoc();
7743 MachineOperand &Dest = Inst.getOperand(0);
7744 MachineOperand &Src = Inst.getOperand(1);
7745 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7746 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7748 unsigned SubOp = ST.hasAddNoCarry() ?
7749 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7751 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7752 .addImm(0)
7753 .addReg(Src.getReg());
7755 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7756 .addReg(Src.getReg())
7757 .addReg(TmpReg);
7759 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7760 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7763 void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7764 MachineInstr &Inst) const {
7765 MachineBasicBlock &MBB = *Inst.getParent();
7766 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7767 MachineBasicBlock::iterator MII = Inst;
7768 const DebugLoc &DL = Inst.getDebugLoc();
7770 MachineOperand &Dest = Inst.getOperand(0);
7771 MachineOperand &Src0 = Inst.getOperand(1);
7772 MachineOperand &Src1 = Inst.getOperand(2);
7774 if (ST.hasDLInsts()) {
7775 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7776 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7777 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7779 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7780 .add(Src0)
7781 .add(Src1);
7783 MRI.replaceRegWith(Dest.getReg(), NewDest);
7784 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7785 } else {
7786 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7787 // invert either source and then perform the XOR. If either source is a
7788 // scalar register, then we can leave the inversion on the scalar unit to
7789 // achieve a better distribution of scalar and vector instructions.
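// For example, if Src0 lives in an SGPR this becomes:
//   Temp    = S_NOT_B32 Src0
//   NewDest = S_XOR_B32 Temp, Src1
// and only the XOR is added back to the worklist; the NOT can stay on the
// scalar unit.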
7790 bool Src0IsSGPR = Src0.isReg() &&
7791 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7792 bool Src1IsSGPR = Src1.isReg() &&
7793 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
7794 MachineInstr *Xor;
7795 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7796 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7798 // Build a pair of scalar instructions and add them to the work list.
7799 // The next iteration over the work list will lower these to the vector
7800 // unit as necessary.
7801 if (Src0IsSGPR) {
7802 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7803 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7804 .addReg(Temp)
7805 .add(Src1);
7806 } else if (Src1IsSGPR) {
7807 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7808 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7809 .add(Src0)
7810 .addReg(Temp);
7811 } else {
7812 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7813 .add(Src0)
7814 .add(Src1);
7815 MachineInstr *Not =
7816 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7817 Worklist.insert(Not);
7820 MRI.replaceRegWith(Dest.getReg(), NewDest);
7822 Worklist.insert(Xor);
7824 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7828 void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7829 MachineInstr &Inst,
7830 unsigned Opcode) const {
7831 MachineBasicBlock &MBB = *Inst.getParent();
7832 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7833 MachineBasicBlock::iterator MII = Inst;
7834 const DebugLoc &DL = Inst.getDebugLoc();
7836 MachineOperand &Dest = Inst.getOperand(0);
7837 MachineOperand &Src0 = Inst.getOperand(1);
7838 MachineOperand &Src1 = Inst.getOperand(2);
7840 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7841 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7843 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7844 .add(Src0)
7845 .add(Src1);
7847 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7848 .addReg(Interm);
7850 Worklist.insert(&Op);
7851 Worklist.insert(&Not);
7853 MRI.replaceRegWith(Dest.getReg(), NewDest);
7854 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7857 void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7858 MachineInstr &Inst,
7859 unsigned Opcode) const {
7860 MachineBasicBlock &MBB = *Inst.getParent();
7861 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7862 MachineBasicBlock::iterator MII = Inst;
7863 const DebugLoc &DL = Inst.getDebugLoc();
7865 MachineOperand &Dest = Inst.getOperand(0);
7866 MachineOperand &Src0 = Inst.getOperand(1);
7867 MachineOperand &Src1 = Inst.getOperand(2);
7869 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7870 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7872 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7873 .add(Src1);
7875 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7876 .add(Src0)
7877 .addReg(Interm);
7879 Worklist.insert(&Not);
7880 Worklist.insert(&Op);
7882 MRI.replaceRegWith(Dest.getReg(), NewDest);
7883 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
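// Split a 64-bit scalar operation with no cross-half dependency into two
// 32-bit operations on the sub0/sub1 halves and recombine the results with a
// REG_SEQUENCE. If \p Swap is set, the two result halves are exchanged (used
// for S_BREV_B64, where reversing 64 bits reverses each half and swaps them).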
7886 void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
7887 MachineInstr &Inst, unsigned Opcode,
7888 bool Swap) const {
7889 MachineBasicBlock &MBB = *Inst.getParent();
7890 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7892 MachineOperand &Dest = Inst.getOperand(0);
7893 MachineOperand &Src0 = Inst.getOperand(1);
7894 DebugLoc DL = Inst.getDebugLoc();
7896 MachineBasicBlock::iterator MII = Inst;
7898 const MCInstrDesc &InstDesc = get(Opcode);
7899 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7900 MRI.getRegClass(Src0.getReg()) :
7901 &AMDGPU::SGPR_32RegClass;
7903 const TargetRegisterClass *Src0SubRC =
7904 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7906 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7907 AMDGPU::sub0, Src0SubRC);
7909 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7910 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7911 const TargetRegisterClass *NewDestSubRC =
7912 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7914 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7915 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
7917 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7918 AMDGPU::sub1, Src0SubRC);
7920 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
7921 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
7923 if (Swap)
7924 std::swap(DestSub0, DestSub1);
7926 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
7927 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7928 .addReg(DestSub0)
7929 .addImm(AMDGPU::sub0)
7930 .addReg(DestSub1)
7931 .addImm(AMDGPU::sub1);
7933 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7935 Worklist.insert(&LoHalf);
7936 Worklist.insert(&HiHalf);
7938 // We don't need to legalizeOperands here because for a single operand, src0
7939 // will support any kind of input.
7941 // Move all users of this moved value.
7942 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7945 // There is no vector equivalent of s_mul_u64. For this reason, we need to
7946 // split the s_mul_u64 into 32-bit vector multiplications.
7947 void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
7948 MachineInstr &Inst,
7949 MachineDominatorTree *MDT) const {
7950 MachineBasicBlock &MBB = *Inst.getParent();
7951 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7953 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7954 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7955 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7957 MachineOperand &Dest = Inst.getOperand(0);
7958 MachineOperand &Src0 = Inst.getOperand(1);
7959 MachineOperand &Src1 = Inst.getOperand(2);
7960 const DebugLoc &DL = Inst.getDebugLoc();
7961 MachineBasicBlock::iterator MII = Inst;
7963 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7964 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7965 const TargetRegisterClass *Src0SubRC =
7966 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7967 if (RI.isSGPRClass(Src0SubRC))
7968 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7969 const TargetRegisterClass *Src1SubRC =
7970 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7971 if (RI.isSGPRClass(Src1SubRC))
7972 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7974 // First, we extract the low 32-bit and high 32-bit values from each of the
7975 // operands.
7976 MachineOperand Op0L =
7977 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7978 MachineOperand Op1L =
7979 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7980 MachineOperand Op0H =
7981 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
7982 MachineOperand Op1H =
7983 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
7985 // The multiplication is done as follows:
7987 //                                Op1H  Op1L
7988 //                              * Op0H  Op0L
7989 //                        --------------------
7990 //                        Op1H*Op0L  Op1L*Op0L
7991 //       + Op1H*Op0H      Op1L*Op0H
7992 //  -----------------------------------------
7993 //  (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
7995 // We drop Op1H*Op0H because the result of the multiplication is truncated to
7996 // 64 bits, and that term only contributes to bits above bit 63.
7997 // The low 32-bit value is Op1L*Op0L.
7998 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
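// Algebraically: (2^32*Op0H + Op0L) * (2^32*Op1H + Op1L) mod 2^64
//   = 2^32*(Op0H*Op1L + Op0L*Op1H) + Op0L*Op1L   (mod 2^64),
// where the carry added into the high half is the upper 32 bits of Op0L*Op1L,
// computed below with V_MUL_HI_U32.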
8000 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8001 MachineInstr *Op1L_Op0H =
8002 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8003 .add(Op1L)
8004 .add(Op0H);
8006 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8007 MachineInstr *Op1H_Op0L =
8008 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8009 .add(Op1H)
8010 .add(Op0L);
8012 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8013 MachineInstr *Carry =
8014 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8015 .add(Op1L)
8016 .add(Op0L);
8018 MachineInstr *LoHalf =
8019 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8020 .add(Op1L)
8021 .add(Op0L);
8023 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8024 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8025 .addReg(Op1L_Op0H_Reg)
8026 .addReg(Op1H_Op0L_Reg);
8028 MachineInstr *HiHalf =
8029 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8030 .addReg(AddReg)
8031 .addReg(CarryReg);
8033 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8034 .addReg(DestSub0)
8035 .addImm(AMDGPU::sub0)
8036 .addReg(DestSub1)
8037 .addImm(AMDGPU::sub1);
8039 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8041 // Try to legalize the operands in case we need to swap the order to keep it
8042 // valid.
8043 legalizeOperands(*Op1L_Op0H, MDT);
8044 legalizeOperands(*Op1H_Op0L, MDT);
8045 legalizeOperands(*Carry, MDT);
8046 legalizeOperands(*LoHalf, MDT);
8047 legalizeOperands(*Add, MDT);
8048 legalizeOperands(*HiHalf, MDT);
8050 // Move all users of this moved value.
8051 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8054 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8055 // multiplications.
8056 void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8057 MachineInstr &Inst,
8058 MachineDominatorTree *MDT) const {
8059 MachineBasicBlock &MBB = *Inst.getParent();
8060 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8062 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8063 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8064 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8066 MachineOperand &Dest = Inst.getOperand(0);
8067 MachineOperand &Src0 = Inst.getOperand(1);
8068 MachineOperand &Src1 = Inst.getOperand(2);
8069 const DebugLoc &DL = Inst.getDebugLoc();
8070 MachineBasicBlock::iterator MII = Inst;
8072 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8073 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8074 const TargetRegisterClass *Src0SubRC =
8075 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8076 if (RI.isSGPRClass(Src0SubRC))
8077 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8078 const TargetRegisterClass *Src1SubRC =
8079 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8080 if (RI.isSGPRClass(Src1SubRC))
8081 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8083 // First, we extract the low 32-bit value from each of the operands (for these
8084 // pseudos the high halves are just the zero/sign extension of the low halves).
8085 MachineOperand Op0L =
8086 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8087 MachineOperand Op1L =
8088 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8090 unsigned Opc = Inst.getOpcode();
8091 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8092 ? AMDGPU::V_MUL_HI_U32_e64
8093 : AMDGPU::V_MUL_HI_I32_e64;
8094 MachineInstr *HiHalf =
8095 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8097 MachineInstr *LoHalf =
8098 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8099 .add(Op1L)
8100 .add(Op0L);
8102 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8103 .addReg(DestSub0)
8104 .addImm(AMDGPU::sub0)
8105 .addReg(DestSub1)
8106 .addImm(AMDGPU::sub1);
8108 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8110 // Try to legalize the operands in case we need to swap the order to keep it
8111 // valid.
8112 legalizeOperands(*HiHalf, MDT);
8113 legalizeOperands(*LoHalf, MDT);
8115 // Move all users of this moved value.
8116 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8119 void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8120 MachineInstr &Inst, unsigned Opcode,
8121 MachineDominatorTree *MDT) const {
8122 MachineBasicBlock &MBB = *Inst.getParent();
8123 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8125 MachineOperand &Dest = Inst.getOperand(0);
8126 MachineOperand &Src0 = Inst.getOperand(1);
8127 MachineOperand &Src1 = Inst.getOperand(2);
8128 DebugLoc DL = Inst.getDebugLoc();
8130 MachineBasicBlock::iterator MII = Inst;
8132 const MCInstrDesc &InstDesc = get(Opcode);
8133 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8134 MRI.getRegClass(Src0.getReg()) :
8135 &AMDGPU::SGPR_32RegClass;
8137 const TargetRegisterClass *Src0SubRC =
8138 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8139 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8140 MRI.getRegClass(Src1.getReg()) :
8141 &AMDGPU::SGPR_32RegClass;
8143 const TargetRegisterClass *Src1SubRC =
8144 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8146 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8147 AMDGPU::sub0, Src0SubRC);
8148 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8149 AMDGPU::sub0, Src1SubRC);
8150 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8151 AMDGPU::sub1, Src0SubRC);
8152 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8153 AMDGPU::sub1, Src1SubRC);
8155 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8156 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8157 const TargetRegisterClass *NewDestSubRC =
8158 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8160 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8161 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8162 .add(SrcReg0Sub0)
8163 .add(SrcReg1Sub0);
8165 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8166 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8167 .add(SrcReg0Sub1)
8168 .add(SrcReg1Sub1);
8170 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8171 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8172 .addReg(DestSub0)
8173 .addImm(AMDGPU::sub0)
8174 .addReg(DestSub1)
8175 .addImm(AMDGPU::sub1);
8177 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8179 Worklist.insert(&LoHalf);
8180 Worklist.insert(&HiHalf);
8182 // Move all users of this moved value.
8183 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
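// Lower a 64-bit scalar XNOR as xor(not(x), y). The S_NOT_B64 is applied to an
// SGPR operand (when one is available) so it can stay a scalar instruction;
// only the resulting S_XOR_B64 is queued to be moved to the VALU.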
8186 void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8187 MachineInstr &Inst,
8188 MachineDominatorTree *MDT) const {
8189 MachineBasicBlock &MBB = *Inst.getParent();
8190 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8192 MachineOperand &Dest = Inst.getOperand(0);
8193 MachineOperand &Src0 = Inst.getOperand(1);
8194 MachineOperand &Src1 = Inst.getOperand(2);
8195 const DebugLoc &DL = Inst.getDebugLoc();
8197 MachineBasicBlock::iterator MII = Inst;
8199 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8201 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8203 MachineOperand* Op0;
8204 MachineOperand* Op1;
8206 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8207 Op0 = &Src0;
8208 Op1 = &Src1;
8209 } else {
8210 Op0 = &Src1;
8211 Op1 = &Src0;
8214 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8215 .add(*Op0);
8217 Register NewDest = MRI.createVirtualRegister(DestRC);
8219 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8220 .addReg(Interm)
8221 .add(*Op1);
8223 MRI.replaceRegWith(Dest.getReg(), NewDest);
8225 Worklist.insert(&Xor);
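// Lower a 64-bit scalar bit count as two V_BCNT_U32_B32 instructions: the
// first counts the low half (with 0 as the accumulate operand) and the second
// counts the high half while accumulating the first result, i.e.
//   ResultReg = popcount(Src.sub1) + popcount(Src.sub0)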
8228 void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8229 MachineInstr &Inst) const {
8230 MachineBasicBlock &MBB = *Inst.getParent();
8231 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8233 MachineBasicBlock::iterator MII = Inst;
8234 const DebugLoc &DL = Inst.getDebugLoc();
8236 MachineOperand &Dest = Inst.getOperand(0);
8237 MachineOperand &Src = Inst.getOperand(1);
8239 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8240 const TargetRegisterClass *SrcRC = Src.isReg() ?
8241 MRI.getRegClass(Src.getReg()) :
8242 &AMDGPU::SGPR_32RegClass;
8244 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8245 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8247 const TargetRegisterClass *SrcSubRC =
8248 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8250 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8251 AMDGPU::sub0, SrcSubRC);
8252 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8253 AMDGPU::sub1, SrcSubRC);
8255 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8257 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8259 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8261 // We don't need to legalize operands here. src0 for either instruction can be
8262 // an SGPR, and the second input is unused or determined here.
8263 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
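// Lower the 64-bit sign-extend-in-register case (S_BFE_I64 with offset 0): for
// widths below 32, V_BFE_I32 sign-extends the low BitWidth bits and the high
// half is filled with the sign via an arithmetic shift right by 31; for a
// width of exactly 32, the low half is reused unchanged and only the high half
// needs the ashr-by-31 fill.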
8266 void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8267 MachineInstr &Inst) const {
8268 MachineBasicBlock &MBB = *Inst.getParent();
8269 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8270 MachineBasicBlock::iterator MII = Inst;
8271 const DebugLoc &DL = Inst.getDebugLoc();
8273 MachineOperand &Dest = Inst.getOperand(0);
8274 uint32_t Imm = Inst.getOperand(2).getImm();
8275 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8276 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8278 (void) Offset;
8280 // Only sext_inreg cases handled.
8281 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8282 Offset == 0 && "Not implemented");
8284 if (BitWidth < 32) {
8285 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8286 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8287 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8289 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8290 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8291 .addImm(0)
8292 .addImm(BitWidth);
8294 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8295 .addImm(31)
8296 .addReg(MidRegLo);
8298 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8299 .addReg(MidRegLo)
8300 .addImm(AMDGPU::sub0)
8301 .addReg(MidRegHi)
8302 .addImm(AMDGPU::sub1);
8304 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8305 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8306 return;
8309 MachineOperand &Src = Inst.getOperand(1);
8310 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8311 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8313 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8314 .addImm(31)
8315 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8317 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8318 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8319 .addImm(AMDGPU::sub0)
8320 .addReg(TmpReg)
8321 .addImm(AMDGPU::sub1);
8323 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8324 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8327 void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8328 MachineInstr &Inst, unsigned Opcode,
8329 MachineDominatorTree *MDT) const {
8330 // (S_FLBIT_I32_B64 hi:lo) ->
8331 //   (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8332 // (S_FF1_I32_B64 hi:lo) ->
8333 //   (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
8335 MachineBasicBlock &MBB = *Inst.getParent();
8336 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8337 MachineBasicBlock::iterator MII = Inst;
8338 const DebugLoc &DL = Inst.getDebugLoc();
8340 MachineOperand &Dest = Inst.getOperand(0);
8341 MachineOperand &Src = Inst.getOperand(1);
8343 const MCInstrDesc &InstDesc = get(Opcode);
8345 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8346 unsigned OpcodeAdd =
8347 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8349 const TargetRegisterClass *SrcRC =
8350 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8351 const TargetRegisterClass *SrcSubRC =
8352 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8354 MachineOperand SrcRegSub0 =
8355 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8356 MachineOperand SrcRegSub1 =
8357 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8359 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8360 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8361 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8362 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8364 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8366 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8368 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8369 .addReg(IsCtlz ? MidReg1 : MidReg2)
8370 .addImm(32)
8371 .addImm(1); // enable clamp
8373 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8374 .addReg(MidReg3)
8375 .addReg(IsCtlz ? MidReg2 : MidReg1);
8377 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8379 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8382 void SIInstrInfo::addUsersToMoveToVALUWorklist(
8383 Register DstReg, MachineRegisterInfo &MRI,
8384 SIInstrWorklist &Worklist) const {
8385 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8386 E = MRI.use_end(); I != E;) {
8387 MachineInstr &UseMI = *I->getParent();
8389 unsigned OpNo = 0;
8391 switch (UseMI.getOpcode()) {
8392 case AMDGPU::COPY:
8393 case AMDGPU::WQM:
8394 case AMDGPU::SOFT_WQM:
8395 case AMDGPU::STRICT_WWM:
8396 case AMDGPU::STRICT_WQM:
8397 case AMDGPU::REG_SEQUENCE:
8398 case AMDGPU::PHI:
8399 case AMDGPU::INSERT_SUBREG:
8400 break;
8401 default:
8402 OpNo = I.getOperandNo();
8403 break;
8406 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8407 Worklist.insert(&UseMI);
8409 do {
8410 ++I;
8411 } while (I != E && I->getParent() == &UseMI);
8412 } else {
8413 ++I;
8418 void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8419 MachineRegisterInfo &MRI,
8420 MachineInstr &Inst) const {
8421 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8422 MachineBasicBlock *MBB = Inst.getParent();
8423 MachineOperand &Src0 = Inst.getOperand(1);
8424 MachineOperand &Src1 = Inst.getOperand(2);
8425 const DebugLoc &DL = Inst.getDebugLoc();
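// Each s_pack_* variant below is expanded to VALU bit manipulation that, in
// effect, computes:
//   LL: Dst = (Src0 & 0xffff) | (Src1 << 16)
//   LH: Dst = (Src0 & 0xffff) | (Src1 & 0xffff0000)
//   HL: Dst = (Src0 >> 16)    | (Src1 << 16)
//   HH: Dst = (Src0 >> 16)    | (Src1 & 0xffff0000)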
8427 switch (Inst.getOpcode()) {
8428 case AMDGPU::S_PACK_LL_B32_B16: {
8429 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8430 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8432 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8433 // 0.
8434 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8435 .addImm(0xffff);
8437 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8438 .addReg(ImmReg, RegState::Kill)
8439 .add(Src0);
8441 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8442 .add(Src1)
8443 .addImm(16)
8444 .addReg(TmpReg, RegState::Kill);
8445 break;
8447 case AMDGPU::S_PACK_LH_B32_B16: {
8448 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8449 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8450 .addImm(0xffff);
8451 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8452 .addReg(ImmReg, RegState::Kill)
8453 .add(Src0)
8454 .add(Src1);
8455 break;
8457 case AMDGPU::S_PACK_HL_B32_B16: {
8458 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8459 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8460 .addImm(16)
8461 .add(Src0);
8462 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8463 .add(Src1)
8464 .addImm(16)
8465 .addReg(TmpReg, RegState::Kill);
8466 break;
8468 case AMDGPU::S_PACK_HH_B32_B16: {
8469 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8470 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8471 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8472 .addImm(16)
8473 .add(Src0);
8474 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8475 .addImm(0xffff0000);
8476 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8477 .add(Src1)
8478 .addReg(ImmReg, RegState::Kill)
8479 .addReg(TmpReg, RegState::Kill);
8480 break;
8482 default:
8483 llvm_unreachable("unhandled s_pack_* instruction");
8486 MachineOperand &Dest = Inst.getOperand(0);
8487 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8488 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8491 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8492 MachineInstr &SCCDefInst,
8493 SIInstrWorklist &Worklist,
8494 Register NewCond) const {
8496 // Ensure that def inst defines SCC, which is still live.
8497 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8498 !Op.isDead() && Op.getParent() == &SCCDefInst);
8499 SmallVector<MachineInstr *, 4> CopyToDelete;
8500 // This assumes that all the users of SCC are in the same block
8501 // as the SCC def.
8502 for (MachineInstr &MI : // Skip the def inst itself.
8503 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8504 SCCDefInst.getParent()->end())) {
8505 // Check if SCC is used first.
8506 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
8507 if (SCCIdx != -1) {
8508 if (MI.isCopy()) {
8509 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8510 Register DestReg = MI.getOperand(0).getReg();
8512 MRI.replaceRegWith(DestReg, NewCond);
8513 CopyToDelete.push_back(&MI);
8514 } else {
8516 if (NewCond.isValid())
8517 MI.getOperand(SCCIdx).setReg(NewCond);
8519 Worklist.insert(&MI);
8522 // Exit if we find another SCC def.
8523 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8524 break;
8526 for (auto &Copy : CopyToDelete)
8527 Copy->eraseFromParent();
8530 // Instructions that use SCC may be converted to VALU instructions. When that
8531 // happens, the SCC register is changed to VCC_LO. The instruction that defines
8532 // SCC must be changed to an instruction that defines VCC. This function makes
8533 // sure that the instruction that defines SCC is added to the moveToVALU
8534 // worklist.
8535 void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8536 SIInstrWorklist &Worklist) const {
8537 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8538 // then there is nothing to do because the defining instruction has been
8539 // converted to a VALU already. If SCC then that instruction needs to be
8540 // converted to a VALU.
8541 for (MachineInstr &MI :
8542 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8543 SCCUseInst->getParent()->rend())) {
8544 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8545 break;
8546 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8547 Worklist.insert(&MI);
8548 break;
8553 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8554 const MachineInstr &Inst) const {
8555 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8557 switch (Inst.getOpcode()) {
8558 // For target instructions, getOpRegClass just returns the virtual register
8559 // class associated with the operand, so we need to find an equivalent VGPR
8560 // register class in order to move the instruction to the VALU.
8561 case AMDGPU::COPY:
8562 case AMDGPU::PHI:
8563 case AMDGPU::REG_SEQUENCE:
8564 case AMDGPU::INSERT_SUBREG:
8565 case AMDGPU::WQM:
8566 case AMDGPU::SOFT_WQM:
8567 case AMDGPU::STRICT_WWM:
8568 case AMDGPU::STRICT_WQM: {
8569 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8570 if (RI.isAGPRClass(SrcRC)) {
8571 if (RI.isAGPRClass(NewDstRC))
8572 return nullptr;
8574 switch (Inst.getOpcode()) {
8575 case AMDGPU::PHI:
8576 case AMDGPU::REG_SEQUENCE:
8577 case AMDGPU::INSERT_SUBREG:
8578 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8579 break;
8580 default:
8581 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8584 if (!NewDstRC)
8585 return nullptr;
8586 } else {
8587 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8588 return nullptr;
8590 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8591 if (!NewDstRC)
8592 return nullptr;
8595 return NewDstRC;
8597 default:
8598 return NewDstRC;
8602 // Find the one SGPR operand we are allowed to use.
8603 Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8604 int OpIndices[3]) const {
8605 const MCInstrDesc &Desc = MI.getDesc();
8607 // Find the one SGPR operand we are allowed to use.
8609 // First we need to consider the instruction's operand requirements before
8610 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8611 // of VCC, but we are still bound by the constant bus requirement to only use
8612 // one.
8614 // If the operand's class is an SGPR, we can never move it.
8616 Register SGPRReg = findImplicitSGPRRead(MI);
8617 if (SGPRReg)
8618 return SGPRReg;
8620 Register UsedSGPRs[3] = {Register()};
8621 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8623 for (unsigned i = 0; i < 3; ++i) {
8624 int Idx = OpIndices[i];
8625 if (Idx == -1)
8626 break;
8628 const MachineOperand &MO = MI.getOperand(Idx);
8629 if (!MO.isReg())
8630 continue;
8632 // Is this operand statically required to be an SGPR based on the operand
8633 // constraints?
8634 const TargetRegisterClass *OpRC =
8635 RI.getRegClass(Desc.operands()[Idx].RegClass);
8636 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8637 if (IsRequiredSGPR)
8638 return MO.getReg();
8640 // If this could be a VGPR or an SGPR, check the dynamic register class.
8641 Register Reg = MO.getReg();
8642 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8643 if (RI.isSGPRClass(RegRC))
8644 UsedSGPRs[i] = Reg;
8647 // We don't have a required SGPR operand, so we have a bit more freedom in
8648 // selecting operands to move.
8650 // Try to select the most used SGPR. If an SGPR is equal to one of the
8651 // others, we choose that.
8653 // e.g.
8654 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8655 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8657 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8658 // prefer those.
8660 if (UsedSGPRs[0]) {
8661 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8662 SGPRReg = UsedSGPRs[0];
8665 if (!SGPRReg && UsedSGPRs[1]) {
8666 if (UsedSGPRs[1] == UsedSGPRs[2])
8667 SGPRReg = UsedSGPRs[1];
8670 return SGPRReg;
8673 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
8674 unsigned OperandName) const {
8675 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8676 if (Idx == -1)
8677 return nullptr;
8679 return &MI.getOperand(Idx);
8682 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
8683 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
8684 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
8685 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
8686 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
8687 return (Format << 44) |
8688 (1ULL << 56) | // RESOURCE_LEVEL = 1
8689 (3ULL << 60); // OOB_SELECT = 3
8692 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8693 if (ST.isAmdHsaOS()) {
8694 // Set ATC = 1. GFX9 doesn't have this bit.
8695 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8696 RsrcDataFormat |= (1ULL << 56);
8698 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8699 // BTW, it disables TC L2 and therefore decreases performance.
8700 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
8701 RsrcDataFormat |= (2ULL << 59);
8704 return RsrcDataFormat;
8707 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
8708 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
8709 AMDGPU::RSRC_TID_ENABLE |
8710 0xffffffff; // Size;
8712 // GFX9 doesn't have ELEMENT_SIZE.
8713 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
8714 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8715 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8718 // IndexStride encodes a stride of 64 for wave64 and 32 for wave32 (field values 3 and 2).
8719 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
8720 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8722 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8723 // Clear them unless we want a huge stride.
8724 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
8725 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
8726 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8728 return Rsrc23;
8731 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
8732 unsigned Opc = MI.getOpcode();
8734 return isSMRD(Opc);
8737 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
8738 return get(Opc).mayLoad() &&
8739 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8742 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
8743 int &FrameIndex) const {
8744 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8745 if (!Addr || !Addr->isFI())
8746 return Register();
8748 assert(!MI.memoperands_empty() &&
8749 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8751 FrameIndex = Addr->getIndex();
8752 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8755 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
8756 int &FrameIndex) const {
8757 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8758 assert(Addr && Addr->isFI());
8759 FrameIndex = Addr->getIndex();
8760 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8763 Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
8764 int &FrameIndex) const {
8765 if (!MI.mayLoad())
8766 return Register();
8768 if (isMUBUF(MI) || isVGPRSpill(MI))
8769 return isStackAccess(MI, FrameIndex);
8771 if (isSGPRSpill(MI))
8772 return isSGPRStackAccess(MI, FrameIndex);
8774 return Register();
8777 Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
8778 int &FrameIndex) const {
8779 if (!MI.mayStore())
8780 return Register();
8782 if (isMUBUF(MI) || isVGPRSpill(MI))
8783 return isStackAccess(MI, FrameIndex);
8785 if (isSGPRSpill(MI))
8786 return isSGPRStackAccess(MI, FrameIndex);
8788 return Register();
8791 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
8792 unsigned Size = 0;
8793 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
8794 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8795 while (++I != E && I->isInsideBundle()) {
8796 assert(!I->isBundle() && "No nested bundle!");
8797 Size += getInstSizeInBytes(*I);
8800 return Size;
8803 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
8804 unsigned Opc = MI.getOpcode();
8805 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
8806 unsigned DescSize = Desc.getSize();
8808 // If we have a definitive size, we can use it. Otherwise we need to inspect
8809 // the operands to know the size.
8810 if (isFixedSize(MI)) {
8811 unsigned Size = DescSize;
8813 // If we hit the buggy offset, an extra nop will be inserted in MC so
8814 // estimate the worst case.
8815 if (MI.isBranch() && ST.hasOffset3fBug())
8816 Size += 4;
8818 return Size;
8821 // Instructions may have a 32-bit literal encoded after them. Check
8822 // operands that could ever be literals.
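// For example, a VALU op whose base encoding is 4 bytes is reported as 8 bytes
// when one of its operands is a non-inline 32-bit constant.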
8823 if (isVALU(MI) || isSALU(MI)) {
8824 if (isDPP(MI))
8825 return DescSize;
8826 bool HasLiteral = false;
8827 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8828 const MachineOperand &Op = MI.getOperand(I);
8829 const MCOperandInfo &OpInfo = Desc.operands()[I];
8830 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8831 HasLiteral = true;
8832 break;
8835 return HasLiteral ? DescSize + 4 : DescSize;
8838 // Check whether we have extra NSA words.
8839 if (isMIMG(MI)) {
8840 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8841 if (VAddr0Idx < 0)
8842 return 8;
8844 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
8845 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8848 switch (Opc) {
8849 case TargetOpcode::BUNDLE:
8850 return getInstBundleSize(MI);
8851 case TargetOpcode::INLINEASM:
8852 case TargetOpcode::INLINEASM_BR: {
8853 const MachineFunction *MF = MI.getParent()->getParent();
8854 const char *AsmStr = MI.getOperand(0).getSymbolName();
8855 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8857 default:
8858 if (MI.isMetaInstruction())
8859 return 0;
8860 return DescSize;
8864 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
8865 if (!isFLAT(MI))
8866 return false;
8868 if (MI.memoperands_empty())
8869 return true;
8871 for (const MachineMemOperand *MMO : MI.memoperands()) {
8872 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8873 return true;
8875 return false;
8878 ArrayRef<std::pair<int, const char *>>
8879 SIInstrInfo::getSerializableTargetIndices() const {
8880 static const std::pair<int, const char *> TargetIndices[] = {
8881 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8882 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8883 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8884 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8885 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8886 return ArrayRef(TargetIndices);
8889 /// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
8890 /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8891 ScheduleHazardRecognizer *
8892 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
8893 const ScheduleDAG *DAG) const {
8894 return new GCNHazardRecognizer(DAG->MF);
8897 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8898 /// pass.
8899 ScheduleHazardRecognizer *
8900 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
8901 return new GCNHazardRecognizer(MF);
8904 // Called during:
8905 // - pre-RA scheduling and post-RA scheduling
8906 ScheduleHazardRecognizer *
8907 SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
8908 const ScheduleDAGMI *DAG) const {
8909 // Borrowed from Arm Target
8910 // We would like to restrict this hazard recognizer to only
8911 // post-RA scheduling; we can tell that we're post-RA because we don't
8912 // track VRegLiveness.
8913 if (!DAG->hasVRegLiveness())
8914 return new GCNHazardRecognizer(DAG->MF);
8915 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
8918 std::pair<unsigned, unsigned>
8919 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
8920 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
8923 ArrayRef<std::pair<unsigned, const char *>>
8924 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
8925 static const std::pair<unsigned, const char *> TargetFlags[] = {
8926 { MO_GOTPCREL, "amdgpu-gotprel" },
8927 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
8928 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
8929 { MO_REL32_LO, "amdgpu-rel32-lo" },
8930 { MO_REL32_HI, "amdgpu-rel32-hi" },
8931 { MO_ABS32_LO, "amdgpu-abs32-lo" },
8932 { MO_ABS32_HI, "amdgpu-abs32-hi" },
8935 return ArrayRef(TargetFlags);
8938 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
8939 SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
8940 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8942 {MONoClobber, "amdgpu-noclobber"},
8943 {MOLastUse, "amdgpu-last-use"},
8946 return ArrayRef(TargetFlags);
8949 unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
8950 const MachineFunction &MF) const {
8951 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8952 assert(SrcReg.isVirtual());
8953 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
8954 return AMDGPU::WWM_COPY;
8956 return AMDGPU::COPY;
8959 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
8960 Register Reg) const {
8961 // We need to handle instructions which may be inserted during register
8962 // allocation to handle the prolog. The initial prolog instruction may have
8963 // been separated from the start of the block by spills and copies inserted
8964 // for the prolog. However, the insertions for scalar registers can
8965 // always be placed at the BB top as they are independent of the exec mask
8966 // value.
8967 const MachineFunction *MF = MI.getParent()->getParent();
8968 bool IsNullOrVectorRegister = true;
8969 if (Reg) {
8970 const MachineRegisterInfo &MRI = MF->getRegInfo();
8971 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
8974 uint16_t Opcode = MI.getOpcode();
8975 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
8976 return IsNullOrVectorRegister &&
8977 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
8978 (Opcode == AMDGPU::IMPLICIT_DEF &&
8979 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
8980 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
8981 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
8984 MachineInstrBuilder
8985 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
8986 MachineBasicBlock::iterator I,
8987 const DebugLoc &DL,
8988 Register DestReg) const {
8989 if (ST.hasAddNoCarry())
8990 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
8992 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8993 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
8994 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
8996 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8997 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9000 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9001 MachineBasicBlock::iterator I,
9002 const DebugLoc &DL,
9003 Register DestReg,
9004 RegScavenger &RS) const {
9005 if (ST.hasAddNoCarry())
9006 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9008 // If available, prefer to use vcc.
9009 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9010 ? Register(RI.getVCC())
9011 : RS.scavengeRegisterBackwards(
9012 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9013 0, /* AllowSpill */ false);
9015 // TODO: Users need to deal with this.
9016 if (!UnusedCarry.isValid())
9017 return MachineInstrBuilder();
9019 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9020 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9023 bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9024 switch (Opcode) {
9025 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9026 case AMDGPU::SI_KILL_I1_TERMINATOR:
9027 return true;
9028 default:
9029 return false;
9033 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
9034 switch (Opcode) {
9035 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9036 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9037 case AMDGPU::SI_KILL_I1_PSEUDO:
9038 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9039 default:
9040 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9044 bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9045 return Imm <= getMaxMUBUFImmOffset(ST);
9048 unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9049 // GFX12 has a 24-bit signed byte-offset field, but only the non-negative range (23 bits) is usable here; earlier targets have a 12-bit unsigned field.
9050 const unsigned OffsetBits =
9051 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9052 return (1 << OffsetBits) - 1;
9055 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9056 if (!ST.isWave32())
9057 return;
9059 if (MI.isInlineAsm())
9060 return;
9062 for (auto &Op : MI.implicit_operands()) {
9063 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9064 Op.setReg(AMDGPU::VCC_LO);
9068 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9069 if (!isSMRD(MI))
9070 return false;
9072 // Check that it is using a buffer resource.
9073 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9074 if (Idx == -1) // e.g. s_memtime
9075 return false;
9077 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9078 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9081 // Given Imm, split it into the values to put into the SOffset and ImmOffset
9082 // fields in an MUBUF instruction. Return false if it is not possible (due to a
9083 // hardware bug needing a workaround).
9085 // The required alignment ensures that individual address components remain
9086 // aligned if they are aligned to begin with. It also ensures that additional
9087 // offsets within the given alignment can be added to the resulting ImmOffset.
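// For illustration, assuming the pre-GFX12 4095-byte immediate limit and
// Align(4): Imm = 5000 gives MaxImm = 4092, High = (5004 & ~4095) = 4096 and
// Low = 5004 & 4095 = 908, so the split is ImmOffset = 908, SOffset = 4092.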
9088 bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9089 uint32_t &ImmOffset, Align Alignment) const {
9090 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9091 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9092 uint32_t Overflow = 0;
9094 if (Imm > MaxImm) {
9095 if (Imm <= MaxImm + 64) {
9096 // Use an SOffset inline constant for 4..64
9097 Overflow = Imm - MaxImm;
9098 Imm = MaxImm;
9099 } else {
9100 // Try to keep the same value in SOffset for adjacent loads, so that
9101 // the corresponding register contents can be re-used.
9103 // Load values with all low-bits (except for alignment bits) set into
9104 // SOffset, so that a larger range of values can be covered using
9105 // s_movk_i32.
9107 // Atomic operations fail to work correctly when individual address
9108 // components are unaligned, even if their sum is aligned.
9109 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9110 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9111 Imm = Low;
9112 Overflow = High - Alignment.value();
9116 if (Overflow > 0) {
9117 // There is a hardware bug in SI and CI which prevents address clamping in
9118 // MUBUF instructions from working correctly with SOffsets. The immediate
9119 // offset is unaffected.
9120 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9121 return false;
9123 // It is not possible to set immediate in SOffset field on some targets.
9124 if (ST.hasRestrictedSOffset())
9125 return false;
9128 ImmOffset = Imm;
9129 SOffset = Overflow;
9130 return true;
9133 // Depending on the used address space and instructions, some immediate offsets
9134 // are allowed and some are not.
9135 // Pre-GFX12, flat instruction offsets can only be non-negative, while global
9136 // and scratch instruction offsets can also be negative. On GFX12, offsets can be
9137 // negative for all variants.
9139 // There are several bugs related to these offsets:
9140 // On gfx10.1, flat instructions that go into the global address space cannot
9141 // use an offset.
9143 // For scratch instructions, the address can be either an SGPR or a VGPR.
9144 // The following offsets can be used, depending on the architecture (x means
9145 // cannot be used):
9146 // +----------------------------+------+------+
9147 // | Address-Mode | SGPR | VGPR |
9148 // +----------------------------+------+------+
9149 // | gfx9 | | |
9150 // | negative, 4-aligned offset | x | ok |
9151 // | negative, unaligned offset | x | ok |
9152 // +----------------------------+------+------+
9153 // | gfx10 | | |
9154 // | negative, 4-aligned offset | ok | ok |
9155 // | negative, unaligned offset | ok | x |
9156 // +----------------------------+------+------+
9157 // | gfx10.3 | | |
9158 // | negative, 4-aligned offset | ok | ok |
9159 // | negative, unaligned offset | ok | ok |
9160 // +----------------------------+------+------+
9162 // This function ignores the addressing mode, so if an offset cannot be used in
9163 // one addressing mode, it is considered illegal.
9164 bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9165 uint64_t FlatVariant) const {
9166 // TODO: Should 0 be special cased?
9167 if (!ST.hasFlatInstOffsets())
9168 return false;
9170 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9171 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9172 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9173 return false;
9175 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9176 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9177 (Offset % 4) != 0) {
9178 return false;
9181 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9182 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9183 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9186 // See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
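// Rough example, assuming a 13-bit signed immediate field (NumBits == 12) and
// a variant that allows negative offsets: COffsetVal = -9000 splits into
// RemainderOffset = -8192 and ImmField = -808, since signed division by
// D = 4096 truncates toward zero.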
9187 std::pair<int64_t, int64_t>
9188 SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9189 uint64_t FlatVariant) const {
9190 int64_t RemainderOffset = COffsetVal;
9191 int64_t ImmField = 0;
9193 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9194 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9196 if (AllowNegative) {
9197 // Use signed division by a power of two to truncate towards 0.
9198 int64_t D = 1LL << NumBits;
9199 RemainderOffset = (COffsetVal / D) * D;
9200 ImmField = COffsetVal - RemainderOffset;
9202 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9203 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9204 (ImmField % 4) != 0) {
9205 // Make ImmField a multiple of 4
9206 RemainderOffset += ImmField % 4;
9207 ImmField -= ImmField % 4;
9209 } else if (COffsetVal >= 0) {
9210 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9211 RemainderOffset = COffsetVal - ImmField;
9214 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9215 assert(RemainderOffset + ImmField == COffsetVal);
9216 return {ImmField, RemainderOffset};
9219 bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
9220 if (ST.hasNegativeScratchOffsetBug() &&
9221 FlatVariant == SIInstrFlags::FlatScratch)
9222 return false;
9224 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9227 static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9228 switch (ST.getGeneration()) {
9229 default:
9230 break;
9231 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
9232 case AMDGPUSubtarget::SEA_ISLANDS:
9233 return SIEncodingFamily::SI;
9234 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
9235 case AMDGPUSubtarget::GFX9:
9236 return SIEncodingFamily::VI;
9237 case AMDGPUSubtarget::GFX10:
9238 return SIEncodingFamily::GFX10;
9239 case AMDGPUSubtarget::GFX11:
9240 return SIEncodingFamily::GFX11;
9241 case AMDGPUSubtarget::GFX12:
9242 return SIEncodingFamily::GFX12;
9244 llvm_unreachable("Unknown subtarget generation!");
9247 bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9248 switch(MCOp) {
9249 // These opcodes use indirect register addressing, so
9250 // they need special handling by codegen (currently missing).
9251 // Therefore it is too risky to allow these opcodes
9252 // to be selected by the DPP combiner or the SDWA peephole pass.
9253 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9254 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9255 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9256 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9257 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9258 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9259 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9260 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9261 return true;
9262 default:
9263 return false;
9267 #define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9268 case OPCODE##_dpp: \
9269 case OPCODE##_e32: \
9270 case OPCODE##_e64: \
9271 case OPCODE##_e64_dpp: \
9272 case OPCODE##_sdwa:
9274 static bool isRenamedInGFX9(int Opcode) {
9275 switch (Opcode) {
9276 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9277 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9278 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9279 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9280 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9281 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9282 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9283 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9284 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9286 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
9287 case AMDGPU::V_FMA_F16_gfx9_e64:
9288 case AMDGPU::V_INTERP_P2_F16:
9289 case AMDGPU::V_MAD_F16_e64:
9290 case AMDGPU::V_MAD_U16_e64:
9291 case AMDGPU::V_MAD_I16_e64:
9292 return true;
9293 default:
9294 return false;
9298 int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9299 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9301 unsigned Gen = subtargetEncodingFamily(ST);
9303 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
9304 Gen = SIEncodingFamily::GFX9;
9306 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9307 // subtarget has the UnpackedD16VMem feature.
9308 // TODO: remove this when we discard GFX80 encoding.
9309 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9310 Gen = SIEncodingFamily::GFX80;
9312 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9313 switch (ST.getGeneration()) {
9314 default:
9315 Gen = SIEncodingFamily::SDWA;
9316 break;
9317 case AMDGPUSubtarget::GFX9:
9318 Gen = SIEncodingFamily::SDWA9;
9319 break;
9320 case AMDGPUSubtarget::GFX10:
9321 Gen = SIEncodingFamily::SDWA10;
9322 break;
9326 if (isMAI(Opcode)) {
9327 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9328 if (MFMAOp != -1)
9329 Opcode = MFMAOp;
9332 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9334 // -1 means that Opcode is already a native instruction.
9335 if (MCOp == -1)
9336 return Opcode;
9338 if (ST.hasGFX90AInsts()) {
9339 uint16_t NMCOp = (uint16_t)-1;
9340 if (ST.hasGFX940Insts())
9341 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
9342 if (NMCOp == (uint16_t)-1)
9343 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
9344 if (NMCOp == (uint16_t)-1)
9345 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
9346 if (NMCOp != (uint16_t)-1)
9347 MCOp = NMCOp;
9350 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9351 // no encoding in the given subtarget generation.
9352 if (MCOp == (uint16_t)-1)
9353 return -1;
9355 if (isAsmOnlyOpcode(MCOp))
9356 return -1;
9358 return MCOp;
9361 static
9362 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
9363 assert(RegOpnd.isReg());
9364 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9365 getRegSubRegPair(RegOpnd);
9368 TargetInstrInfo::RegSubRegPair
9369 llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
9370 assert(MI.isRegSequence());
9371 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9372 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9373 auto &RegOp = MI.getOperand(1 + 2 * I);
9374 return getRegOrUndef(RegOp);
9376 return TargetInstrInfo::RegSubRegPair();
9379 // Try to find the definition of reg:subreg in subreg-manipulation pseudos
9380 // Following a subreg of reg:subreg isn't supported
9381 static bool followSubRegDef(MachineInstr &MI,
9382 TargetInstrInfo::RegSubRegPair &RSR) {
9383 if (!RSR.SubReg)
9384 return false;
9385 switch (MI.getOpcode()) {
9386 default: break;
9387 case AMDGPU::REG_SEQUENCE:
9388 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9389 return true;
9390 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
9391 case AMDGPU::INSERT_SUBREG:
9392 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9393 // inserted the subreg we're looking for
9394 RSR = getRegOrUndef(MI.getOperand(2));
9395 else { // the subreg in the rest of the reg
9396 auto R1 = getRegOrUndef(MI.getOperand(1));
9397 if (R1.SubReg) // subreg of subreg isn't supported
9398 return false;
9399 RSR.Reg = R1.Reg;
9401 return true;
9403 return false;
9406 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
9407 MachineRegisterInfo &MRI) {
9408 assert(MRI.isSSA());
9409 if (!P.Reg.isVirtual())
9410 return nullptr;
9412 auto RSR = P;
9413 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9414 while (auto *MI = DefInst) {
9415 DefInst = nullptr;
9416 switch (MI->getOpcode()) {
9417 case AMDGPU::COPY:
9418 case AMDGPU::V_MOV_B32_e32: {
9419 auto &Op1 = MI->getOperand(1);
9420 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9421 if (Op1.isUndef())
9422 return nullptr;
9423 RSR = getRegSubRegPair(Op1);
9424 DefInst = MRI.getVRegDef(RSR.Reg);
9426 break;
9428 default:
9429 if (followSubRegDef(*MI, RSR)) {
9430 if (!RSR.Reg)
9431 return nullptr;
9432 DefInst = MRI.getVRegDef(RSR.Reg);
9435 if (!DefInst)
9436 return MI;
9438 return nullptr;
9441 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
9442 Register VReg,
9443 const MachineInstr &DefMI,
9444 const MachineInstr &UseMI) {
9445 assert(MRI.isSSA() && "Must be run on SSA");
9447 auto *TRI = MRI.getTargetRegisterInfo();
9448 auto *DefBB = DefMI.getParent();
9450 // Don't bother searching between blocks, although it is possible this block
9451 // doesn't modify exec.
9452 if (UseMI.getParent() != DefBB)
9453 return true;
9455 const int MaxInstScan = 20;
9456 int NumInst = 0;
9458 // Stop scan at the use.
9459 auto E = UseMI.getIterator();
9460 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9461 if (I->isDebugInstr())
9462 continue;
9464 if (++NumInst > MaxInstScan)
9465 return true;
9467 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9468 return true;
9471 return false;
9474 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
9475 Register VReg,
9476 const MachineInstr &DefMI) {
9477 assert(MRI.isSSA() && "Must be run on SSA");
9479 auto *TRI = MRI.getTargetRegisterInfo();
9480 auto *DefBB = DefMI.getParent();
9482 const int MaxUseScan = 10;
9483 int NumUse = 0;
9485 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9486 auto &UseInst = *Use.getParent();
9487 // Don't bother searching between blocks, although it is possible this block
9488 // doesn't modify exec.
9489 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9490 return true;
9492 if (++NumUse > MaxUseScan)
9493 return true;
9496 if (NumUse == 0)
9497 return false;
9499 const int MaxInstScan = 20;
9500 int NumInst = 0;
9502 // Stop scan when we have seen all the uses.
9503 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9504 assert(I != DefBB->end());
9506 if (I->isDebugInstr())
9507 continue;
9509 if (++NumInst > MaxInstScan)
9510 return true;
9512 for (const MachineOperand &Op : I->operands()) {
9513 // We don't check reg masks here as they're used only on calls:
9514 // 1. EXEC is only considered const within one BB
9515 // 2. Call should be a terminator instruction if present in a BB
9517 if (!Op.isReg())
9518 continue;
9520 Register Reg = Op.getReg();
9521 if (Op.isUse()) {
9522 if (Reg == VReg && --NumUse == 0)
9523 return false;
9524 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9525 return true;
9530 MachineInstr *SIInstrInfo::createPHIDestinationCopy(
9531 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
9532 const DebugLoc &DL, Register Src, Register Dst) const {
9533 auto Cur = MBB.begin();
9534 if (Cur != MBB.end())
9535 do {
9536 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9537 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9538 ++Cur;
9539 } while (Cur != MBB.end() && Cur != LastPHIIt);
9541 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9542 Dst);
9545 MachineInstr *SIInstrInfo::createPHISourceCopy(
9546 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
9547 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9548 if (InsPt != MBB.end() &&
9549 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9550 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9551 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9552 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9553 InsPt++;
9554 return BuildMI(MBB, InsPt, DL,
9555 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9556 : AMDGPU::S_MOV_B64_term),
9557 Dst)
9558 .addReg(Src, 0, SrcSubReg)
9559 .addReg(AMDGPU::EXEC, RegState::Implicit);
9561 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9562 Dst);
9565 bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9567 MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
9568 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
9569 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9570 VirtRegMap *VRM) const {
9571 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9573 // %0:sreg_32 = COPY $m0
9575 // We explicitly chose SReg_32 for the virtual register so such a copy might
9576 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9577 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9578 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9579 // TargetInstrInfo::foldMemoryOperand() is going to try.
9580 // A similar issue also exists with spilling and reloading $exec registers.
9582 // To prevent that, constrain the %0 register class here.
9583 if (isFullCopyInstr(MI)) {
9584 Register DstReg = MI.getOperand(0).getReg();
9585 Register SrcReg = MI.getOperand(1).getReg();
9586 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9587 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9588 MachineRegisterInfo &MRI = MF.getRegInfo();
9589 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9590 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9591 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9592 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9593 return nullptr;
9595 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9596 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9597 return nullptr;
9602 return nullptr;
9605 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
9606 const MachineInstr &MI,
9607 unsigned *PredCost) const {
9608 if (MI.isBundle()) {
9609 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
9610 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9611 unsigned Lat = 0, Count = 0;
9612 for (++I; I != E && I->isBundledWithPred(); ++I) {
9613 ++Count;
9614 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9616 return Lat + Count - 1;
9619 return SchedModel.computeInstrLatency(&MI);
9622 InstructionUniformity
9623 SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
9624 unsigned opcode = MI.getOpcode();
9625 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9626 auto IID = GI->getIntrinsicID();
9627 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
9628 return InstructionUniformity::NeverUniform;
9629 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
9630 return InstructionUniformity::AlwaysUniform;
9632 switch (IID) {
9633 case Intrinsic::amdgcn_if:
9634 case Intrinsic::amdgcn_else:
9635 // FIXME: Uniform if second result
9636 break;
9639 return InstructionUniformity::Default;
9642 // Loads from the private and flat address spaces are divergent, because
9643 // threads can execute the load instruction with the same inputs and get
9644 // different results.
9646 // All other loads are not divergent, because if threads issue loads with the
9647 // same arguments, they will always get the same result.
9648 if (opcode == AMDGPU::G_LOAD) {
9649 if (MI.memoperands_empty())
9650 return InstructionUniformity::NeverUniform; // conservative assumption
9652 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9653 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9654 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9655 })) {
9656 // At least one MMO in a non-global address space.
9657 return InstructionUniformity::NeverUniform;
9659 return InstructionUniformity::Default;
9662 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9663 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9664 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9665 AMDGPU::isGenericAtomic(opcode)) {
9666 return InstructionUniformity::NeverUniform;
9668 return InstructionUniformity::Default;
9671 InstructionUniformity
9672 SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
9674 if (isNeverUniform(MI))
9675 return InstructionUniformity::NeverUniform;
9677 unsigned opcode = MI.getOpcode();
9678 if (opcode == AMDGPU::V_READLANE_B32 ||
9679 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9680 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9681 return InstructionUniformity::AlwaysUniform;
9683 if (isCopyInstr(MI)) {
9684 const MachineOperand &srcOp = MI.getOperand(1);
9685 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9686 const TargetRegisterClass *regClass =
9687 RI.getPhysRegBaseClass(srcOp.getReg());
9688 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
9689 : InstructionUniformity::NeverUniform;
9691 return InstructionUniformity::Default;
9694 // GMIR handling
9695 if (MI.isPreISelOpcode())
9696 return SIInstrInfo::getGenericInstructionUniformity(MI);
9698 // Atomics are divergent because they are executed sequentially: when an
9699 // atomic operation refers to the same address in each thread, then each
9700 // thread after the first sees the value written by the previous thread as
9701 // the original value.
9703 if (isAtomic(MI))
9704 return InstructionUniformity::NeverUniform;
9706 // Loads from the private and flat address spaces are divergent, because
9707 // threads can execute the load instruction with the same inputs and get
9708 // different results.
9709 if (isFLAT(MI) && MI.mayLoad()) {
9710 if (MI.memoperands_empty())
9711 return InstructionUniformity::NeverUniform; // conservative assumption
9713 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9714 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9715 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9716 })) {
9717 // At least one MMO in a non-global address space.
9718 return InstructionUniformity::NeverUniform;
9721 return InstructionUniformity::Default;
9724 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9725 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9727 // FIXME: It's conceptually broken to report this for an instruction, and not
9728 // a specific def operand. For inline asm in particular, there could be mixed
9729 // uniform and divergent results.
9730 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9731 const MachineOperand &SrcOp = MI.getOperand(I);
9732 if (!SrcOp.isReg())
9733 continue;
9735 Register Reg = SrcOp.getReg();
9736 if (!Reg || !SrcOp.readsReg())
9737 continue;
9739 // If RegBank is null, this is unassigned or an unallocatable special
9740 // register, which are all scalars.
9741 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9742 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9743 return InstructionUniformity::NeverUniform;
9746 // TODO: The uniformity check conditions above can be rearranged for more
9747 // readability.
9749 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9750 // currently turned into no-op COPYs by SelectionDAG ISel and are
9751 // therefore no longer recognizable.
9753 return InstructionUniformity::Default;
9756 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
9757 switch (MF.getFunction().getCallingConv()) {
9758 case CallingConv::AMDGPU_PS:
9759 return 1;
9760 case CallingConv::AMDGPU_VS:
9761 return 2;
9762 case CallingConv::AMDGPU_GS:
9763 return 3;
9764 case CallingConv::AMDGPU_HS:
9765 case CallingConv::AMDGPU_LS:
9766 case CallingConv::AMDGPU_ES:
9767 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9768 case CallingConv::AMDGPU_CS:
9769 case CallingConv::AMDGPU_KERNEL:
9770 case CallingConv::C:
9771 case CallingConv::Fast:
9772 default:
9773 // Assume other calling conventions are various compute callable functions
9774 return 0;
bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                 Register &SrcReg2, int64_t &CmpMask,
                                 int64_t &CmpValue) const {
  if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AMDGPU::S_CMP_EQ_U32:
  case AMDGPU::S_CMP_EQ_I32:
  case AMDGPU::S_CMP_LG_U32:
  case AMDGPU::S_CMP_LG_I32:
  case AMDGPU::S_CMP_LT_U32:
  case AMDGPU::S_CMP_LT_I32:
  case AMDGPU::S_CMP_GT_U32:
  case AMDGPU::S_CMP_GT_I32:
  case AMDGPU::S_CMP_LE_U32:
  case AMDGPU::S_CMP_LE_I32:
  case AMDGPU::S_CMP_GE_U32:
  case AMDGPU::S_CMP_GE_I32:
  case AMDGPU::S_CMP_EQ_U64:
  case AMDGPU::S_CMP_LG_U64:
    SrcReg = MI.getOperand(0).getReg();
    if (MI.getOperand(1).isReg()) {
      if (MI.getOperand(1).getSubReg())
        return false;
      SrcReg2 = MI.getOperand(1).getReg();
      CmpValue = 0;
    } else if (MI.getOperand(1).isImm()) {
      SrcReg2 = Register();
      CmpValue = MI.getOperand(1).getImm();
    } else {
      return false;
    }
    CmpMask = ~0;
    return true;
  case AMDGPU::S_CMPK_EQ_U32:
  case AMDGPU::S_CMPK_EQ_I32:
  case AMDGPU::S_CMPK_LG_U32:
  case AMDGPU::S_CMPK_LG_I32:
  case AMDGPU::S_CMPK_LT_U32:
  case AMDGPU::S_CMPK_LT_I32:
  case AMDGPU::S_CMPK_GT_U32:
  case AMDGPU::S_CMPK_GT_I32:
  case AMDGPU::S_CMPK_LE_U32:
  case AMDGPU::S_CMPK_LE_I32:
  case AMDGPU::S_CMPK_GE_U32:
  case AMDGPU::S_CMPK_GE_I32:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = Register();
    CmpValue = MI.getOperand(1).getImm();
    CmpMask = ~0;
    return true;
  }

  return false;
}
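// For example (illustrative virtual registers), analyzeCompare on
//   S_CMP_LG_U32 %5, 12
// produces SrcReg = %5, SrcReg2 = <no register>, CmpValue = 12 and
// CmpMask = ~0, while S_CMP_LG_U32 %5, %6 produces SrcReg2 = %6 and
// CmpValue = 0.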
bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
                                       Register SrcReg2, int64_t CmpMask,
                                       int64_t CmpValue,
                                       const MachineRegisterInfo *MRI) const {
  if (!SrcReg || SrcReg.isPhysical())
    return false;

  if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
    return false;
  const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
                               this](int64_t ExpectedValue, unsigned SrcSize,
                                     bool IsReversible, bool IsSigned) -> bool {
    // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
    // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
    //
    // Signed ge/gt are not used for the sign bit.
    //
    // If the result of the AND is unused except in the compare:
    // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
    //
    // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
    // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
    // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
    // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
    // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
    // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
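    //
    // A concrete instance of the unused-result case (illustrative registers,
    // n = 4):
    //   %1:sreg_32 = S_AND_B32 %0:sreg_32, 16, implicit-def dead $scc
    //   S_CMP_LG_U32 %1, 0, implicit-def $scc
    //   S_CBRANCH_SCC1 %bb.2, implicit $scc
    // becomes, when %1 has no other uses,
    //   S_BITCMP1_B32 %0, 4, implicit-def $scc
    //   S_CBRANCH_SCC1 %bb.2, implicit $scc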
    MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
    if (!Def || Def->getParent() != CmpInstr.getParent())
      return false;

    if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
        Def->getOpcode() != AMDGPU::S_AND_B64)
      return false;

    int64_t Mask;
    const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
      if (MO->isImm())
        Mask = MO->getImm();
      else if (!getFoldableImm(MO, Mask))
        return false;
      Mask &= maxUIntN(SrcSize);
      return isPowerOf2_64(Mask);
    };
    MachineOperand *SrcOp = &Def->getOperand(1);
    if (isMask(SrcOp))
      SrcOp = &Def->getOperand(2);
    else if (isMask(&Def->getOperand(2)))
      SrcOp = &Def->getOperand(1);
    else
      return false;

    // A valid Mask is required to have a single bit set, hence a non-zero and
    // power-of-two value. This guarantees that BitNo is below 64, so the shift
    // below is never a shift by 64 bits.
    assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
    unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
    if (IsSigned && BitNo == SrcSize - 1)
      return false;
    ExpectedValue <<= BitNo;

    bool IsReversedCC = false;
    if (CmpValue != ExpectedValue) {
      if (!IsReversible)
        return false;
      IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
      if (!IsReversedCC)
        return false;
    }

    Register DefReg = Def->getOperand(0).getReg();
    if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
      return false;
    for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
         I != E; ++I) {
      if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
          I->killsRegister(AMDGPU::SCC, &RI))
        return false;
    }

    MachineOperand *SccDef =
        Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
    SccDef->setIsDead(false);
    CmpInstr.eraseFromParent();

    if (!MRI->use_nodbg_empty(DefReg)) {
      assert(!IsReversedCC);
      return true;
    }
    // Replace an AND whose result is unused with an S_BITCMP.
    MachineBasicBlock *MBB = Def->getParent();

    unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
                                                     : AMDGPU::S_BITCMP1_B32
                                      : IsReversedCC ? AMDGPU::S_BITCMP0_B64
                                                     : AMDGPU::S_BITCMP1_B64;

    BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
        .add(*SrcOp)
        .addImm(BitNo);
    Def->eraseFromParent();

    return true;
  };
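  // The switch below selects the optimizeCmpAnd arguments (ExpectedValue,
  // SrcSize, IsReversible, IsSigned) per compare opcode. For example,
  // S_CMP_LG_U32 uses (0, 32, true, false): comparing the AND result against
  // 0 is a bit test, and the reversed compare against the mask itself can be
  // folded too, whereas the signed S_CMP_GE_I32 mapping (1, 32, false, true)
  // must bail out when the tested bit is the sign bit.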
  switch (CmpInstr.getOpcode()) {
  default:
    break;
  case AMDGPU::S_CMP_EQ_U32:
  case AMDGPU::S_CMP_EQ_I32:
  case AMDGPU::S_CMPK_EQ_U32:
  case AMDGPU::S_CMPK_EQ_I32:
    return optimizeCmpAnd(1, 32, true, false);
  case AMDGPU::S_CMP_GE_U32:
  case AMDGPU::S_CMPK_GE_U32:
    return optimizeCmpAnd(1, 32, false, false);
  case AMDGPU::S_CMP_GE_I32:
  case AMDGPU::S_CMPK_GE_I32:
    return optimizeCmpAnd(1, 32, false, true);
  case AMDGPU::S_CMP_EQ_U64:
    return optimizeCmpAnd(1, 64, true, false);
  case AMDGPU::S_CMP_LG_U32:
  case AMDGPU::S_CMP_LG_I32:
  case AMDGPU::S_CMPK_LG_U32:
  case AMDGPU::S_CMPK_LG_I32:
    return optimizeCmpAnd(0, 32, true, false);
  case AMDGPU::S_CMP_GT_U32:
  case AMDGPU::S_CMPK_GT_U32:
    return optimizeCmpAnd(0, 32, false, false);
  case AMDGPU::S_CMP_GT_I32:
  case AMDGPU::S_CMPK_GT_I32:
    return optimizeCmpAnd(0, 32, false, true);
  case AMDGPU::S_CMP_LG_U64:
    return optimizeCmpAnd(0, 64, true, false);
  }

  return false;
}
void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
                                            unsigned OpName) const {
  if (!ST.needsAlignedVGPRs())
    return;

  int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
  if (OpNo < 0)
    return;
  MachineOperand &Op = MI.getOperand(OpNo);
  if (getOpSize(MI, OpNo) > 4)
    return;
  // Add an implicit aligned super-reg to force alignment on the data operand.
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();
  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  Register DataReg = Op.getReg();
  bool IsAGPR = RI.isAGPR(MRI, DataReg);
  Register Undef = MRI.createVirtualRegister(
      IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
  BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
  Register NewVR =
      MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
                                       : &AMDGPU::VReg_64_Align2RegClass);
  BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
      .addReg(DataReg, 0, Op.getSubReg())
      .addImm(AMDGPU::sub0)
      .addReg(Undef)
      .addImm(AMDGPU::sub1);
  Op.setReg(NewVR);
  Op.setSubReg(AMDGPU::sub0);
  MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
}
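// The net effect for a 32-bit data operand on a subtarget that requires
// aligned VGPRs (illustrative MIR, assuming a DS write):
//   DS_WRITE_B32_gfx9 %addr, %data:vgpr_32, ...
// becomes
//   %undef:vgpr_32 = IMPLICIT_DEF
//   %pair:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0, %undef, %subreg.sub1
//   DS_WRITE_B32_gfx9 %addr, %pair.sub0, ..., implicit %pair
// which forces the register allocator to place %data in the even half of an
// aligned VGPR pair.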