[AMDGPU] New gfx940 mfma instructions
[llvm-project.git] / llvm / lib / Target / AMDGPU / SIFoldOperands.cpp
blob 367951458ea4559db0d3c109a58cfc332471f24d
1 //===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 /// \file
8 //===----------------------------------------------------------------------===//
9 //
11 #include "AMDGPU.h"
12 #include "GCNSubtarget.h"
13 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
14 #include "SIMachineFunctionInfo.h"
15 #include "llvm/ADT/DepthFirstIterator.h"
16 #include "llvm/CodeGen/MachineFunctionPass.h"
18 #define DEBUG_TYPE "si-fold-operands"
19 using namespace llvm;
21 namespace {
23 struct FoldCandidate {
24 MachineInstr *UseMI;
25 union {
26 MachineOperand *OpToFold;
27 uint64_t ImmToFold;
28 int FrameIndexToFold;
30 int ShrinkOpcode;
31 unsigned UseOpNo;
32 MachineOperand::MachineOperandType Kind;
33 bool Commuted;
35 FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
36 bool Commuted_ = false,
37 int ShrinkOp = -1) :
38 UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
39 Kind(FoldOp->getType()),
40 Commuted(Commuted_) {
41 if (FoldOp->isImm()) {
42 ImmToFold = FoldOp->getImm();
43 } else if (FoldOp->isFI()) {
44 FrameIndexToFold = FoldOp->getIndex();
45 } else {
46 assert(FoldOp->isReg() || FoldOp->isGlobal());
47 OpToFold = FoldOp;
51 bool isFI() const {
52 return Kind == MachineOperand::MO_FrameIndex;
55 bool isImm() const {
56 return Kind == MachineOperand::MO_Immediate;
59 bool isReg() const {
60 return Kind == MachineOperand::MO_Register;
63 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
65 bool isCommuted() const {
66 return Commuted;
69 bool needsShrink() const {
70 return ShrinkOpcode != -1;
73 int getShrinkOpcode() const {
74 return ShrinkOpcode;
78 class SIFoldOperands : public MachineFunctionPass {
79 public:
80 static char ID;
81 MachineRegisterInfo *MRI;
82 const SIInstrInfo *TII;
83 const SIRegisterInfo *TRI;
84 const GCNSubtarget *ST;
85 const SIMachineFunctionInfo *MFI;
87 void foldOperand(MachineOperand &OpToFold,
88 MachineInstr *UseMI,
89 int UseOpIdx,
90 SmallVectorImpl<FoldCandidate> &FoldList,
91 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
93 bool tryFoldCndMask(MachineInstr &MI) const;
94 bool tryFoldZeroHighBits(MachineInstr &MI) const;
95 bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
97 const MachineOperand *isClamp(const MachineInstr &MI) const;
98 bool tryFoldClamp(MachineInstr &MI);
100 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
101 bool tryFoldOMod(MachineInstr &MI);
102 bool tryFoldRegSequence(MachineInstr &MI);
103 bool tryFoldLCSSAPhi(MachineInstr &MI);
104 bool tryFoldLoad(MachineInstr &MI);
106 public:
107 SIFoldOperands() : MachineFunctionPass(ID) {
108 initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
111 bool runOnMachineFunction(MachineFunction &MF) override;
113 StringRef getPassName() const override { return "SI Fold Operands"; }
115 void getAnalysisUsage(AnalysisUsage &AU) const override {
116 AU.setPreservesCFG();
117 MachineFunctionPass::getAnalysisUsage(AU);
121 } // End anonymous namespace.
123 INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
124 "SI Fold Operands", false, false)
126 char SIFoldOperands::ID = 0;
128 char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
130 // Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
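// For example, an immediate can only be folded into src2 of V_FMAC_F32_e64
// (whose dst is tied to src2) once the instruction is rewritten to
// V_FMA_F32_e64, which takes src2 as a separate, untied operand.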
131 static unsigned macToMad(unsigned Opc) {
132 switch (Opc) {
133 case AMDGPU::V_MAC_F32_e64:
134 return AMDGPU::V_MAD_F32_e64;
135 case AMDGPU::V_MAC_F16_e64:
136 return AMDGPU::V_MAD_F16_e64;
137 case AMDGPU::V_FMAC_F32_e64:
138 return AMDGPU::V_FMA_F32_e64;
139 case AMDGPU::V_FMAC_F16_e64:
140 return AMDGPU::V_FMA_F16_gfx9_e64;
141 case AMDGPU::V_FMAC_LEGACY_F32_e64:
142 return AMDGPU::V_FMA_LEGACY_F32_e64;
143 case AMDGPU::V_FMAC_F64_e64:
144 return AMDGPU::V_FMA_F64_e64;
146 return AMDGPU::INSTRUCTION_LIST_END;
149 // Wrapper around isInlineConstant that understands special cases when
150 // instruction types are replaced during operand folding.
151 static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
152 const MachineInstr &UseMI,
153 unsigned OpNo,
154 const MachineOperand &OpToFold) {
155 if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
156 return true;
158 unsigned Opc = UseMI.getOpcode();
159 unsigned NewOpc = macToMad(Opc);
160 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
161 // Special case for mac. Since this is replaced with mad when folded into
162 // src2, we need to check the legality for the final instruction.
163 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
164 if (static_cast<int>(OpNo) == Src2Idx) {
165 const MCInstrDesc &MadDesc = TII->get(NewOpc);
166 return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
170 return false;
173 // TODO: Add a heuristic for when the frame index might not fit in the addressing mode
174 // immediate offset, to avoid materializing it in loops.
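// Roughly, a frame index is considered foldable into the vaddr of a MUBUF
// scratch access, or into the saddr (or lone vaddr) of a FLAT scratch access;
// e.g. a V_MOV_B32 of %stack.0 feeding such an address operand can be folded
// so the access uses %stack.0 directly.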
175 static bool frameIndexMayFold(const SIInstrInfo *TII,
176 const MachineInstr &UseMI,
177 int OpNo,
178 const MachineOperand &OpToFold) {
179 if (!OpToFold.isFI())
180 return false;
182 if (TII->isMUBUF(UseMI))
183 return OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(),
184 AMDGPU::OpName::vaddr);
185 if (!TII->isFLATScratch(UseMI))
186 return false;
188 int SIdx = AMDGPU::getNamedOperandIdx(UseMI.getOpcode(),
189 AMDGPU::OpName::saddr);
190 if (OpNo == SIdx)
191 return true;
193 int VIdx = AMDGPU::getNamedOperandIdx(UseMI.getOpcode(),
194 AMDGPU::OpName::vaddr);
195 return OpNo == VIdx && SIdx == -1;
198 FunctionPass *llvm::createSIFoldOperandsPass() {
199 return new SIFoldOperands();
202 static bool updateOperand(FoldCandidate &Fold,
203 const SIInstrInfo &TII,
204 const TargetRegisterInfo &TRI,
205 const GCNSubtarget &ST) {
206 MachineInstr *MI = Fold.UseMI;
207 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
208 assert(Old.isReg());
210 if (Fold.isImm()) {
211 if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
212 !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
213 (!ST.hasDOTOpSelHazard() ||
214 !(MI->getDesc().TSFlags & SIInstrFlags::IsDOT)) &&
215 AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
216 ST.hasInv2PiInlineImm())) {
217 // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
218 // already set.
219 unsigned Opcode = MI->getOpcode();
220 int OpNo = MI->getOperandNo(&Old);
221 int ModIdx = -1;
222 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
223 ModIdx = AMDGPU::OpName::src0_modifiers;
224 else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
225 ModIdx = AMDGPU::OpName::src1_modifiers;
226 else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
227 ModIdx = AMDGPU::OpName::src2_modifiers;
228 assert(ModIdx != -1);
229 ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
230 MachineOperand &Mod = MI->getOperand(ModIdx);
231 unsigned Val = Mod.getImm();
232 if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
233 // Only apply the following transformation if that operand requires
234 // a packed immediate.
235 switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
236 case AMDGPU::OPERAND_REG_IMM_V2FP16:
237 case AMDGPU::OPERAND_REG_IMM_V2INT16:
238 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
239 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
240 // If upper part is all zero we do not need op_sel_hi.
241 if (!isUInt<16>(Fold.ImmToFold)) {
242 if (!(Fold.ImmToFold & 0xffff)) {
243 Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
244 Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
245 Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
246 return true;
248 Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
249 Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
250 return true;
252 break;
253 default:
254 break;
260 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
261 MachineBasicBlock *MBB = MI->getParent();
262 auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI, 16);
263 if (Liveness != MachineBasicBlock::LQR_Dead) {
264 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
265 return false;
268 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
269 int Op32 = Fold.getShrinkOpcode();
270 MachineOperand &Dst0 = MI->getOperand(0);
271 MachineOperand &Dst1 = MI->getOperand(1);
272 assert(Dst0.isDef() && Dst1.isDef());
274 bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
276 const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
277 Register NewReg0 = MRI.createVirtualRegister(Dst0RC);
279 MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
281 if (HaveNonDbgCarryUse) {
282 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
283 .addReg(AMDGPU::VCC, RegState::Kill);
286 // Keep the old instruction around to avoid breaking iterators, but
287 // replace it with a dummy instruction to remove uses.
289 // FIXME: We should not invert how this pass looks at operands to avoid
290 // this. Should track set of foldable movs instead of looking for uses
291 // when looking at a use.
292 Dst0.setReg(NewReg0);
293 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
294 MI->removeOperand(I);
295 MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));
297 if (Fold.isCommuted())
298 TII.commuteInstruction(*Inst32, false);
299 return true;
302 assert(!Fold.needsShrink() && "not handled");
304 if (Fold.isImm()) {
305 if (Old.isTied()) {
306 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
307 if (NewMFMAOpc == -1)
308 return false;
309 MI->setDesc(TII.get(NewMFMAOpc));
310 MI->untieRegOperand(0);
312 Old.ChangeToImmediate(Fold.ImmToFold);
313 return true;
316 if (Fold.isGlobal()) {
317 Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
318 Fold.OpToFold->getTargetFlags());
319 return true;
322 if (Fold.isFI()) {
323 Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
324 return true;
327 MachineOperand *New = Fold.OpToFold;
328 Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
329 Old.setIsUndef(New->isUndef());
330 return true;
333 static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
334 const MachineInstr *MI) {
335 for (auto Candidate : FoldList) {
336 if (Candidate.UseMI == MI)
337 return true;
339 return false;
342 static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
343 MachineInstr *MI, unsigned OpNo,
344 MachineOperand *FoldOp, bool Commuted = false,
345 int ShrinkOp = -1) {
346 // Skip additional folding on the same operand.
347 for (FoldCandidate &Fold : FoldList)
348 if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
349 return;
350 LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
351 << " operand " << OpNo << "\n " << *MI);
352 FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
355 static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
356 MachineInstr *MI, unsigned OpNo,
357 MachineOperand *OpToFold,
358 const SIInstrInfo *TII) {
359 if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
360 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
361 unsigned Opc = MI->getOpcode();
362 unsigned NewOpc = macToMad(Opc);
363 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
364 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
365 // to fold the operand.
366 MI->setDesc(TII->get(NewOpc));
367 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
368 if (FoldAsMAD) {
369 MI->untieRegOperand(OpNo);
370 return true;
372 MI->setDesc(TII->get(Opc));
375 // Special case for s_setreg_b32
376 if (OpToFold->isImm()) {
377 unsigned ImmOpc = 0;
378 if (Opc == AMDGPU::S_SETREG_B32)
379 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
380 else if (Opc == AMDGPU::S_SETREG_B32_mode)
381 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
382 if (ImmOpc) {
383 MI->setDesc(TII->get(ImmOpc));
384 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
385 return true;
389 // If we are already folding into another operand of MI, then
390 // we can't commute the instruction, otherwise we risk making the
391 // other fold illegal.
392 if (isUseMIInFoldList(FoldList, MI))
393 return false;
395 unsigned CommuteOpNo = OpNo;
397 // Operand is not legal, so try to commute the instruction to
398 // see if this makes it possible to fold.
399 unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
400 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
401 bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
403 if (CanCommute) {
404 if (CommuteIdx0 == OpNo)
405 CommuteOpNo = CommuteIdx1;
406 else if (CommuteIdx1 == OpNo)
407 CommuteOpNo = CommuteIdx0;
411 // One of the operands might be an Imm operand, and OpNo may refer to it after
412 // the call to commuteInstruction() below. Such situations are avoided
413 // here explicitly as OpNo must be a register operand to be a candidate
414 // for memory folding.
415 if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
416 !MI->getOperand(CommuteIdx1).isReg()))
417 return false;
419 if (!CanCommute ||
420 !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
421 return false;
423 if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
424 if ((Opc == AMDGPU::V_ADD_CO_U32_e64 ||
425 Opc == AMDGPU::V_SUB_CO_U32_e64 ||
426 Opc == AMDGPU::V_SUBREV_CO_U32_e64) && // FIXME
427 (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
428 MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
430 // Verify the other operand is a VGPR, otherwise we would violate the
431 // constant bus restriction.
432 unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
433 MachineOperand &OtherOp = MI->getOperand(OtherIdx);
434 if (!OtherOp.isReg() ||
435 !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
436 return false;
438 assert(MI->getOperand(1).isDef());
440 // Make sure to get the 32-bit version of the commuted opcode.
441 unsigned MaybeCommutedOpc = MI->getOpcode();
442 int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
444 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
445 return true;
448 TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
449 return false;
452 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true);
453 return true;
456 // Check the case where we might introduce a second constant operand to a
457 // scalar instruction
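// e.g. a SALU instruction can encode at most one literal; if the value to fold
// would need a literal encoding and another source operand is already a
// constant, the fold is rejected as unencodable.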
458 if (TII->isSALU(MI->getOpcode())) {
459 const MCInstrDesc &InstDesc = MI->getDesc();
460 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
461 const SIRegisterInfo &SRI = TII->getRegisterInfo();
463 // Fine if the operand can be encoded as an inline constant
464 if (TII->isLiteralConstantLike(*OpToFold, OpInfo)) {
465 if (!SRI.opCanUseInlineConstant(OpInfo.OperandType) ||
466 !TII->isInlineConstant(*OpToFold, OpInfo)) {
467 // Otherwise check for another constant
468 for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
469 auto &Op = MI->getOperand(i);
470 if (OpNo != i &&
471 TII->isLiteralConstantLike(Op, OpInfo)) {
472 return false;
479 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
480 return true;
483 // If the use operand doesn't care about the value, this may be an operand only
484 // used for register indexing, in which case it is unsafe to fold.
485 static bool isUseSafeToFold(const SIInstrInfo *TII,
486 const MachineInstr &MI,
487 const MachineOperand &UseMO) {
488 if (UseMO.isUndef() || TII->isSDWA(MI))
489 return false;
491 switch (MI.getOpcode()) {
492 case AMDGPU::V_MOV_B32_e32:
493 case AMDGPU::V_MOV_B32_e64:
494 case AMDGPU::V_MOV_B64_PSEUDO:
495 case AMDGPU::V_MOV_B64_e32:
496 case AMDGPU::V_MOV_B64_e64:
497 // Do not fold into an indirect mov.
498 return !MI.hasRegisterImplicitUseOperand(AMDGPU::M0);
501 return true;
502 //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
505 // Find a def of the UseReg, check if it is a reg_sequence and find the initializers
506 // for each subreg, tracing each back to a foldable inline immediate if possible.
507 // Returns true on success.
508 static bool getRegSeqInit(
509 SmallVectorImpl<std::pair<MachineOperand*, unsigned>> &Defs,
510 Register UseReg, uint8_t OpTy,
511 const SIInstrInfo *TII, const MachineRegisterInfo &MRI) {
512 MachineInstr *Def = MRI.getVRegDef(UseReg);
513 if (!Def || !Def->isRegSequence())
514 return false;
516 for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
517 MachineOperand *Sub = &Def->getOperand(I);
518 assert(Sub->isReg());
520 for (MachineInstr *SubDef = MRI.getVRegDef(Sub->getReg());
521 SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
522 !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
523 SubDef = MRI.getVRegDef(Sub->getReg())) {
524 MachineOperand *Op = &SubDef->getOperand(1);
525 if (Op->isImm()) {
526 if (TII->isInlineConstant(*Op, OpTy))
527 Sub = Op;
528 break;
530 if (!Op->isReg() || Op->getReg().isPhysical())
531 break;
532 Sub = Op;
535 Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
538 return true;
541 static bool tryToFoldACImm(const SIInstrInfo *TII,
542 const MachineOperand &OpToFold,
543 MachineInstr *UseMI,
544 unsigned UseOpIdx,
545 SmallVectorImpl<FoldCandidate> &FoldList) {
546 const MCInstrDesc &Desc = UseMI->getDesc();
547 const MCOperandInfo *OpInfo = Desc.OpInfo;
548 if (!OpInfo || UseOpIdx >= Desc.getNumOperands())
549 return false;
551 uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
552 if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
553 OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) &&
554 (OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST ||
555 OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST))
556 return false;
558 if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
559 TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
560 UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
561 return true;
564 if (!OpToFold.isReg())
565 return false;
567 Register UseReg = OpToFold.getReg();
568 if (!UseReg.isVirtual())
569 return false;
571 if (isUseMIInFoldList(FoldList, UseMI))
572 return false;
574 MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
576 // Maybe it is just a COPY of an immediate itself.
577 MachineInstr *Def = MRI.getVRegDef(UseReg);
578 MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
579 if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
580 MachineOperand &DefOp = Def->getOperand(1);
581 if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
582 TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
583 UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
584 return true;
588 SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
589 if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI))
590 return false;
592 int32_t Imm;
593 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
594 const MachineOperand *Op = Defs[I].first;
595 if (!Op->isImm())
596 return false;
598 auto SubImm = Op->getImm();
599 if (!I) {
600 Imm = SubImm;
601 if (!TII->isInlineConstant(*Op, OpTy) ||
602 !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
603 return false;
605 continue;
607 if (Imm != SubImm)
608 return false; // Can only fold splat constants
611 appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
612 return true;
615 void SIFoldOperands::foldOperand(
616 MachineOperand &OpToFold,
617 MachineInstr *UseMI,
618 int UseOpIdx,
619 SmallVectorImpl<FoldCandidate> &FoldList,
620 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
621 const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
623 if (!isUseSafeToFold(TII, *UseMI, UseOp))
624 return;
626 // FIXME: Fold operands with subregs.
627 if (UseOp.isReg() && OpToFold.isReg()) {
628 if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
629 return;
632 // Special case for REG_SEQUENCE: We can't fold literals into
633 // REG_SEQUENCE instructions, so we have to fold them into the
634 // uses of REG_SEQUENCE.
635 if (UseMI->isRegSequence()) {
636 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
637 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
639 for (auto &RSUse : make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) {
640 MachineInstr *RSUseMI = RSUse.getParent();
642 if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
643 RSUseMI->getOperandNo(&RSUse), FoldList))
644 continue;
646 if (RSUse.getSubReg() != RegSeqDstSubReg)
647 continue;
649 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,
650 CopiesToReplace);
653 return;
656 if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList))
657 return;
659 if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
660 // Verify that this is a stack access.
661 // FIXME: Should probably use stack pseudos before frame lowering.
663 if (TII->isMUBUF(*UseMI)) {
664 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
665 MFI->getScratchRSrcReg())
666 return;
668 // Ensure this is either relative to the current frame or the current
669 // wave.
670 MachineOperand &SOff =
671 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
672 if (!SOff.isImm() || SOff.getImm() != 0)
673 return;
676 // A frame index will resolve to a positive constant, so it should always be
677 // safe to fold the addressing mode, even pre-GFX9.
678 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
680 if (TII->isFLATScratch(*UseMI) &&
681 AMDGPU::getNamedOperandIdx(UseMI->getOpcode(),
682 AMDGPU::OpName::vaddr) != -1 &&
683 AMDGPU::getNamedOperandIdx(UseMI->getOpcode(),
684 AMDGPU::OpName::saddr) == -1) {
685 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(UseMI->getOpcode());
686 UseMI->setDesc(TII->get(NewOpc));
689 return;
692 bool FoldingImmLike =
693 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
695 if (FoldingImmLike && UseMI->isCopy()) {
696 Register DestReg = UseMI->getOperand(0).getReg();
697 Register SrcReg = UseMI->getOperand(1).getReg();
698 assert(SrcReg.isVirtual());
700 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
702 // Don't fold into a copy to a physical register with the same class. Doing
703 // so would interfere with the register coalescer's logic which would avoid
704 // redundant initializations.
705 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
706 return;
708 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
709 if (!DestReg.isPhysical()) {
710 if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
711 SmallVector<FoldCandidate, 4> CopyUses;
712 for (auto &Use : MRI->use_nodbg_operands(DestReg)) {
713 // There's no point trying to fold into an implicit operand.
714 if (Use.isImplicit())
715 continue;
717 CopyUses.emplace_back(Use.getParent(),
718 Use.getParent()->getOperandNo(&Use),
719 &UseMI->getOperand(1));
721 for (auto &F : CopyUses) {
722 foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace);
726 if (DestRC == &AMDGPU::AGPR_32RegClass &&
727 TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
728 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
729 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
730 CopiesToReplace.push_back(UseMI);
731 return;
735 // In order to fold immediates into copies, we need to change the
736 // copy to a MOV.
738 unsigned MovOp = TII->getMovOpcode(DestRC);
739 if (MovOp == AMDGPU::COPY)
740 return;
742 UseMI->setDesc(TII->get(MovOp));
743 MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
744 MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
745 while (ImpOpI != ImpOpE) {
746 MachineInstr::mop_iterator Tmp = ImpOpI;
747 ImpOpI++;
748 UseMI->removeOperand(UseMI->getOperandNo(Tmp));
750 CopiesToReplace.push_back(UseMI);
751 } else {
752 if (UseMI->isCopy() && OpToFold.isReg() &&
753 UseMI->getOperand(0).getReg().isVirtual() &&
754 !UseMI->getOperand(1).getSubReg()) {
755 LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
756 unsigned Size = TII->getOpSize(*UseMI, 1);
757 Register UseReg = OpToFold.getReg();
758 UseMI->getOperand(1).setReg(UseReg);
759 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
760 UseMI->getOperand(1).setIsKill(false);
761 CopiesToReplace.push_back(UseMI);
762 OpToFold.setIsKill(false);
764 // It is very tricky to store a value into an AGPR: v_accvgpr_write_b32
765 // can only accept a VGPR or an inline immediate. Recreate a reg_sequence with
766 // its initializers right here, so we will rematerialize immediates and
767 // avoid copies via different reg classes.
768 SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
769 if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
770 getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32, TII,
771 *MRI)) {
772 const DebugLoc &DL = UseMI->getDebugLoc();
773 MachineBasicBlock &MBB = *UseMI->getParent();
775 UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
776 for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
777 UseMI->removeOperand(I);
779 MachineInstrBuilder B(*MBB.getParent(), UseMI);
780 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
781 SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
782 for (unsigned I = 0; I < Size / 4; ++I) {
783 MachineOperand *Def = Defs[I].first;
784 TargetInstrInfo::RegSubRegPair CopyToVGPR;
785 if (Def->isImm() &&
786 TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
787 int64_t Imm = Def->getImm();
789 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
790 BuildMI(MBB, UseMI, DL,
791 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
792 B.addReg(Tmp);
793 } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
794 auto Src = getRegSubRegPair(*Def);
795 Def->setIsKill(false);
796 if (!SeenAGPRs.insert(Src)) {
797 // We cannot build a reg_sequence out of the same registers; they
798 // must be copied. Better to do it here, before copyPhysReg() creates
799 // several reads to do the AGPR->VGPR->AGPR copy.
800 CopyToVGPR = Src;
801 } else {
802 B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
803 Src.SubReg);
805 } else {
806 assert(Def->isReg());
807 Def->setIsKill(false);
808 auto Src = getRegSubRegPair(*Def);
810 // A direct copy from SGPR to AGPR is not possible. To avoid the creation
811 // of exploded SGPR->VGPR->AGPR copies in copyPhysReg() later,
812 // create the copy here and track whether we already have such a copy.
813 if (TRI->isSGPRReg(*MRI, Src.Reg)) {
814 CopyToVGPR = Src;
815 } else {
816 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
817 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
818 B.addReg(Tmp);
822 if (CopyToVGPR.Reg) {
823 Register Vgpr;
824 if (VGPRCopies.count(CopyToVGPR)) {
825 Vgpr = VGPRCopies[CopyToVGPR];
826 } else {
827 Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
828 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
829 VGPRCopies[CopyToVGPR] = Vgpr;
831 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
832 BuildMI(MBB, UseMI, DL,
833 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
834 B.addReg(Tmp);
837 B.addImm(Defs[I].second);
839 LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
840 return;
843 if (Size != 4)
844 return;
845 if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
846 TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
847 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
848 else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
849 TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
850 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
851 else if (ST->hasGFX90AInsts() &&
852 TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
853 TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
854 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
855 return;
858 unsigned UseOpc = UseMI->getOpcode();
859 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
860 (UseOpc == AMDGPU::V_READLANE_B32 &&
861 (int)UseOpIdx ==
862 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
863 // %vgpr = V_MOV_B32 imm
864 // %sgpr = V_READFIRSTLANE_B32 %vgpr
865 // =>
866 // %sgpr = S_MOV_B32 imm
867 if (FoldingImmLike) {
868 if (execMayBeModifiedBeforeUse(*MRI,
869 UseMI->getOperand(UseOpIdx).getReg(),
870 *OpToFold.getParent(),
871 *UseMI))
872 return;
874 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
876 if (OpToFold.isImm())
877 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
878 else
879 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
880 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
881 return;
884 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
885 if (execMayBeModifiedBeforeUse(*MRI,
886 UseMI->getOperand(UseOpIdx).getReg(),
887 *OpToFold.getParent(),
888 *UseMI))
889 return;
891 // %vgpr = COPY %sgpr0
892 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
893 // =>
894 // %sgpr1 = COPY %sgpr0
895 UseMI->setDesc(TII->get(AMDGPU::COPY));
896 UseMI->getOperand(1).setReg(OpToFold.getReg());
897 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
898 UseMI->getOperand(1).setIsKill(false);
899 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
900 return;
904 const MCInstrDesc &UseDesc = UseMI->getDesc();
906 // Don't fold into target independent nodes. Target independent opcodes
907 // don't have defined register classes.
908 if (UseDesc.isVariadic() ||
909 UseOp.isImplicit() ||
910 UseDesc.OpInfo[UseOpIdx].RegClass == -1)
911 return;
914 if (!FoldingImmLike) {
915 if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
916 // Don't fold if OpToFold doesn't hold an aligned register.
917 const TargetRegisterClass *RC =
918 TRI->getRegClassForReg(*MRI, OpToFold.getReg());
919 if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
920 unsigned SubReg = OpToFold.getSubReg();
921 const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg);
922 RC = TRI->getCompatibleSubRegClass(RC, SubRC, SubReg);
923 if (RC)
924 RC = SubRC;
927 if (!RC || !TRI->isProperlyAlignedRC(*RC))
928 return;
931 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
933 // FIXME: We could try to change the instruction from 64-bit to 32-bit
934 // to enable more folding opportunities. The shrink operands pass
935 // already does this.
936 return;
940 const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
941 const TargetRegisterClass *FoldRC =
942 TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
944 // Split 64-bit constants into 32-bits for folding.
945 if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
946 Register UseReg = UseOp.getReg();
947 const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
949 if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
950 return;
952 APInt Imm(64, OpToFold.getImm());
953 if (UseOp.getSubReg() == AMDGPU::sub0) {
954 Imm = Imm.getLoBits(32);
955 } else {
956 assert(UseOp.getSubReg() == AMDGPU::sub1);
957 Imm = Imm.getHiBits(32);
960 MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
961 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
962 return;
967 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
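// Evaluate a two-source bitwise or shift op whose operands are both known
// immediates, returning the folded 32-bit value in Result. Shift amounts are
// masked to 5 bits, matching the hardware's out-of-bounds behavior.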
970 static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
971 uint32_t LHS, uint32_t RHS) {
972 switch (Opcode) {
973 case AMDGPU::V_AND_B32_e64:
974 case AMDGPU::V_AND_B32_e32:
975 case AMDGPU::S_AND_B32:
976 Result = LHS & RHS;
977 return true;
978 case AMDGPU::V_OR_B32_e64:
979 case AMDGPU::V_OR_B32_e32:
980 case AMDGPU::S_OR_B32:
981 Result = LHS | RHS;
982 return true;
983 case AMDGPU::V_XOR_B32_e64:
984 case AMDGPU::V_XOR_B32_e32:
985 case AMDGPU::S_XOR_B32:
986 Result = LHS ^ RHS;
987 return true;
988 case AMDGPU::S_XNOR_B32:
989 Result = ~(LHS ^ RHS);
990 return true;
991 case AMDGPU::S_NAND_B32:
992 Result = ~(LHS & RHS);
993 return true;
994 case AMDGPU::S_NOR_B32:
995 Result = ~(LHS | RHS);
996 return true;
997 case AMDGPU::S_ANDN2_B32:
998 Result = LHS & ~RHS;
999 return true;
1000 case AMDGPU::S_ORN2_B32:
1001 Result = LHS | ~RHS;
1002 return true;
1003 case AMDGPU::V_LSHL_B32_e64:
1004 case AMDGPU::V_LSHL_B32_e32:
1005 case AMDGPU::S_LSHL_B32:
1006 // The instruction ignores the high bits for out of bounds shifts.
1007 Result = LHS << (RHS & 31);
1008 return true;
1009 case AMDGPU::V_LSHLREV_B32_e64:
1010 case AMDGPU::V_LSHLREV_B32_e32:
1011 Result = RHS << (LHS & 31);
1012 return true;
1013 case AMDGPU::V_LSHR_B32_e64:
1014 case AMDGPU::V_LSHR_B32_e32:
1015 case AMDGPU::S_LSHR_B32:
1016 Result = LHS >> (RHS & 31);
1017 return true;
1018 case AMDGPU::V_LSHRREV_B32_e64:
1019 case AMDGPU::V_LSHRREV_B32_e32:
1020 Result = RHS >> (LHS & 31);
1021 return true;
1022 case AMDGPU::V_ASHR_I32_e64:
1023 case AMDGPU::V_ASHR_I32_e32:
1024 case AMDGPU::S_ASHR_I32:
1025 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1026 return true;
1027 case AMDGPU::V_ASHRREV_I32_e64:
1028 case AMDGPU::V_ASHRREV_I32_e32:
1029 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1030 return true;
1031 default:
1032 return false;
1036 static unsigned getMovOpc(bool IsScalar) {
1037 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1040 /// Remove any leftover implicit operands from mutating the instruction. e.g.
1041 /// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1042 /// anymore.
1043 static void stripExtraCopyOperands(MachineInstr &MI) {
1044 const MCInstrDesc &Desc = MI.getDesc();
1045 unsigned NumOps = Desc.getNumOperands() +
1046 Desc.getNumImplicitUses() +
1047 Desc.getNumImplicitDefs();
1049 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1050 MI.removeOperand(I);
1053 static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1054 MI.setDesc(NewDesc);
1055 stripExtraCopyOperands(MI);
1058 static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
1059 MachineOperand &Op) {
1060 if (Op.isReg()) {
1061 // If this has a subregister, it obviously is a register source.
1062 if (Op.getSubReg() != AMDGPU::NoSubRegister || !Op.getReg().isVirtual())
1063 return &Op;
1065 MachineInstr *Def = MRI.getVRegDef(Op.getReg());
1066 if (Def && Def->isMoveImmediate()) {
1067 MachineOperand &ImmSrc = Def->getOperand(1);
1068 if (ImmSrc.isImm())
1069 return &ImmSrc;
1073 return &Op;
1076 // Try to simplify operations with a constant that may appear after instruction
1077 // selection.
1078 // TODO: See if a frame index with a fixed offset can fold.
1079 static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII,
1080 MachineInstr *MI) {
1081 unsigned Opc = MI->getOpcode();
1083 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1084 if (Src0Idx == -1)
1085 return false;
1086 MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
1088 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1089 Opc == AMDGPU::S_NOT_B32) &&
1090 Src0->isImm()) {
1091 MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
1092 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1093 return true;
1096 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1097 if (Src1Idx == -1)
1098 return false;
1099 MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
1101 if (!Src0->isImm() && !Src1->isImm())
1102 return false;
1104 // and k0, k1 -> v_mov_b32 (k0 & k1)
1105 // or k0, k1 -> v_mov_b32 (k0 | k1)
1106 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1107 if (Src0->isImm() && Src1->isImm()) {
1108 int32_t NewImm;
1109 if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
1110 return false;
1112 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1113 bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
1115 // Be careful to change the right operand, src0 may belong to a different
1116 // instruction.
1117 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1118 MI->removeOperand(Src1Idx);
1119 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1120 return true;
1123 if (!MI->isCommutable())
1124 return false;
1126 if (Src0->isImm() && !Src1->isImm()) {
1127 std::swap(Src0, Src1);
1128 std::swap(Src0Idx, Src1Idx);
1131 int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
1132 if (Opc == AMDGPU::V_OR_B32_e64 ||
1133 Opc == AMDGPU::V_OR_B32_e32 ||
1134 Opc == AMDGPU::S_OR_B32) {
1135 if (Src1Val == 0) {
1136 // y = or x, 0 => y = copy x
1137 MI->removeOperand(Src1Idx);
1138 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1139 } else if (Src1Val == -1) {
1140 // y = or x, -1 => y = v_mov_b32 -1
1141 MI->removeOperand(Src1Idx);
1142 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1143 } else
1144 return false;
1146 return true;
1149 if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
1150 MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
1151 MI->getOpcode() == AMDGPU::S_AND_B32) {
1152 if (Src1Val == 0) {
1153 // y = and x, 0 => y = v_mov_b32 0
1154 MI->removeOperand(Src0Idx);
1155 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1156 } else if (Src1Val == -1) {
1157 // y = and x, -1 => y = copy x
1158 MI->removeOperand(Src1Idx);
1159 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1160 stripExtraCopyOperands(*MI);
1161 } else
1162 return false;
1164 return true;
1167 if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
1168 MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
1169 MI->getOpcode() == AMDGPU::S_XOR_B32) {
1170 if (Src1Val == 0) {
1171 // y = xor x, 0 => y = copy x
1172 MI->removeOperand(Src1Idx);
1173 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1174 return true;
1178 return false;
1181 // Try to fold an instruction into a simpler one
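// e.g. a V_CNDMASK_B32 whose two sources are the same register (or the same
// materialized immediate) and which has no source modifiers set selects the
// same value either way, so it can be rewritten as a COPY / V_MOV_B32.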
1182 bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
1183 unsigned Opc = MI.getOpcode();
1184 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1185 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1186 return false;
1188 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1189 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1190 if (!Src1->isIdenticalTo(*Src0)) {
1191 auto *Src0Imm = getImmOrMaterializedImm(*MRI, *Src0);
1192 auto *Src1Imm = getImmOrMaterializedImm(*MRI, *Src1);
1193 if (!Src1Imm->isIdenticalTo(*Src0Imm))
1194 return false;
1197 int Src1ModIdx =
1198 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1199 int Src0ModIdx =
1200 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1201 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1202 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1203 return false;
1205 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1206 auto &NewDesc =
1207 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1208 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1209 if (Src2Idx != -1)
1210 MI.removeOperand(Src2Idx);
1211 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1212 if (Src1ModIdx != -1)
1213 MI.removeOperand(Src1ModIdx);
1214 if (Src0ModIdx != -1)
1215 MI.removeOperand(Src0ModIdx);
1216 mutateCopyOp(MI, NewDesc);
1217 LLVM_DEBUG(dbgs() << MI);
1218 return true;
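// Fold away a v_and_b32 with 0xffff when the instruction defining the other
// source already writes zeros to the high 16 bits of its result (per
// zeroesHigh16BitsOfDest), replacing all uses of the and with that source.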
1221 bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
1222 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1223 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1224 return false;
1226 MachineOperand *Src0 = getImmOrMaterializedImm(*MRI, MI.getOperand(1));
1227 if (!Src0->isImm() || Src0->getImm() != 0xffff)
1228 return false;
1230 Register Src1 = MI.getOperand(2).getReg();
1231 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1232 if (ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode())) {
1233 Register Dst = MI.getOperand(0).getReg();
1234 MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
1235 MI.eraseFromParent();
1236 return true;
1239 return false;
1242 bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
1243 MachineOperand &OpToFold) const {
1244 // We need to mutate the operands of new mov instructions to add implicit
1245 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1246 // this.
1247 SmallVector<MachineInstr *, 4> CopiesToReplace;
1248 SmallVector<FoldCandidate, 4> FoldList;
1249 MachineOperand &Dst = MI.getOperand(0);
1250 bool Changed = false;
1252 if (OpToFold.isImm()) {
1253 for (auto &UseMI :
1254 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1255 // Folding the immediate may reveal operations that can be constant
1256 // folded or replaced with a copy. This can happen for example after
1257 // frame indices are lowered to constants or from splitting 64-bit
1258 // constants.
1260 // We may also encounter cases where one or both operands are
1261 // immediates materialized into a register, which would ordinarily not
1262 // be folded due to multiple uses or operand constraints.
1263 if (tryConstantFoldOp(*MRI, TII, &UseMI)) {
1264 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1265 Changed = true;
1270 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1271 if (FoldingImm) {
1272 unsigned NumLiteralUses = 0;
1273 MachineOperand *NonInlineUse = nullptr;
1274 int NonInlineUseOpNo = -1;
1276 for (auto &Use :
1277 make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) {
1278 MachineInstr *UseMI = Use.getParent();
1279 unsigned OpNo = UseMI->getOperandNo(&Use);
1281 // Try to fold any inline immediate uses, and then only fold other
1282 // constants if they have one use.
1284 // The legality of the inline immediate must be checked based on the use
1285 // operand, not the defining instruction, because 32-bit instructions
1286 // with 32-bit inline immediate sources may be used to materialize
1287 // constants used in 16-bit operands.
1289 // e.g. it is unsafe to fold:
1290 // s_mov_b32 s0, 1.0 // materializes 0x3f800000
1291 // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
1293 // Folding immediates with more than one use will increase program size.
1294 // FIXME: This will also reduce register usage, which may be better
1295 // in some cases. A better heuristic is needed.
1296 if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
1297 foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
1298 } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
1299 foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
1300 } else {
1301 if (++NumLiteralUses == 1) {
1302 NonInlineUse = &Use;
1303 NonInlineUseOpNo = OpNo;
1308 if (NumLiteralUses == 1) {
1309 MachineInstr *UseMI = NonInlineUse->getParent();
1310 foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
1312 } else {
1313 // Folding register.
1314 SmallVector <MachineOperand *, 4> UsesToProcess;
1315 for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
1316 UsesToProcess.push_back(&Use);
1317 for (auto U : UsesToProcess) {
1318 MachineInstr *UseMI = U->getParent();
1320 foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U),
1321 FoldList, CopiesToReplace);
1325 if (CopiesToReplace.empty() && FoldList.empty())
1326 return Changed;
1328 MachineFunction *MF = MI.getParent()->getParent();
1329 // Make sure we add EXEC uses to any new v_mov instructions created.
1330 for (MachineInstr *Copy : CopiesToReplace)
1331 Copy->addImplicitDefUseOperands(*MF);
1333 for (FoldCandidate &Fold : FoldList) {
1334 assert(!Fold.isReg() || Fold.OpToFold);
1335 if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
1336 Register Reg = Fold.OpToFold->getReg();
1337 MachineInstr *DefMI = Fold.OpToFold->getParent();
1338 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1339 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1340 continue;
1342 if (updateOperand(Fold, *TII, *TRI, *ST)) {
1343 // Clear kill flags.
1344 if (Fold.isReg()) {
1345 assert(Fold.OpToFold && Fold.OpToFold->isReg());
1346 // FIXME: Probably shouldn't bother trying to fold if not an
1347 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1348 // copies.
1349 MRI->clearKillFlags(Fold.OpToFold->getReg());
1351 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1352 << static_cast<int>(Fold.UseOpNo) << " of "
1353 << *Fold.UseMI);
1354 } else if (Fold.isCommuted()) {
1355 // Restore the instruction's original operand order if the fold has failed.
1356 TII->commuteInstruction(*Fold.UseMI, false);
1359 return true;
1362 // Clamp patterns are canonically selected to v_max_* instructions, so only
1363 // handle them.
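// e.g.
//   %v = V_ADD_F32_e64 %a, %b
//   %c = V_MAX_F32_e64 %v, %v, clamp
// folds to a single V_ADD_F32_e64 %a, %b with the clamp bit set, provided %v
// has no other non-debug users and the max carries no other modifiers.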
1364 const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
1365 unsigned Op = MI.getOpcode();
1366 switch (Op) {
1367 case AMDGPU::V_MAX_F32_e64:
1368 case AMDGPU::V_MAX_F16_e64:
1369 case AMDGPU::V_MAX_F64_e64:
1370 case AMDGPU::V_PK_MAX_F16: {
1371 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1372 return nullptr;
1374 // Make sure sources are identical.
1375 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1376 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1377 if (!Src0->isReg() || !Src1->isReg() ||
1378 Src0->getReg() != Src1->getReg() ||
1379 Src0->getSubReg() != Src1->getSubReg() ||
1380 Src0->getSubReg() != AMDGPU::NoSubRegister)
1381 return nullptr;
1383 // Can't fold up if we have modifiers.
1384 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1385 return nullptr;
1387 unsigned Src0Mods
1388 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1389 unsigned Src1Mods
1390 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1392 // Having a 0 op_sel_hi would require swizzling the output in the source
1393 // instruction, which we can't do.
1394 unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1395 : 0u;
1396 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1397 return nullptr;
1398 return Src0;
1400 default:
1401 return nullptr;
1405 // FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1406 bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
1407 const MachineOperand *ClampSrc = isClamp(MI);
1408 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
1409 return false;
1411 MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1413 // The type of clamp must be compatible.
1414 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1415 return false;
1417 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1418 if (!DefClamp)
1419 return false;
1421 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
1423 // Clamp is applied after omod, so it is OK if omod is set.
1424 DefClamp->setImm(1);
1425 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1426 MI.eraseFromParent();
1428 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1429 // instruction, so we might as well convert it to the more flexible VOP3-only
1430 // mad/fma form.
1431 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1432 Def->eraseFromParent();
1434 return true;
1437 static int getOModValue(unsigned Opc, int64_t Val) {
1438 switch (Opc) {
1439 case AMDGPU::V_MUL_F64_e64: {
1440 switch (Val) {
1441 case 0x3fe0000000000000: // 0.5
1442 return SIOutMods::DIV2;
1443 case 0x4000000000000000: // 2.0
1444 return SIOutMods::MUL2;
1445 case 0x4010000000000000: // 4.0
1446 return SIOutMods::MUL4;
1447 default:
1448 return SIOutMods::NONE;
1451 case AMDGPU::V_MUL_F32_e64: {
1452 switch (static_cast<uint32_t>(Val)) {
1453 case 0x3f000000: // 0.5
1454 return SIOutMods::DIV2;
1455 case 0x40000000: // 2.0
1456 return SIOutMods::MUL2;
1457 case 0x40800000: // 4.0
1458 return SIOutMods::MUL4;
1459 default:
1460 return SIOutMods::NONE;
1463 case AMDGPU::V_MUL_F16_e64: {
1464 switch (static_cast<uint16_t>(Val)) {
1465 case 0x3800: // 0.5
1466 return SIOutMods::DIV2;
1467 case 0x4000: // 2.0
1468 return SIOutMods::MUL2;
1469 case 0x4400: // 4.0
1470 return SIOutMods::MUL4;
1471 default:
1472 return SIOutMods::NONE;
1475 default:
1476 llvm_unreachable("invalid mul opcode");
1480 // FIXME: Does this really not support denormals with f16?
1481 // FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1482 // handled, so will anything other than that break?
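// Match a multiply of a single-use value by 0.5, 2.0 or 4.0 (or the
// canonicalized fadd x, x for 2.0) with no other modifiers, so the scaling can
// instead be expressed through the omod output modifier on the instruction
// that defines the value.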
1483 std::pair<const MachineOperand *, int>
1484 SIFoldOperands::isOMod(const MachineInstr &MI) const {
1485 unsigned Op = MI.getOpcode();
1486 switch (Op) {
1487 case AMDGPU::V_MUL_F64_e64:
1488 case AMDGPU::V_MUL_F32_e64:
1489 case AMDGPU::V_MUL_F16_e64: {
1490 // If output denormals are enabled, omod is ignored.
1491 if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
1492 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64) &&
1493 MFI->getMode().FP64FP16OutputDenormals))
1494 return std::make_pair(nullptr, SIOutMods::NONE);
1496 const MachineOperand *RegOp = nullptr;
1497 const MachineOperand *ImmOp = nullptr;
1498 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1499 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1500 if (Src0->isImm()) {
1501 ImmOp = Src0;
1502 RegOp = Src1;
1503 } else if (Src1->isImm()) {
1504 ImmOp = Src1;
1505 RegOp = Src0;
1506 } else
1507 return std::make_pair(nullptr, SIOutMods::NONE);
1509 int OMod = getOModValue(Op, ImmOp->getImm());
1510 if (OMod == SIOutMods::NONE ||
1511 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1512 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1513 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1514 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1515 return std::make_pair(nullptr, SIOutMods::NONE);
1517 return std::make_pair(RegOp, OMod);
1519 case AMDGPU::V_ADD_F64_e64:
1520 case AMDGPU::V_ADD_F32_e64:
1521 case AMDGPU::V_ADD_F16_e64: {
1522 // If output denormals are enabled, omod is ignored.
1523 if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
1524 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64) &&
1525 MFI->getMode().FP64FP16OutputDenormals))
1526 return std::make_pair(nullptr, SIOutMods::NONE);
1528 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1529 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1530 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1532 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1533 Src0->getSubReg() == Src1->getSubReg() &&
1534 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1535 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1536 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1537 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1538 return std::make_pair(Src0, SIOutMods::MUL2);
1540 return std::make_pair(nullptr, SIOutMods::NONE);
1542 default:
1543 return std::make_pair(nullptr, SIOutMods::NONE);
1547 // FIXME: Does this need to check IEEE bit on function?
1548 bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
1549 const MachineOperand *RegOp;
1550 int OMod;
1551 std::tie(RegOp, OMod) = isOMod(MI);
1552 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1553 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1554 !MRI->hasOneNonDBGUser(RegOp->getReg()))
1555 return false;
1557 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1558 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1559 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1560 return false;
1562 // Clamp is applied after omod. If the source already has clamp set, don't
1563 // fold it.
1564 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1565 return false;
1567 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
1569 DefOMod->setImm(OMod);
1570 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1571 MI.eraseFromParent();
1573 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1574 // instruction, so we might as well convert it to the more flexible VOP3-only
1575 // mad/fma form.
1576 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1577 Def->eraseFromParent();
1579 return true;
1582 // Try to fold a reg_sequence with vgpr output and agpr inputs into an
1583 // instruction which can take an agpr. So far that means a store.
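// e.g. a vreg REG_SEQUENCE assembled from AGPRs (or from COPYs of AGPRs) whose
// only use is an operand that also accepts AGPRs is rebuilt as an areg
// REG_SEQUENCE, making the AGPR->VGPR copies unnecessary.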
1584 bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
1585 assert(MI.isRegSequence());
1586 auto Reg = MI.getOperand(0).getReg();
1588 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
1589 !MRI->hasOneNonDBGUse(Reg))
1590 return false;
1592 SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
1593 if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER, TII, *MRI))
1594 return false;
1596 for (auto &Def : Defs) {
1597 const auto *Op = Def.first;
1598 if (!Op->isReg())
1599 return false;
1600 if (TRI->isAGPR(*MRI, Op->getReg()))
1601 continue;
1602 // Maybe this is a COPY from AREG
1603 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
1604 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
1605 return false;
1606 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
1607 return false;
1610 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
1611 MachineInstr *UseMI = Op->getParent();
1612 while (UseMI->isCopy() && !Op->getSubReg()) {
1613 Reg = UseMI->getOperand(0).getReg();
1614 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
1615 return false;
1616 Op = &*MRI->use_nodbg_begin(Reg);
1617 UseMI = Op->getParent();
1620 if (Op->getSubReg())
1621 return false;
1623 unsigned OpIdx = Op - &UseMI->getOperand(0);
1624 const MCInstrDesc &InstDesc = UseMI->getDesc();
1625 const TargetRegisterClass *OpRC =
1626 TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
1627 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
1628 return false;
1630 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
1631 auto Dst = MRI->createVirtualRegister(NewDstRC);
1632 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
1633 TII->get(AMDGPU::REG_SEQUENCE), Dst);
1635 for (unsigned I = 0; I < Defs.size(); ++I) {
1636 MachineOperand *Def = Defs[I].first;
1637 Def->setIsKill(false);
1638 if (TRI->isAGPR(*MRI, Def->getReg())) {
1639 RS.add(*Def);
1640 } else { // This is a copy
1641 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
1642 SubDef->getOperand(1).setIsKill(false);
1643 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
1645 RS.addImm(Defs[I].second);
1648 Op->setReg(Dst);
1649 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
1650 Op->setReg(Reg);
1651 RS->eraseFromParent();
1652 return false;
1655 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
1657 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
1658 // in which case we can erase them all later in runOnMachineFunction.
1659 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
1660 MI.eraseFromParent();
1661 return true;
1664 // Try to hoist an AGPR to VGPR copy out of the loop across an LCSSA PHI.
1665 // This should allow folding of an AGPR into a consumer which may support it.
1666 // I.e.:
1668 // loop: // loop:
1669 // %1:vreg = COPY %0:areg // exit:
1670 // exit: => // %1:areg = PHI %0:areg, %loop
1671 // %2:vreg = PHI %1:vreg, %loop // %2:vreg = COPY %1:areg
1672 bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) {
1673 assert(PHI.isPHI());
1675 if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI
1676 return false;
1678 Register PhiIn = PHI.getOperand(1).getReg();
1679 Register PhiOut = PHI.getOperand(0).getReg();
1680 if (PHI.getOperand(1).getSubReg() ||
1681 !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut))
1682 return false;
1684 // A single use should not matter for correctness, but if it has another use
1685 // inside the loop we may perform the copy twice in the worst case.
1686 if (!MRI->hasOneNonDBGUse(PhiIn))
1687 return false;
1689 MachineInstr *Copy = MRI->getVRegDef(PhiIn);
1690 if (!Copy || !Copy->isCopy())
1691 return false;
1693 Register CopyIn = Copy->getOperand(1).getReg();
1694 if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg())
1695 return false;
1697 const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn);
1698 Register NewReg = MRI->createVirtualRegister(ARC);
1699 PHI.getOperand(1).setReg(CopyIn);
1700 PHI.getOperand(0).setReg(NewReg);
1702 MachineBasicBlock *MBB = PHI.getParent();
1703 BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(),
1704 TII->get(AMDGPU::COPY), PhiOut)
1705 .addReg(NewReg, RegState::Kill);
1706 Copy->eraseFromParent(); // We know this copy had a single use.
1708 LLVM_DEBUG(dbgs() << "Folded " << PHI);
1710 return true;
1713 // Attempt to convert VGPR load to an AGPR load.
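// If every transitive user of the loaded VGPR merely copies or re-packs it
// into AGPRs, switch the load's result (and the intermediate registers) to the
// equivalent AGPR register class.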
1714 bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
1715 assert(MI.mayLoad());
1716 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
1717 return false;
1719 MachineOperand &Def = MI.getOperand(0);
1720 if (!Def.isDef())
1721 return false;
1723 Register DefReg = Def.getReg();
1725 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
1726 return false;
1728 SmallVector<const MachineInstr*, 8> Users;
1729 SmallVector<Register, 8> MoveRegs;
1730 for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg)) {
1731 Users.push_back(&I);
1733 if (Users.empty())
1734 return false;
1736 // Check that all uses are a copy to an agpr or a reg_sequence producing an agpr.
1737 while (!Users.empty()) {
1738 const MachineInstr *I = Users.pop_back_val();
1739 if (!I->isCopy() && !I->isRegSequence())
1740 return false;
1741 Register DstReg = I->getOperand(0).getReg();
1742 if (TRI->isAGPR(*MRI, DstReg))
1743 continue;
1744 MoveRegs.push_back(DstReg);
1745 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg)) {
1746 Users.push_back(&U);
1750 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
1751 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
1752 if (!TII->isOperandLegal(MI, 0, &Def)) {
1753 MRI->setRegClass(DefReg, RC);
1754 return false;
1757 while (!MoveRegs.empty()) {
1758 Register Reg = MoveRegs.pop_back_val();
1759 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
1762 LLVM_DEBUG(dbgs() << "Folded " << MI);
1764 return true;
1767 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
1768 if (skipFunction(MF.getFunction()))
1769 return false;
1771 MRI = &MF.getRegInfo();
1772 ST = &MF.getSubtarget<GCNSubtarget>();
1773 TII = ST->getInstrInfo();
1774 TRI = &TII->getRegisterInfo();
1775 MFI = MF.getInfo<SIMachineFunctionInfo>();
1777 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
1778 // correctly handle signed zeros.
1780 // FIXME: Also need to check strictfp
1781 bool IsIEEEMode = MFI->getMode().IEEE;
1782 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
1784 bool Changed = false;
1785 for (MachineBasicBlock *MBB : depth_first(&MF)) {
1786 MachineOperand *CurrentKnownM0Val = nullptr;
1787 for (auto &MI : make_early_inc_range(*MBB)) {
1788 Changed |= tryFoldCndMask(MI);
1790 if (tryFoldZeroHighBits(MI)) {
1791 Changed = true;
1792 continue;
1795 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
1796 Changed = true;
1797 continue;
1800 if (MI.isPHI() && tryFoldLCSSAPhi(MI)) {
1801 Changed = true;
1802 continue;
1805 if (MI.mayLoad() && tryFoldLoad(MI)) {
1806 Changed = true;
1807 continue;
1810 if (!TII->isFoldableCopy(MI)) {
1811 // Saw an unknown clobber of m0, so we no longer know what it is.
1812 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
1813 CurrentKnownM0Val = nullptr;
1815 // TODO: Omod might be OK if there is NSZ only on the source
1816 // instruction, and not the omod multiply.
1817 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
1818 !tryFoldOMod(MI))
1819 Changed |= tryFoldClamp(MI);
1821 continue;
1824 // Specially track simple redefs of m0 to the same value in a block, so we
1825 // can erase the later ones.
1826 if (MI.getOperand(0).getReg() == AMDGPU::M0) {
1827 MachineOperand &NewM0Val = MI.getOperand(1);
1828 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1829 MI.eraseFromParent();
1830 Changed = true;
1831 continue;
1834 // We aren't tracking other physical registers
1835 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical()) ?
1836 nullptr : &NewM0Val;
1837 continue;
1840 MachineOperand &OpToFold = MI.getOperand(1);
1841 bool FoldingImm =
1842 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1844 // FIXME: We could also be folding things like TargetIndexes.
1845 if (!FoldingImm && !OpToFold.isReg())
1846 continue;
1848 if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
1849 continue;
1851 // Prevent folding operands backwards in the function. For example,
1852 // the COPY opcode must not be replaced by 1 in this example:
1854 // %3 = COPY %vgpr0; VGPR_32:%3
1855 // ...
1856 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1857 if (!MI.getOperand(0).getReg().isVirtual())
1858 continue;
1860 Changed |= foldInstOperand(MI, OpToFold);
1862 // If we managed to fold all uses of this copy then we might as well
1863 // delete it now.
1864 // The only reason we need to follow chains of copies here is that
1865 // tryFoldRegSequence looks forward through copies before folding a
1866 // REG_SEQUENCE into its eventual users.
1867 auto *InstToErase = &MI;
1868 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1869 auto &SrcOp = InstToErase->getOperand(1);
1870 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
1871 InstToErase->eraseFromParent();
1872 Changed = true;
1873 InstToErase = nullptr;
1874 if (!SrcReg || SrcReg.isPhysical())
1875 break;
1876 InstToErase = MRI->getVRegDef(SrcReg);
1877 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
1878 break;
1880 if (InstToErase && InstToErase->isRegSequence() &&
1881 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1882 InstToErase->eraseFromParent();
1883 Changed = true;
1887 return Changed;