//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
// operand. If any of the use instructions cannot be combined with the mov the
// whole sequence is reverted.
//
// $old = ...
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
//                            dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
// $res = VALU $dpp_value [, src1]
//
// to
//
// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
//                 dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
//
// Combining rules:
//
// if $row_mask and $bank_mask are fully enabled (0xF) and
//    $bound_ctrl==DPP_BOUND_ZERO or $old==0
// -> $combined_old = undef,
//    $combined_bound_ctrl = DPP_BOUND_ZERO
//
// if the VALU op is binary and
//    $bound_ctrl==DPP_BOUND_OFF and
//    $old==identity value (immediate) for the VALU op
// -> $combined_old = src1,
//    $combined_bound_ctrl = DPP_BOUND_OFF
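//
// For example (an illustration of the second rule; the identity of
// V_ADD_U32 is 0, so 0 + x == x for inactive lanes):
//
//   $dpp_value = V_MOV_B32_dpp $old(=0), $vgpr_from_other_lane, ...
//   $res = V_ADD_U32_e32 $dpp_value, $src1
// ->
//   $res = V_ADD_U32_dpp $src1 /*combined_old*/, $vgpr_from_other_lane,
//          $src1, ..., bound_ctrl=DPP_BOUND_OFF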
//
// Otherwise the combine is cancelled.
//
// The mov_dpp instruction should reside in the same BB as all its uses.
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Pass.h"
#include <cassert>
#include <limits>

using namespace llvm;

#define DEBUG_TYPE "gcn-dpp-combine"

STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");

namespace {

class GCNDPPCombine : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;

  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI,
                              MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              MachineOperand *OldOpnd,
                              bool CombBCZ) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI,
                              MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              bool CombBCZ) const;

  bool hasNoImmOrEqual(MachineInstr &MI,
                       unsigned OpndName,
                       int64_t Value,
                       int64_t Mask = -1) const;

  bool combineDPPMov(MachineInstr &MI) const;

public:
  static char ID;

  GCNDPPCombine() : MachineFunctionPass(ID) {
    initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN DPP Combine"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)

char GCNDPPCombine::ID = 0;

char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;

FunctionPass *llvm::createGCNDPPCombinePass() {
  return new GCNDPPCombine();
}
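
// Maps the opcode of a VALU instruction to its DPP variant: first try a
// direct DPP mapping; failing that, shrink a VOP3 opcode to its e32
// equivalent and map that. Returns -1 if no DPP variant exists.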
static int getDPPOp(unsigned Op) {
  auto DPP32 = AMDGPU::getDPPOp32(Op);
  if (DPP32 != -1)
    return DPP32;

  auto E32 = AMDGPU::getVOPe32(Op);
  return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1;
}

// tracks the register operand definition and returns:
// 1. immediate operand used to initialize the register if found
// 2. nullptr if the register operand is undef
// 3. the operand itself otherwise
MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
  auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
  if (!Def)
    return nullptr;

  switch (Def->getOpcode()) {
  default: break;
  case AMDGPU::IMPLICIT_DEF:
    return nullptr;
  case AMDGPU::COPY:
  case AMDGPU::V_MOV_B32_e32: {
    auto &Op1 = Def->getOperand(1);
    if (Op1.isImm())
      return &Op1;
    break;
  }
  }
  return &OldOpnd;
}
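
// Builds the DPP variant of OrigMI: vdst and the combined old value come
// first, then src0 taken from the DPP mov, the remaining sources and
// modifiers from OrigMI, and finally the dpp_ctrl/row_mask/bank_mask/
// bound_ctrl controls from the mov. Returns nullptr on failure.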
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair CombOldVGPR,
                                           bool CombBCZ) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);

  auto OrigOp = OrigMI.getOpcode();
  auto DPPOp = getDPPOp(OrigOp);
  if (DPPOp == -1) {
    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
    return nullptr;
  }

  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
                         OrigMI.getDebugLoc(), TII->get(DPPOp));
  bool Fail = false;
  do {
    auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
    assert(Dst);
    DPPInst.add(*Dst);
    int NumOperands = 1;

    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
    if (OldIdx != -1) {
      assert(OldIdx == NumOperands);
      assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
      auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
      DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
                     CombOldVGPR.SubReg);
      ++NumOperands;
    } else {
      // TODO: this discards MAC/FMA instructions for now, let's add it later
      LLVM_DEBUG(dbgs() << "  failed: no old operand in DPP instruction,"
                           " TBD\n");
      Fail = true;
      break;
    }

    if (auto *Mod0 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src0_modifiers)) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                AMDGPU::OpName::src0_modifiers));
      assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
      DPPInst.addImm(Mod0->getImm());
      ++NumOperands;
    } else if (AMDGPU::getNamedOperandIdx(DPPOp,
                   AMDGPU::OpName::src0_modifiers) != -1) {
      DPPInst.addImm(0);
      ++NumOperands;
    }

    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
    assert(Src0);
    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
      Fail = true;
      break;
    }
    DPPInst.add(*Src0);
    DPPInst->getOperand(NumOperands).setIsKill(false);
    ++NumOperands;

    if (auto *Mod1 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src1_modifiers)) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                AMDGPU::OpName::src1_modifiers));
      assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
      DPPInst.addImm(Mod1->getImm());
      ++NumOperands;
    } else if (AMDGPU::getNamedOperandIdx(DPPOp,
                   AMDGPU::OpName::src1_modifiers) != -1) {
      DPPInst.addImm(0);
      ++NumOperands;
    }

    if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src1);
      ++NumOperands;
    }

    if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src2);
    }

    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
    DPPInst.addImm(CombBCZ ? 1 : 0);
  } while (false);

  if (Fail) {
    DPPInst.getInstr()->eraseFromParent();
    return nullptr;
  }
  LLVM_DEBUG(dbgs() << "  combined: " << *DPPInst.getInstr());
  return DPPInst.getInstr();
}
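
// Returns true if the immediate in OldOpnd is the identity element for
// OrigMIOp, i.e. "x op imm == x" for every x: 0 for add/or/xor/subrev/umax,
// all-ones for and/umin, INT32_MAX for smin, INT32_MIN for smax, and 1 for
// the 24-bit multiplies.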
static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
  assert(OldOpnd->isImm());
  switch (OrigMIOp) {
  default: break;
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_I32_e32:
  case AMDGPU::V_ADD_I32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_SUBREV_U32_e32:
  case AMDGPU::V_SUBREV_U32_e64:
  case AMDGPU::V_SUBREV_I32_e32:
  case AMDGPU::V_SUBREV_I32_e64:
  case AMDGPU::V_MAX_U32_e32:
  case AMDGPU::V_MAX_U32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::V_XOR_B32_e64:
    if (OldOpnd->getImm() == 0)
      return true;
    break;
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_MIN_U32_e32:
  case AMDGPU::V_MIN_U32_e64:
    if (static_cast<uint32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<uint32_t>::max())
      return true;
    break;
  case AMDGPU::V_MIN_I32_e32:
  case AMDGPU::V_MIN_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::max())
      return true;
    break;
  case AMDGPU::V_MAX_I32_e32:
  case AMDGPU::V_MAX_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::min())
      return true;
    break;
  case AMDGPU::V_MUL_I32_I24_e32:
  case AMDGPU::V_MUL_I32_I24_e64:
  case AMDGPU::V_MUL_U32_U24_e32:
  case AMDGPU::V_MUL_U32_U24_e64:
    if (OldOpnd->getImm() == 1)
      return true;
    break;
  }
  return false;
}
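
// Front end for the builder above that implements the second combining
// rule: when bound_ctrl is off and the old value is a known immediate, the
// immediate must be the identity for OrigMI's operation and src1 becomes
// the combined old operand.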
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair CombOldVGPR,
                                           MachineOperand *OldOpndValue,
                                           bool CombBCZ) const {
  assert(CombOldVGPR.Reg);
  if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (!Src1 || !Src1->isReg()) {
      LLVM_DEBUG(dbgs() << "  failed: no src1 or it isn't a register\n");
      return nullptr;
    }
    if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
      LLVM_DEBUG(dbgs() << "  failed: old immediate isn't an identity\n");
      return nullptr;
    }
    CombOldVGPR = getRegSubRegPair(*Src1);
    if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) {
      LLVM_DEBUG(dbgs() << "  failed: src1 isn't a VGPR32 register\n");
      return nullptr;
    }
  }
  return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ);
}

// returns true if MI doesn't have an immediate operand named OpndName, or
// if that operand, masked with Mask, equals Value
bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
                                    int64_t Value, int64_t Mask) const {
  auto *Imm = TII->getNamedOperand(MI, OpndName);
  if (!Imm)
    return true;

  assert(Imm->isImm());
  return (Imm->getImm() & Mask) == Value;
}
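
// Tries to combine one V_MOV_B32_dpp with all uses of its result. The
// replacement DPP instructions are built speculatively; if any use cannot
// be combined, the whole sequence is rolled back.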
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

  auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
  assert(DstOpnd && DstOpnd->isReg());
  auto DPPMovReg = DstOpnd->getReg();
  if (DPPMovReg.isPhysical()) {
    LLVM_DEBUG(dbgs() << "  failed: dpp move writes physreg\n");
    return false;
  }
  if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
    LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                         " for all uses\n");
    return false;
  }

  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
                            BankMaskOpnd->getImm() == 0xF;

  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
  assert(BCZOpnd && BCZOpnd->isImm());
  bool BoundCtrlZero = BCZOpnd->getImm();

  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
  assert(OldOpnd && OldOpnd->isReg());
  assert(SrcOpnd && SrcOpnd->isReg());
  if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) {
    LLVM_DEBUG(dbgs() << "  failed: dpp move reads physreg\n");
    return false;
  }

  auto * const OldOpndValue = getOldOpndValue(*OldOpnd);
  // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else
  // We could use: assert(!OldOpndValue || OldOpndValue->isImm())
  // but the third option is used to distinguish undef from non-immediate
  // to reuse IMPLICIT_DEF instruction later
  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);

  bool CombBCZ = false;

  if (MaskAllLanes && BoundCtrlZero) { // [1]
    CombBCZ = true;
  } else {
    if (!OldOpndValue || !OldOpndValue->isImm()) {
      LLVM_DEBUG(dbgs() << "  failed: the DPP mov isn't combinable\n");
      return false;
    }

    if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) {
      LLVM_DEBUG(dbgs() <<
        "  failed: old reg def and mov should be in the same BB\n");
      return false;
    }

    if (OldOpndValue->getImm() == 0) {
      if (MaskAllLanes) {
        assert(!BoundCtrlZero); // by check [1]
        CombBCZ = true;
      }
    } else if (BoundCtrlZero) {
      assert(!MaskAllLanes); // by check [1]
      LLVM_DEBUG(dbgs() <<
        "  failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "  old=";
    if (!OldOpndValue)
      dbgs() << "undef";
    else
      dbgs() << *OldOpndValue;
    dbgs() << ", bound_ctrl=" << CombBCZ << '\n');
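
  // OrigMIs collects the instructions to erase on success (the mov and the
  // combined uses), DPPMIs the newly built ones to erase on rollback, and
  // RegSeqWithOpNos the REG_SEQUENCE operands that forwarded the DPP value
  // and need fixup afterwards.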
  SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
  DenseMap<MachineInstr*, SmallVector<unsigned, 4>> RegSeqWithOpNos;
  auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
  // try to reuse the previous old reg if it's undefined (IMPLICIT_DEF)
  if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
    CombOldVGPR = RegSubRegPair(
      MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  }

  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
  SmallVector<MachineOperand*, 16> Uses;

  for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
    Uses.push_back(&Use);
  }

  while (!Uses.empty()) {
    MachineOperand *Use = Uses.pop_back_val();
    Rollback = true;

    auto &OrigMI = *Use->getParent();
    LLVM_DEBUG(dbgs() << "  try: " << OrigMI);

    auto OrigOp = OrigMI.getOpcode();
    if (OrigOp == AMDGPU::REG_SEQUENCE) {
      Register FwdReg = OrigMI.getOperand(0).getReg();
      unsigned FwdSubReg = 0;

      if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) {
        LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                             " for all uses\n");
        break;
      }

      unsigned OpNo, E = OrigMI.getNumOperands();
      for (OpNo = 1; OpNo < E; OpNo += 2) {
        if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) {
          FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm();
          break;
        }
      }

      if (!FwdSubReg)
        break;

      // forward the DPP value through the REG_SEQUENCE: queue all uses of
      // the matching subregister and remember the operand for later fixup
      for (auto &Op : MRI->use_nodbg_operands(FwdReg)) {
        if (Op.getSubReg() == FwdSubReg)
          Uses.push_back(&Op);
      }
      RegSeqWithOpNos[&OrigMI].push_back(OpNo);

      Rollback = false;
      continue;
    }

    if (TII->isVOP3(OrigOp)) {
      if (!TII->hasVALU32BitEncoding(OrigOp)) {
        LLVM_DEBUG(dbgs() << "  failed: VOP3 hasn't e32 equivalent\n");
        break;
      }
      // check if other than abs|neg modifiers are set (opsel for example)
      const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
      if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
        LLVM_DEBUG(dbgs() << "  failed: VOP3 has non-default modifiers\n");
        break;
      }
    } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
      LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3\n");
      break;
    }

    LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
    if (Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
                                        OldOpndValue, CombBCZ)) {
        DPPMIs.push_back(DPPInst);
        Rollback = false;
      }
    } else if (OrigMI.isCommutable() &&
               Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
      auto *BB = OrigMI.getParent();
      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        LLVM_DEBUG(dbgs() << "  commuted: " << *NewMI);
        if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR,
                                          OldOpndValue, CombBCZ)) {
          DPPMIs.push_back(DPPInst);
          Rollback = false;
        }
      } else
        LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
      // the commuted clone was only a template for the DPP instruction
      NewMI->eraseFromParent();
    } else
      LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");

    if (Rollback)
      break;
    OrigMIs.push_back(&OrigMI);
  }

  Rollback |= !Uses.empty();

  for (auto *MI : *(Rollback ? &DPPMIs : &OrigMIs))
    MI->eraseFromParent();

  if (!Rollback) {
    for (auto &S : RegSeqWithOpNos) {
      if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
        S.first->eraseFromParent();
        continue;
      }
      while (!S.second.empty())
        S.first->getOperand(S.second.pop_back_val()).setIsUndef(true);
    }
  }

  return !Rollback;
}
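
// Runs the combine over every block, iterating instructions in reverse
// order. A V_MOV_B64_DPP_PSEUDO is first expanded into two 32-bit DPP movs,
// each of which is then combined independently.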
bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
  auto &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.hasDPP() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TII = ST.getInstrInfo();

  assert(MRI->isSSA() && "Must be run on SSA");

  bool Changed = false;
  for (auto &MBB : MF) {
    for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
      auto &MI = *I++;
      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
        Changed = true;
        ++NumDPPMovsCombined;
      } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
        auto Split = TII->expandMovDPP64(MI);
        for (auto M : {Split.first, Split.second}) {
          if (combineDPPMov(*M))
            ++NumDPPMovsCombined;
        }
        Changed = true;
      }
    }
  }
  return Changed;
}