//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
// operand. If any of the use instructions cannot be combined with the mov the
// whole sequence is reverted.
//
// $old = ...
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
//                            dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
// $res = VALU $dpp_value [, src1]
//
// to
//
// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
//                 dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
//
// Combining rules:
//
// if $row_mask and $bank_mask are fully enabled (0xF) and
//    $bound_ctrl==DPP_BOUND_ZERO or $old==0
// -> $combined_old = undef,
//    $combined_bound_ctrl = DPP_BOUND_ZERO
//
// if the VALU op is binary and
//    $bound_ctrl==DPP_BOUND_OFF and
//    $old==identity value (immediate) for the VALU op
// -> $combined_old = src1,
//    $combined_bound_ctrl = DPP_BOUND_OFF
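//
// For example, under the second rule (an illustrative sketch in the notation
// above, not a real test case):
//
// $old = V_MOV_B32_e32 0
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
//                            dpp_controls..., 0xF, 0xF, DPP_BOUND_OFF
// $res = V_ADD_U32_e32 $dpp_value, $src1
//
// combines to
//
// $res = V_ADD_U32_dpp $src1, $vgpr_to_be_read_from_other_lane, $src1,
//                      dpp_controls..., 0xF, 0xF, DPP_BOUND_OFF
//
// since 0 is the identity for the add: lanes whose cross-lane read is
// invalid produce $src1 either way.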
//
// Otherwise cancel.
//
// The mov_dpp instruction should reside in the same BB as all its uses.
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Pass.h"
#include <cassert>
#include <limits>

using namespace llvm;

#define DEBUG_TYPE "gcn-dpp-combine"

STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");

namespace {

class GCNDPPCombine : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;

  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI,
                              MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              MachineOperand *OldOpnd,
                              bool CombBCZ) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI,
                              MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              bool CombBCZ) const;

  bool hasNoImmOrEqual(MachineInstr &MI,
                       unsigned OpndName,
                       int64_t Value,
                       int64_t Mask = -1) const;

  bool combineDPPMov(MachineInstr &MI) const;

public:
  static char ID;

  GCNDPPCombine() : MachineFunctionPass(ID) {
    initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN DPP Combine"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)

char GCNDPPCombine::ID = 0;

char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;

FunctionPass *llvm::createGCNDPPCombinePass() {
  return new GCNDPPCombine();
}
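
// Maps a VALU opcode to its DPP variant; if Op has no direct DPP form, tries
// the DPP form of its 32-bit (e32) equivalent. Returns -1 if neither exists.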
static int getDPPOp(unsigned Op) {
  auto DPP32 = AMDGPU::getDPPOp32(Op);
  if (DPP32 != -1)
    return DPP32;

  auto E32 = AMDGPU::getVOPe32(Op);
  return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1;
}

// tracks the register operand definition and returns:
// 1. immediate operand used to initialize the register if found
// 2. nullptr if the register operand is undef
// 3. the operand itself otherwise
MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
  auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
  if (!Def)
    return nullptr;

  switch(Def->getOpcode()) {
  default: break;
  case AMDGPU::IMPLICIT_DEF:
    return nullptr;
  case AMDGPU::COPY:
  case AMDGPU::V_MOV_B32_e32: {
    auto &Op1 = Def->getOperand(1);
    if (Op1.isImm())
      return &Op1;
    break;
  }
  }
  return &OldOpnd;
}
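
// Constructs the combined instruction: OrigMI's opcode in DPP form, with
// OrigMI's vdst and source modifiers, CombOldVGPR as the 'old' operand,
// src0 taken from MovMI, and MovMI's DPP control operands appended. On any
// illegal operand the partially built instruction is erased and nullptr is
// returned.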
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair CombOldVGPR,
                                           bool CombBCZ) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
  assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
         TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());

  auto OrigOp = OrigMI.getOpcode();
  auto DPPOp = getDPPOp(OrigOp);
  if (DPPOp == -1) {
    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
    return nullptr;
  }

  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
                         OrigMI.getDebugLoc(), TII->get(DPPOp));
  bool Fail = false;
  do {
    auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
    assert(Dst);
    DPPInst.add(*Dst);
    int NumOperands = 1;

    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
    if (OldIdx != -1) {
      assert(OldIdx == NumOperands);
      assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
      DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg);
      ++NumOperands;
    } else {
      // TODO: this discards MAC/FMA instructions for now, let's add it later
      LLVM_DEBUG(dbgs() << "  failed: no old operand in DPP instruction,"
                           " TBD\n");
      Fail = true;
      break;
    }

    if (auto *Mod0 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src0_modifiers)) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src0_modifiers));
      assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
      DPPInst.addImm(Mod0->getImm());
      ++NumOperands;
    } else if (AMDGPU::getNamedOperandIdx(DPPOp,
                   AMDGPU::OpName::src0_modifiers) != -1) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
    assert(Src0);
    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
      Fail = true;
      break;
    }
    DPPInst.add(*Src0);
    DPPInst->getOperand(NumOperands).setIsKill(false);
    ++NumOperands;

    if (auto *Mod1 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src1_modifiers)) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src1_modifiers));
      assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
      DPPInst.addImm(Mod1->getImm());
      ++NumOperands;
    } else if (AMDGPU::getNamedOperandIdx(DPPOp,
                   AMDGPU::OpName::src1_modifiers) != -1) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src1);
      ++NumOperands;
    }

    if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src2);
    }

    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
    DPPInst.addImm(CombBCZ ? 1 : 0);
  } while (false);

  if (Fail) {
    DPPInst.getInstr()->eraseFromParent();
    return nullptr;
  }
  LLVM_DEBUG(dbgs() << "  combined:  " << *DPPInst.getInstr());
  return DPPInst.getInstr();
}
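
// Returns true if the immediate in OldOpnd is the identity element for
// OrigMIOp (e.g. 0 for add/or/xor/subrev, all ones for and/umin, 1 for the
// 24-bit muls), which is what makes the 'old == identity' combining rule
// from the file header applicable.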
static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
  assert(OldOpnd->isImm());
  switch (OrigMIOp) {
  default: break;
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_I32_e32:
  case AMDGPU::V_ADD_I32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_SUBREV_U32_e32:
  case AMDGPU::V_SUBREV_U32_e64:
  case AMDGPU::V_SUBREV_I32_e32:
  case AMDGPU::V_SUBREV_I32_e64:
  case AMDGPU::V_MAX_U32_e32:
  case AMDGPU::V_MAX_U32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::V_XOR_B32_e64:
    if (OldOpnd->getImm() == 0)
      return true;
    break;
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_MIN_U32_e32:
  case AMDGPU::V_MIN_U32_e64:
    if (static_cast<uint32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<uint32_t>::max())
      return true;
    break;
  case AMDGPU::V_MIN_I32_e32:
  case AMDGPU::V_MIN_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::max())
      return true;
    break;
  case AMDGPU::V_MAX_I32_e32:
  case AMDGPU::V_MAX_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::min())
      return true;
    break;
  case AMDGPU::V_MUL_I32_I24_e32:
  case AMDGPU::V_MUL_I32_I24_e64:
  case AMDGPU::V_MUL_U32_U24_e32:
  case AMDGPU::V_MUL_U32_U24_e64:
    if (OldOpnd->getImm() == 1)
      return true;
    break;
  }
  return false;
}
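
// Front end for the combining rules: for the identity-value case (rule [2]
// in the file header) validate that src1 is a VGPR32 and the old immediate
// is an identity for OrigMI's opcode, then substitute src1 as the combined
// old value before delegating to the builder above.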
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair CombOldVGPR,
                                           MachineOperand *OldOpndValue,
                                           bool CombBCZ) const {
  assert(CombOldVGPR.Reg);
  if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (!Src1 || !Src1->isReg()) {
      LLVM_DEBUG(dbgs() << "  failed: no src1 or it isn't a register\n");
      return nullptr;
    }
    if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
      LLVM_DEBUG(dbgs() << "  failed: old immediate isn't an identity\n");
      return nullptr;
    }
    CombOldVGPR = getRegSubRegPair(*Src1);
    if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) {
      LLVM_DEBUG(dbgs() << "  failed: src1 isn't a VGPR32 register\n");
      return nullptr;
    }
  }
  return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ);
}

// returns true if MI doesn't have OpndName immediate operand or the
// operand has Value
bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
                                    int64_t Value, int64_t Mask) const {
  auto *Imm = TII->getNamedOperand(MI, OpndName);
  if (!Imm)
    return true;

  assert(Imm->isImm());
  return (Imm->getImm() & Mask) == Value;
}

bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

  auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
  assert(DstOpnd && DstOpnd->isReg());
  auto DPPMovReg = DstOpnd->getReg();
  if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
    LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                         " for all uses\n");
    return false;
  }

  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
                            BankMaskOpnd->getImm() == 0xF;

  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
  assert(BCZOpnd && BCZOpnd->isImm());
  bool BoundCtrlZero = BCZOpnd->getImm();

  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  assert(OldOpnd && OldOpnd->isReg());

  auto * const OldOpndValue = getOldOpndValue(*OldOpnd);
  // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else
  // We could use: assert(!OldOpndValue || OldOpndValue->isImm())
  // but the third option is used to distinguish undef from non-immediate
  // to reuse IMPLICIT_DEF instruction later
  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);

  bool CombBCZ = false;
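
  // Case [1]: every lane enabled (masks 0xF) with bound_ctrl:0 - the old
  // value can never be read, so the combined old may be undef. Otherwise
  // the identity-value rule applies and requires an immediate old defined
  // in the same block.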
  if (MaskAllLanes && BoundCtrlZero) { // [1]
    CombBCZ = true;
  } else {
    if (!OldOpndValue || !OldOpndValue->isImm()) {
      LLVM_DEBUG(dbgs() << "  failed: the DPP mov isn't combinable\n");
      return false;
    }

    if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) {
      LLVM_DEBUG(dbgs() <<
        "  failed: old reg def and mov should be in the same BB\n");
      return false;
    }

    if (OldOpndValue->getImm() == 0) {
      if (MaskAllLanes) {
        assert(!BoundCtrlZero); // by check [1]
        CombBCZ = true;
      }
    } else if (BoundCtrlZero) {
      assert(!MaskAllLanes); // by check [1]
      LLVM_DEBUG(dbgs() <<
        "  failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "  old=";
    if (!OldOpndValue)
      dbgs() << "undef";
    else
      dbgs() << *OldOpndValue;
    dbgs() << ", bound_ctrl=" << CombBCZ << '\n');
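
  // Rollback bookkeeping: DPPMIs collects everything this combine creates,
  // OrigMIs everything it makes dead; exactly one of the two lists is
  // erased at the end.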
  SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
  auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
  // try to reuse previous old reg if it's undefined (IMPLICIT_DEF)
  if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
    CombOldVGPR = RegSubRegPair(
      MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  }

  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
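  // Try to rewrite each non-debug use of the DPP mov's result; any single
  // failure aborts the loop and forces a full rollback.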
  for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
    Rollback = true;

    auto &OrigMI = *Use.getParent();
    LLVM_DEBUG(dbgs() << "  try: " << OrigMI);

    auto OrigOp = OrigMI.getOpcode();
    if (TII->isVOP3(OrigOp)) {
      if (!TII->hasVALU32BitEncoding(OrigOp)) {
        LLVM_DEBUG(dbgs() << "  failed: VOP3 hasn't e32 equivalent\n");
        break;
      }
      // check if other than abs|neg modifiers are set (opsel for example)
      const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
      if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
        LLVM_DEBUG(dbgs() << "  failed: VOP3 has non-default modifiers\n");
        break;
      }
    } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
      LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3\n");
      break;
    }

    LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
    if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
                                        OldOpndValue, CombBCZ)) {
        DPPMIs.push_back(DPPInst);
        Rollback = false;
      }
    } else if (OrigMI.isCommutable() &&
               &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
      auto *BB = OrigMI.getParent();
      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        LLVM_DEBUG(dbgs() << "  commuted:  " << *NewMI);
        if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR,
                                          OldOpndValue, CombBCZ)) {
          DPPMIs.push_back(DPPInst);
          Rollback = false;
        }
      } else
        LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
      NewMI->eraseFromParent();
    } else
      LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");
    if (Rollback)
      break;
    OrigMIs.push_back(&OrigMI);
  }
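
  // On failure erase the new DPP instructions; on success erase the
  // now-dead originals, including the DPP mov itself.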
  for (auto *MI : *(Rollback ? &DPPMIs : &OrigMIs))
    MI->eraseFromParent();

  return !Rollback;
}
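
// Pass entry point: scan each block's instructions in reverse order and try
// to combine every V_MOV_B32_dpp with its uses.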
bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
  auto &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.hasDPP() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TII = ST.getInstrInfo();

  assert(MRI->isSSA() && "Must be run on SSA");

  bool Changed = false;
  for (auto &MBB : MF) {
    for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
      auto &MI = *I++;
      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
        Changed = true;
        ++NumDPPMovsCombined;
      }
    }
  }
  return Changed;
}