1 //=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
8 // The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
9 // operand. If any of the use instructions cannot be combined with the mov, the
10 // whole sequence is reverted.
13 // $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
14 // dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
15 // $res = VALU $dpp_value [, src1]
19 // $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
20 // dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
24 // if $row_mask and $bank_mask are fully enabled (0xF) and
25 // $bound_ctrl==DPP_BOUND_ZERO or $old==0
26 // -> $combined_old = undef,
27 // $combined_bound_ctrl = DPP_BOUND_ZERO
29 // if the VALU op is binary and
30 // $bound_ctrl==DPP_BOUND_OFF and
31 // $old==identity value (immediate) for the VALU op
32 // -> $combined_old = src1,
33 // $combined_bound_ctrl = DPP_BOUND_OFF
37 // The mov_dpp instruction should reside in the same BB as all its uses
38 //===----------------------------------------------------------------------===//
41 #include "AMDGPUSubtarget.h"
42 #include "SIInstrInfo.h"
43 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
44 #include "llvm/ADT/SmallVector.h"
45 #include "llvm/ADT/Statistic.h"
46 #include "llvm/CodeGen/MachineBasicBlock.h"
47 #include "llvm/CodeGen/MachineFunction.h"
48 #include "llvm/CodeGen/MachineFunctionPass.h"
49 #include "llvm/CodeGen/MachineInstr.h"
50 #include "llvm/CodeGen/MachineInstrBuilder.h"
51 #include "llvm/CodeGen/MachineOperand.h"
52 #include "llvm/CodeGen/MachineRegisterInfo.h"
53 #include "llvm/CodeGen/TargetRegisterInfo.h"
54 #include "llvm/Pass.h"
59 #define DEBUG_TYPE "gcn-dpp-combine"
61 STATISTIC(NumDPPMovsCombined
, "Number of DPP moves combined.");
// Machine-function pass that folds a V_MOV_B32_dpp into its VALU users,
// producing *_dpp forms of those users (see the file header for the rules).
// NOTE(review): this copy appears truncated — the enclosing `namespace {`
// opener, access specifiers, and the `static char ID;` member referenced by
// the constructor below are not visible; verify against upstream.
65 class GCNDPPCombine
: public MachineFunctionPass
{
// Cached per-function register info, set in runOnMachineFunction.
66 MachineRegisterInfo
*MRI
;
// Cached target instruction info, set in runOnMachineFunction.
67 const SIInstrInfo
*TII
;
69 using RegSubRegPair
= TargetInstrInfo::RegSubRegPair
;
// Resolves the DPP mov's `old` operand to its defining value: an immediate,
// nullptr (undef/IMPLICIT_DEF), or the operand itself (see definition below).
71 MachineOperand
*getOldOpndValue(MachineOperand
&OldOpnd
) const;
// Two createDPPInst overloads: the longer one pre-checks the identity-value
// case and then delegates to the shorter one, which builds the *_dpp MI.
73 MachineInstr
*createDPPInst(MachineInstr
&OrigMI
,
75 RegSubRegPair CombOldVGPR
,
76 MachineOperand
*OldOpnd
,
79 MachineInstr
*createDPPInst(MachineInstr
&OrigMI
,
81 RegSubRegPair CombOldVGPR
,
// True if MI lacks the named immediate operand, or (Imm & Mask) == Value.
84 bool hasNoImmOrEqual(MachineInstr
&MI
,
87 int64_t Mask
= -1) const;
// Attempts the combine for one V_MOV_B32_dpp; returns true on success.
89 bool combineDPPMov(MachineInstr
&MI
) const;
94 GCNDPPCombine() : MachineFunctionPass(ID
) {
95 initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
98 bool runOnMachineFunction(MachineFunction
&MF
) override
;
100 StringRef
getPassName() const override
{ return "GCN DPP Combine"; }
// This pass rewrites instructions in place and never alters the CFG.
102 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
103 AU
.setPreservesCFG();
104 MachineFunctionPass::getAnalysisUsage(AU
);
108 } // end anonymous namespace
// Standard LLVM pass registration boilerplate: registers the pass under
// DEBUG_TYPE and defines the unique ID address used by the pass manager.
110 INITIALIZE_PASS(GCNDPPCombine
, DEBUG_TYPE
, "GCN DPP Combine", false, false)
112 char GCNDPPCombine::ID
= 0;
114 char &llvm::GCNDPPCombineID
= GCNDPPCombine::ID
;
// Factory used by the AMDGPU target pipeline to instantiate this pass.
// NOTE(review): the closing brace of this function is missing in this copy.
116 FunctionPass
*llvm::createGCNDPPCombinePass() {
117 return new GCNDPPCombine();
// Maps a VALU opcode to its DPP variant: first tries the direct 32-bit DPP
// mapping, then falls back through the e32 equivalent; -1 if none exists.
// NOTE(review): lines appear to be missing between the DPP32 lookup and the
// E32 fallback (presumably an early `return DPP32` on success) — verify
// against upstream before relying on this body.
120 static int getDPPOp(unsigned Op
) {
121 auto DPP32
= AMDGPU::getDPPOp32(Op
);
125 auto E32
= AMDGPU::getVOPe32(Op
);
126 return E32
!= -1 ? AMDGPU::getDPPOp32(E32
) : -1;
129 // tracks the register operand definition and returns:
130 // 1. immediate operand used to initialize the register if found
131 // 2. nullptr if the register operand is undef
132 // 3. the operand itself otherwise
// NOTE(review): the switch body below is truncated in this copy — the
// return statements for each case (and the default) are not visible.
133 MachineOperand
*GCNDPPCombine::getOldOpndValue(MachineOperand
&OldOpnd
) const {
// Walk to the SSA definition of the old-operand vreg.
134 auto *Def
= getVRegSubRegDef(getRegSubRegPair(OldOpnd
), *MRI
);
138 switch(Def
->getOpcode()) {
// IMPLICIT_DEF means the old value is undef (case 2 above).
140 case AMDGPU::IMPLICIT_DEF
:
// A plain mov may supply an immediate initializer (case 1 above).
143 case AMDGPU::V_MOV_B32_e32
: {
144 auto &Op1
= Def
->getOperand(1);
// Builds the combined *_dpp instruction in front of OrigMI: copies vdst,
// supplies CombOldVGPR as the `old` operand, takes src0 and the DPP control
// operands (dpp_ctrl/row_mask/bank_mask) from the mov, and src modifiers,
// src1/src2 from OrigMI. Returns the new MI, or null (after erasing the
// partial instruction) when an operand turns out to be illegal.
// NOTE(review): this copy is missing lines — the `MachineInstr &MovMI`
// parameter used throughout the body does not appear in the visible
// parameter list, and `NumOperands` has no visible declaration/updates;
// verify against upstream.
153 MachineInstr
*GCNDPPCombine::createDPPInst(MachineInstr
&OrigMI
,
155 RegSubRegPair CombOldVGPR
,
156 bool CombBCZ
) const {
// Precondition: MovMI is the DPP mov whose vdst feeds OrigMI's src0.
157 assert(MovMI
.getOpcode() == AMDGPU::V_MOV_B32_dpp
);
158 assert(TII
->getNamedOperand(MovMI
, AMDGPU::OpName::vdst
)->getReg() ==
159 TII
->getNamedOperand(OrigMI
, AMDGPU::OpName::src0
)->getReg());
161 auto OrigOp
= OrigMI
.getOpcode();
162 auto DPPOp
= getDPPOp(OrigOp
);
164 LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
// Start the new instruction at OrigMI's position with OrigMI's debug loc.
168 auto DPPInst
= BuildMI(*OrigMI
.getParent(), OrigMI
,
169 OrigMI
.getDebugLoc(), TII
->get(DPPOp
));
172 auto *Dst
= TII
->getNamedOperand(OrigMI
, AMDGPU::OpName::vdst
);
// The `old` operand must come right after the operands added so far.
177 const int OldIdx
= AMDGPU::getNamedOperandIdx(DPPOp
, AMDGPU::OpName::old
);
179 assert(OldIdx
== NumOperands
);
180 assert(isOfRegClass(CombOldVGPR
, AMDGPU::VGPR_32RegClass
, *MRI
));
181 DPPInst
.addReg(CombOldVGPR
.Reg
, 0, CombOldVGPR
.SubReg
);
184 // TODO: this discards MAC/FMA instructions for now, let's add it later
185 LLVM_DEBUG(dbgs() << " failed: no old operand in DPP instruction,"
// src0 modifiers are copied only if restricted to abs/neg (asserted).
191 if (auto *Mod0
= TII
->getNamedOperand(OrigMI
,
192 AMDGPU::OpName::src0_modifiers
)) {
193 assert(NumOperands
== AMDGPU::getNamedOperandIdx(DPPOp
,
194 AMDGPU::OpName::src0_modifiers
));
195 assert(0LL == (Mod0
->getImm() & ~(SISrcMods::ABS
| SISrcMods::NEG
)));
196 DPPInst
.addImm(Mod0
->getImm());
// src0 of the combined op is the mov's src0 (the cross-lane source).
199 auto *Src0
= TII
->getNamedOperand(MovMI
, AMDGPU::OpName::src0
);
201 if (!TII
->isOperandLegal(*DPPInst
.getInstr(), NumOperands
, Src0
)) {
202 LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
// The source register now has more users; it must not be marked killed.
207 DPPInst
->getOperand(NumOperands
).setIsKill(false);
// src1 modifiers: same abs/neg-only restriction as src0 above.
210 if (auto *Mod1
= TII
->getNamedOperand(OrigMI
,
211 AMDGPU::OpName::src1_modifiers
)) {
212 assert(NumOperands
== AMDGPU::getNamedOperandIdx(DPPOp
,
213 AMDGPU::OpName::src1_modifiers
));
214 assert(0LL == (Mod1
->getImm() & ~(SISrcMods::ABS
| SISrcMods::NEG
)));
215 DPPInst
.addImm(Mod1
->getImm());
218 if (auto *Src1
= TII
->getNamedOperand(OrigMI
, AMDGPU::OpName::src1
)) {
219 if (!TII
->isOperandLegal(*DPPInst
.getInstr(), NumOperands
, Src1
)) {
220 LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
228 if (auto *Src2
= TII
->getNamedOperand(OrigMI
, AMDGPU::OpName::src2
)) {
229 if (!TII
->isOperandLegal(*DPPInst
.getInstr(), NumOperands
, Src2
)) {
230 LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n");
// Tail operands are copied verbatim from the mov; bound_ctrl comes from
// the caller's combined decision (CombBCZ).
237 DPPInst
.add(*TII
->getNamedOperand(MovMI
, AMDGPU::OpName::dpp_ctrl
));
238 DPPInst
.add(*TII
->getNamedOperand(MovMI
, AMDGPU::OpName::row_mask
));
239 DPPInst
.add(*TII
->getNamedOperand(MovMI
, AMDGPU::OpName::bank_mask
));
240 DPPInst
.addImm(CombBCZ
? 1 : 0);
// Failure path: remove the partially-built instruction.
244 DPPInst
.getInstr()->eraseFromParent();
247 LLVM_DEBUG(dbgs() << " combined: " << *DPPInst
.getInstr());
248 return DPPInst
.getInstr();
// Returns whether the immediate in OldOpnd is the identity element of the
// binary operation OrigMIOp (so using src1 as the combined `old` is sound
// when bound_ctrl is off — see the file header, case 2).
// NOTE(review): this copy is truncated — the `switch` statement line, the
// `return true` / `break` lines inside each case, and the default/final
// return are not visible; verify against upstream.
251 static bool isIdentityValue(unsigned OrigMIOp
, MachineOperand
*OldOpnd
) {
252 assert(OldOpnd
->isImm());
// Identity 0: add, or, subrev, unsigned max, xor.
255 case AMDGPU::V_ADD_U32_e32
:
256 case AMDGPU::V_ADD_U32_e64
:
257 case AMDGPU::V_ADD_I32_e32
:
258 case AMDGPU::V_ADD_I32_e64
:
259 case AMDGPU::V_OR_B32_e32
:
260 case AMDGPU::V_OR_B32_e64
:
261 case AMDGPU::V_SUBREV_U32_e32
:
262 case AMDGPU::V_SUBREV_U32_e64
:
263 case AMDGPU::V_SUBREV_I32_e32
:
264 case AMDGPU::V_SUBREV_I32_e64
:
265 case AMDGPU::V_MAX_U32_e32
:
266 case AMDGPU::V_MAX_U32_e64
:
267 case AMDGPU::V_XOR_B32_e32
:
268 case AMDGPU::V_XOR_B32_e64
:
269 if (OldOpnd
->getImm() == 0)
// Identity all-ones (UINT32_MAX): and, unsigned min.
272 case AMDGPU::V_AND_B32_e32
:
273 case AMDGPU::V_AND_B32_e64
:
274 case AMDGPU::V_MIN_U32_e32
:
275 case AMDGPU::V_MIN_U32_e64
:
276 if (static_cast<uint32_t>(OldOpnd
->getImm()) ==
277 std::numeric_limits
<uint32_t>::max())
// Identity INT32_MAX: signed min.
280 case AMDGPU::V_MIN_I32_e32
:
281 case AMDGPU::V_MIN_I32_e64
:
282 if (static_cast<int32_t>(OldOpnd
->getImm()) ==
283 std::numeric_limits
<int32_t>::max())
// Identity INT32_MIN: signed max.
286 case AMDGPU::V_MAX_I32_e32
:
287 case AMDGPU::V_MAX_I32_e64
:
288 if (static_cast<int32_t>(OldOpnd
->getImm()) ==
289 std::numeric_limits
<int32_t>::min())
// Identity 1: 24-bit multiplies.
292 case AMDGPU::V_MUL_I32_I24_e32
:
293 case AMDGPU::V_MUL_I32_I24_e64
:
294 case AMDGPU::V_MUL_U32_U24_e32
:
295 case AMDGPU::V_MUL_U32_U24_e64
:
296 if (OldOpnd
->getImm() == 1)
// Front-end overload of createDPPInst: when bound_ctrl is off and the old
// value is an immediate, it is only combinable if that immediate is the
// identity of OrigMI's operation — in which case src1 becomes the combined
// `old` register. Delegates the actual instruction build to the overload
// above.
// NOTE(review): as with the other overload, the `MachineInstr &MovMI`
// parameter used at the delegation call is not visible in this truncated
// parameter list; verify against upstream.
303 MachineInstr
*GCNDPPCombine::createDPPInst(MachineInstr
&OrigMI
,
305 RegSubRegPair CombOldVGPR
,
306 MachineOperand
*OldOpndValue
,
307 bool CombBCZ
) const {
308 assert(CombOldVGPR
.Reg
);
// Identity-value path: only binary ops with a register src1 qualify.
309 if (!CombBCZ
&& OldOpndValue
&& OldOpndValue
->isImm()) {
310 auto *Src1
= TII
->getNamedOperand(OrigMI
, AMDGPU::OpName::src1
);
311 if (!Src1
|| !Src1
->isReg()) {
312 LLVM_DEBUG(dbgs() << " failed: no src1 or it isn't a register\n");
315 if (!isIdentityValue(OrigMI
.getOpcode(), OldOpndValue
)) {
316 LLVM_DEBUG(dbgs() << " failed: old immediate isn't an identity\n");
// Replace the combined `old` with src1 (header case 2: $combined_old = src1).
319 CombOldVGPR
= getRegSubRegPair(*Src1
);
320 if (!isOfRegClass(CombOldVGPR
, AMDGPU::VGPR_32RegClass
, *MRI
)) {
321 LLVM_DEBUG(dbgs() << " failed: src1 isn't a VGPR32 register\n");
325 return createDPPInst(OrigMI
, MovMI
, CombOldVGPR
, CombBCZ
);
328 // returns true if MI doesn't have OpndName immediate operand or the
// operand's value, masked by Mask, equals Value. Mask defaults to -1
// (all bits), i.e. an exact compare.
// NOTE(review): the early `return true` for the missing-operand case is not
// visible in this truncated copy; verify against upstream.
330 bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr
&MI
, unsigned OpndName
,
331 int64_t Value
, int64_t Mask
) const {
332 auto *Imm
= TII
->getNamedOperand(MI
, OpndName
);
336 assert(Imm
->isImm());
337 return (Imm
->getImm() & Mask
) == Value
;
// Core of the pass: given one V_MOV_B32_dpp, decide whether the DPP can be
// folded into every VALU use of its result (header cases 1 and 2). New
// *_dpp instructions are built speculatively; if any use fails, the new
// instructions are erased (rollback), otherwise the originals are erased.
// NOTE(review): many lines are missing from this copy (early returns,
// `continue`s, `Rollback = false`, closing braces, the final return);
// comments below describe only what the visible lines establish.
340 bool GCNDPPCombine::combineDPPMov(MachineInstr
&MovMI
) const {
341 assert(MovMI
.getOpcode() == AMDGPU::V_MOV_B32_dpp
);
342 LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI
);
344 auto *DstOpnd
= TII
->getNamedOperand(MovMI
, AMDGPU::OpName::vdst
);
345 assert(DstOpnd
&& DstOpnd
->isReg());
346 auto DPPMovReg
= DstOpnd
->getReg();
// The DPP result is only valid under the EXEC mask active at the mov; bail
// out if EXEC may change before any use.
347 if (execMayBeModifiedBeforeAnyUse(*MRI
, DPPMovReg
, MovMI
)) {
348 LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
// Gather the mov's row_mask/bank_mask/bound_ctrl/old operands.
353 auto *RowMaskOpnd
= TII
->getNamedOperand(MovMI
, AMDGPU::OpName::row_mask
);
354 assert(RowMaskOpnd
&& RowMaskOpnd
->isImm());
355 auto *BankMaskOpnd
= TII
->getNamedOperand(MovMI
, AMDGPU::OpName::bank_mask
);
356 assert(BankMaskOpnd
&& BankMaskOpnd
->isImm());
// Fully-enabled masks (0xF/0xF) are required for the bound_ctrl:0 case.
357 const bool MaskAllLanes
= RowMaskOpnd
->getImm() == 0xF &&
358 BankMaskOpnd
->getImm() == 0xF;
360 auto *BCZOpnd
= TII
->getNamedOperand(MovMI
, AMDGPU::OpName::bound_ctrl
);
361 assert(BCZOpnd
&& BCZOpnd
->isImm());
362 bool BoundCtrlZero
= BCZOpnd
->getImm();
364 auto *OldOpnd
= TII
->getNamedOperand(MovMI
, AMDGPU::OpName::old
);
365 assert(OldOpnd
&& OldOpnd
->isReg());
367 auto * const OldOpndValue
= getOldOpndValue(*OldOpnd
);
368 // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else
369 // We could use: assert(!OldOpndValue || OldOpndValue->isImm())
370 // but the third option is used to distinguish undef from non-immediate
371 // to reuse IMPLICIT_DEF instruction later
372 assert(!OldOpndValue
|| OldOpndValue
->isImm() || OldOpndValue
== OldOpnd
);
// CombBCZ: whether the combined instruction will use bound_ctrl:0.
374 bool CombBCZ
= false;
376 if (MaskAllLanes
&& BoundCtrlZero
) { // [1]
// Otherwise the old value matters: it must be a known immediate defined in
// the same BB as the mov, and either 0 (-> bound_ctrl:0) or an identity
// value handled later by createDPPInst.
379 if (!OldOpndValue
|| !OldOpndValue
->isImm()) {
380 LLVM_DEBUG(dbgs() << " failed: the DPP mov isn't combinable\n");
384 if (OldOpndValue
->getParent()->getParent() != MovMI
.getParent()) {
386 " failed: old reg def and mov should be in the same BB\n");
390 if (OldOpndValue
->getImm() == 0) {
392 assert(!BoundCtrlZero
); // by check [1]
395 } else if (BoundCtrlZero
) {
396 assert(!MaskAllLanes
); // by check [1]
398 " failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
403 LLVM_DEBUG(dbgs() << " old=";
407 dbgs() << *OldOpndValue
;
408 dbgs() << ", bound_ctrl=" << CombBCZ
<< '\n');
// Speculation bookkeeping: OrigMIs collects instructions to erase on
// success, DPPMIs collects new instructions to erase on rollback.
410 SmallVector
<MachineInstr
*, 4> OrigMIs
, DPPMIs
;
411 auto CombOldVGPR
= getRegSubRegPair(*OldOpnd
);
412 // try to reuse previous old reg if its undefined (IMPLICIT_DEF)
413 if (CombBCZ
&& OldOpndValue
) { // CombOldVGPR should be undef
// Materialize a fresh undef VGPR to serve as the combined `old`.
414 CombOldVGPR
= RegSubRegPair(
415 MRI
->createVirtualRegister(&AMDGPU::VGPR_32RegClass
));
416 auto UndefInst
= BuildMI(*MovMI
.getParent(), MovMI
, MovMI
.getDebugLoc(),
417 TII
->get(AMDGPU::IMPLICIT_DEF
), CombOldVGPR
.Reg
);
418 DPPMIs
.push_back(UndefInst
.getInstr());
421 OrigMIs
.push_back(&MovMI
);
422 bool Rollback
= true;
// Try to combine every (non-debug) use of the mov's result.
423 for (auto &Use
: MRI
->use_nodbg_operands(DPPMovReg
)) {
426 auto &OrigMI
= *Use
.getParent();
427 LLVM_DEBUG(dbgs() << " try: " << OrigMI
);
429 auto OrigOp
= OrigMI
.getOpcode();
// VOP3 users are acceptable only when shrinkable to e32 with no modifiers
// beyond abs/neg (no opsel, clamp, or omod).
430 if (TII
->isVOP3(OrigOp
)) {
431 if (!TII
->hasVALU32BitEncoding(OrigOp
)) {
432 LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n");
435 // check if other than abs|neg modifiers are set (opsel for example)
436 const int64_t Mask
= ~(SISrcMods::ABS
| SISrcMods::NEG
);
437 if (!hasNoImmOrEqual(OrigMI
, AMDGPU::OpName::src0_modifiers
, 0, Mask
) ||
438 !hasNoImmOrEqual(OrigMI
, AMDGPU::OpName::src1_modifiers
, 0, Mask
) ||
439 !hasNoImmOrEqual(OrigMI
, AMDGPU::OpName::clamp
, 0) ||
440 !hasNoImmOrEqual(OrigMI
, AMDGPU::OpName::omod
, 0)) {
441 LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n");
444 } else if (!TII
->isVOP1(OrigOp
) && !TII
->isVOP2(OrigOp
)) {
445 LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");
449 LLVM_DEBUG(dbgs() << " combining: " << OrigMI
);
// Direct case: the mov's result feeds src0.
450 if (&Use
== TII
->getNamedOperand(OrigMI
, AMDGPU::OpName::src0
)) {
451 if (auto *DPPInst
= createDPPInst(OrigMI
, MovMI
, CombOldVGPR
,
452 OldOpndValue
, CombBCZ
)) {
453 DPPMIs
.push_back(DPPInst
);
// Commuted case: the result feeds src1 of a commutable op — clone,
// commute the clone, and combine that instead.
456 } else if (OrigMI
.isCommutable() &&
457 &Use
== TII
->getNamedOperand(OrigMI
, AMDGPU::OpName::src1
)) {
458 auto *BB
= OrigMI
.getParent();
459 auto *NewMI
= BB
->getParent()->CloneMachineInstr(&OrigMI
);
460 BB
->insert(OrigMI
, NewMI
);
461 if (TII
->commuteInstruction(*NewMI
)) {
462 LLVM_DEBUG(dbgs() << " commuted: " << *NewMI
);
463 if (auto *DPPInst
= createDPPInst(*NewMI
, MovMI
, CombOldVGPR
,
464 OldOpndValue
, CombBCZ
)) {
465 DPPMIs
.push_back(DPPInst
);
469 LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n");
// Discard the speculative clone when commuting/combining failed.
470 NewMI
->eraseFromParent();
472 LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
475 OrigMIs
.push_back(&OrigMI
);
// All-or-nothing: erase either the speculative DPP instructions (rollback)
// or the original instructions (success).
478 for (auto *MI
: *(Rollback
? &DPPMIs
: &OrigMIs
))
479 MI
->eraseFromParent();
484 bool GCNDPPCombine::runOnMachineFunction(MachineFunction
&MF
) {
485 auto &ST
= MF
.getSubtarget
<GCNSubtarget
>();
486 if (!ST
.hasDPP() || skipFunction(MF
.getFunction()))
489 MRI
= &MF
.getRegInfo();
490 TII
= ST
.getInstrInfo();
492 assert(MRI
->isSSA() && "Must be run on SSA");
494 bool Changed
= false;
495 for (auto &MBB
: MF
) {
496 for (auto I
= MBB
.rbegin(), E
= MBB
.rend(); I
!= E
;) {
498 if (MI
.getOpcode() == AMDGPU::V_MOV_B32_dpp
&& combineDPPMov(MI
)) {
500 ++NumDPPMovsCombined
;