[Codegen] Alter the default promotion for saturating adds and subs
[llvm-complete.git] / lib / Target / AMDGPU / GCNDPPCombine.cpp
blob954058592d659b7d2fd990c5a2df0ce1b2363c4a
1 //=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
9 // operand. If any of the use instructions cannot be combined with the mov, the
10 // whole sequence is reverted.
12 // $old = ...
13 // $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
14 // dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
15 // $res = VALU $dpp_value [, src1]
17 // to
19 // $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
20 // dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
22 // Combining rules :
24 // if $row_mask and $bank_mask are fully enabled (0xF) and
25 // $bound_ctrl==DPP_BOUND_ZERO or $old==0
26 // -> $combined_old = undef,
27 // $combined_bound_ctrl = DPP_BOUND_ZERO
29 // if the VALU op is binary and
30 // $bound_ctrl==DPP_BOUND_OFF and
31 // $old==identity value (immediate) for the VALU op
32 // -> $combined_old = src1,
33 // $combined_bound_ctrl = DPP_BOUND_OFF
35 // Otherwise cancel.
37 // The mov_dpp instruction should reside in the same BB as all its uses
38 //===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Pass.h"
#include <cassert>
#include <cstdint>
#include <limits>
57 using namespace llvm;
59 #define DEBUG_TYPE "gcn-dpp-combine"
61 STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
63 namespace {
65 class GCNDPPCombine : public MachineFunctionPass {
66 MachineRegisterInfo *MRI;
67 const SIInstrInfo *TII;
69 using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
71 MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;
73 MachineInstr *createDPPInst(MachineInstr &OrigMI,
74 MachineInstr &MovMI,
75 RegSubRegPair CombOldVGPR,
76 MachineOperand *OldOpnd,
77 bool CombBCZ) const;
79 MachineInstr *createDPPInst(MachineInstr &OrigMI,
80 MachineInstr &MovMI,
81 RegSubRegPair CombOldVGPR,
82 bool CombBCZ) const;
84 bool hasNoImmOrEqual(MachineInstr &MI,
85 unsigned OpndName,
86 int64_t Value,
87 int64_t Mask = -1) const;
89 bool combineDPPMov(MachineInstr &MI) const;
91 public:
92 static char ID;
94 GCNDPPCombine() : MachineFunctionPass(ID) {
95 initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
98 bool runOnMachineFunction(MachineFunction &MF) override;
100 StringRef getPassName() const override { return "GCN DPP Combine"; }
102 void getAnalysisUsage(AnalysisUsage &AU) const override {
103 AU.setPreservesCFG();
104 MachineFunctionPass::getAnalysisUsage(AU);
108 } // end anonymous namespace
110 INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)
112 char GCNDPPCombine::ID = 0;
114 char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;
116 FunctionPass *llvm::createGCNDPPCombinePass() {
117 return new GCNDPPCombine();
120 static int getDPPOp(unsigned Op) {
121 auto DPP32 = AMDGPU::getDPPOp32(Op);
122 if (DPP32 != -1)
123 return DPP32;
125 auto E32 = AMDGPU::getVOPe32(Op);
126 return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1;
129 // tracks the register operand definition and returns:
130 // 1. immediate operand used to initialize the register if found
131 // 2. nullptr if the register operand is undef
132 // 3. the operand itself otherwise
133 MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
134 auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
135 if (!Def)
136 return nullptr;
138 switch(Def->getOpcode()) {
139 default: break;
140 case AMDGPU::IMPLICIT_DEF:
141 return nullptr;
142 case AMDGPU::COPY:
143 case AMDGPU::V_MOV_B32_e32: {
144 auto &Op1 = Def->getOperand(1);
145 if (Op1.isImm())
146 return &Op1;
147 break;
150 return &OldOpnd;
153 MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
154 MachineInstr &MovMI,
155 RegSubRegPair CombOldVGPR,
156 bool CombBCZ) const {
157 assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
158 assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
159 TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());
161 auto OrigOp = OrigMI.getOpcode();
162 auto DPPOp = getDPPOp(OrigOp);
163 if (DPPOp == -1) {
164 LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
165 return nullptr;
168 auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
169 OrigMI.getDebugLoc(), TII->get(DPPOp));
170 bool Fail = false;
171 do {
172 auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
173 assert(Dst);
174 DPPInst.add(*Dst);
175 int NumOperands = 1;
177 const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
178 if (OldIdx != -1) {
179 assert(OldIdx == NumOperands);
180 assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
181 DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg);
182 ++NumOperands;
183 } else {
184 // TODO: this discards MAC/FMA instructions for now, let's add it later
185 LLVM_DEBUG(dbgs() << " failed: no old operand in DPP instruction,"
186 " TBD\n");
187 Fail = true;
188 break;
191 if (auto *Mod0 = TII->getNamedOperand(OrigMI,
192 AMDGPU::OpName::src0_modifiers)) {
193 assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
194 AMDGPU::OpName::src0_modifiers));
195 assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
196 DPPInst.addImm(Mod0->getImm());
197 ++NumOperands;
198 } else if (AMDGPU::getNamedOperandIdx(DPPOp,
199 AMDGPU::OpName::src0_modifiers) != -1) {
200 DPPInst.addImm(0);
201 ++NumOperands;
203 auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
204 assert(Src0);
205 if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
206 LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
207 Fail = true;
208 break;
210 DPPInst.add(*Src0);
211 DPPInst->getOperand(NumOperands).setIsKill(false);
212 ++NumOperands;
214 if (auto *Mod1 = TII->getNamedOperand(OrigMI,
215 AMDGPU::OpName::src1_modifiers)) {
216 assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
217 AMDGPU::OpName::src1_modifiers));
218 assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
219 DPPInst.addImm(Mod1->getImm());
220 ++NumOperands;
221 } else if (AMDGPU::getNamedOperandIdx(DPPOp,
222 AMDGPU::OpName::src1_modifiers) != -1) {
223 DPPInst.addImm(0);
224 ++NumOperands;
226 if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
227 if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
228 LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
229 Fail = true;
230 break;
232 DPPInst.add(*Src1);
233 ++NumOperands;
236 if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
237 if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
238 LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n");
239 Fail = true;
240 break;
242 DPPInst.add(*Src2);
245 DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
246 DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
247 DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
248 DPPInst.addImm(CombBCZ ? 1 : 0);
249 } while (false);
251 if (Fail) {
252 DPPInst.getInstr()->eraseFromParent();
253 return nullptr;
255 LLVM_DEBUG(dbgs() << " combined: " << *DPPInst.getInstr());
256 return DPPInst.getInstr();
259 static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
260 assert(OldOpnd->isImm());
261 switch (OrigMIOp) {
262 default: break;
263 case AMDGPU::V_ADD_U32_e32:
264 case AMDGPU::V_ADD_U32_e64:
265 case AMDGPU::V_ADD_I32_e32:
266 case AMDGPU::V_ADD_I32_e64:
267 case AMDGPU::V_OR_B32_e32:
268 case AMDGPU::V_OR_B32_e64:
269 case AMDGPU::V_SUBREV_U32_e32:
270 case AMDGPU::V_SUBREV_U32_e64:
271 case AMDGPU::V_SUBREV_I32_e32:
272 case AMDGPU::V_SUBREV_I32_e64:
273 case AMDGPU::V_MAX_U32_e32:
274 case AMDGPU::V_MAX_U32_e64:
275 case AMDGPU::V_XOR_B32_e32:
276 case AMDGPU::V_XOR_B32_e64:
277 if (OldOpnd->getImm() == 0)
278 return true;
279 break;
280 case AMDGPU::V_AND_B32_e32:
281 case AMDGPU::V_AND_B32_e64:
282 case AMDGPU::V_MIN_U32_e32:
283 case AMDGPU::V_MIN_U32_e64:
284 if (static_cast<uint32_t>(OldOpnd->getImm()) ==
285 std::numeric_limits<uint32_t>::max())
286 return true;
287 break;
288 case AMDGPU::V_MIN_I32_e32:
289 case AMDGPU::V_MIN_I32_e64:
290 if (static_cast<int32_t>(OldOpnd->getImm()) ==
291 std::numeric_limits<int32_t>::max())
292 return true;
293 break;
294 case AMDGPU::V_MAX_I32_e32:
295 case AMDGPU::V_MAX_I32_e64:
296 if (static_cast<int32_t>(OldOpnd->getImm()) ==
297 std::numeric_limits<int32_t>::min())
298 return true;
299 break;
300 case AMDGPU::V_MUL_I32_I24_e32:
301 case AMDGPU::V_MUL_I32_I24_e64:
302 case AMDGPU::V_MUL_U32_U24_e32:
303 case AMDGPU::V_MUL_U32_U24_e64:
304 if (OldOpnd->getImm() == 1)
305 return true;
306 break;
308 return false;
311 MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
312 MachineInstr &MovMI,
313 RegSubRegPair CombOldVGPR,
314 MachineOperand *OldOpndValue,
315 bool CombBCZ) const {
316 assert(CombOldVGPR.Reg);
317 if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
318 auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
319 if (!Src1 || !Src1->isReg()) {
320 LLVM_DEBUG(dbgs() << " failed: no src1 or it isn't a register\n");
321 return nullptr;
323 if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
324 LLVM_DEBUG(dbgs() << " failed: old immediate isn't an identity\n");
325 return nullptr;
327 CombOldVGPR = getRegSubRegPair(*Src1);
328 if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) {
329 LLVM_DEBUG(dbgs() << " failed: src1 isn't a VGPR32 register\n");
330 return nullptr;
333 return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ);
336 // returns true if MI doesn't have OpndName immediate operand or the
337 // operand has Value
338 bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
339 int64_t Value, int64_t Mask) const {
340 auto *Imm = TII->getNamedOperand(MI, OpndName);
341 if (!Imm)
342 return true;
344 assert(Imm->isImm());
345 return (Imm->getImm() & Mask) == Value;
348 bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
349 assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
350 LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
352 auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
353 assert(DstOpnd && DstOpnd->isReg());
354 auto DPPMovReg = DstOpnd->getReg();
355 if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
356 LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
357 " for all uses\n");
358 return false;
361 auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
362 assert(RowMaskOpnd && RowMaskOpnd->isImm());
363 auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
364 assert(BankMaskOpnd && BankMaskOpnd->isImm());
365 const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
366 BankMaskOpnd->getImm() == 0xF;
368 auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
369 assert(BCZOpnd && BCZOpnd->isImm());
370 bool BoundCtrlZero = BCZOpnd->getImm();
372 auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
373 assert(OldOpnd && OldOpnd->isReg());
375 auto * const OldOpndValue = getOldOpndValue(*OldOpnd);
376 // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else
377 // We could use: assert(!OldOpndValue || OldOpndValue->isImm())
378 // but the third option is used to distinguish undef from non-immediate
379 // to reuse IMPLICIT_DEF instruction later
380 assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
382 bool CombBCZ = false;
384 if (MaskAllLanes && BoundCtrlZero) { // [1]
385 CombBCZ = true;
386 } else {
387 if (!OldOpndValue || !OldOpndValue->isImm()) {
388 LLVM_DEBUG(dbgs() << " failed: the DPP mov isn't combinable\n");
389 return false;
392 if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) {
393 LLVM_DEBUG(dbgs() <<
394 " failed: old reg def and mov should be in the same BB\n");
395 return false;
398 if (OldOpndValue->getImm() == 0) {
399 if (MaskAllLanes) {
400 assert(!BoundCtrlZero); // by check [1]
401 CombBCZ = true;
403 } else if (BoundCtrlZero) {
404 assert(!MaskAllLanes); // by check [1]
405 LLVM_DEBUG(dbgs() <<
406 " failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
407 return false;
411 LLVM_DEBUG(dbgs() << " old=";
412 if (!OldOpndValue)
413 dbgs() << "undef";
414 else
415 dbgs() << *OldOpndValue;
416 dbgs() << ", bound_ctrl=" << CombBCZ << '\n');
418 SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
419 auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
420 // try to reuse previous old reg if its undefined (IMPLICIT_DEF)
421 if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
422 CombOldVGPR = RegSubRegPair(
423 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
424 auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
425 TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
426 DPPMIs.push_back(UndefInst.getInstr());
429 OrigMIs.push_back(&MovMI);
430 bool Rollback = true;
431 for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
432 Rollback = true;
434 auto &OrigMI = *Use.getParent();
435 LLVM_DEBUG(dbgs() << " try: " << OrigMI);
437 auto OrigOp = OrigMI.getOpcode();
438 if (TII->isVOP3(OrigOp)) {
439 if (!TII->hasVALU32BitEncoding(OrigOp)) {
440 LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n");
441 break;
443 // check if other than abs|neg modifiers are set (opsel for example)
444 const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
445 if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
446 !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
447 !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
448 !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
449 LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n");
450 break;
452 } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
453 LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");
454 break;
457 LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
458 if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
459 if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
460 OldOpndValue, CombBCZ)) {
461 DPPMIs.push_back(DPPInst);
462 Rollback = false;
464 } else if (OrigMI.isCommutable() &&
465 &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
466 auto *BB = OrigMI.getParent();
467 auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
468 BB->insert(OrigMI, NewMI);
469 if (TII->commuteInstruction(*NewMI)) {
470 LLVM_DEBUG(dbgs() << " commuted: " << *NewMI);
471 if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR,
472 OldOpndValue, CombBCZ)) {
473 DPPMIs.push_back(DPPInst);
474 Rollback = false;
476 } else
477 LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n");
478 NewMI->eraseFromParent();
479 } else
480 LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
481 if (Rollback)
482 break;
483 OrigMIs.push_back(&OrigMI);
486 for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
487 MI->eraseFromParent();
489 return !Rollback;
492 bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
493 auto &ST = MF.getSubtarget<GCNSubtarget>();
494 if (!ST.hasDPP() || skipFunction(MF.getFunction()))
495 return false;
497 MRI = &MF.getRegInfo();
498 TII = ST.getInstrInfo();
500 assert(MRI->isSSA() && "Must be run on SSA");
502 bool Changed = false;
503 for (auto &MBB : MF) {
504 for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
505 auto &MI = *I++;
506 if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
507 Changed = true;
508 ++NumDPPMovsCombined;
512 return Changed;