//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
// operand. If any of the use instructions cannot be combined with the mov the
// whole sequence is reverted.
//
// $old = ...
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
//                            dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
// $res = VALU $dpp_value [, src1]
//
// to
//
// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
//                 dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
//
// Combining rules:
//
// if $row_mask and $bank_mask are fully enabled (0xF) and
//    $bound_ctrl==DPP_BOUND_ZERO or $old==0
// -> $combined_old = undef,
//    $combined_bound_ctrl = DPP_BOUND_ZERO
//
// if the VALU op is binary and
//    $bound_ctrl==DPP_BOUND_OFF and
//    $old==identity value (immediate) for the VALU op
// -> $combined_old = src1,
//    $combined_bound_ctrl = DPP_BOUND_OFF
//
// Otherwise cancel.
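//
// For example (hypothetical registers, first rule with full masks and
// $bound_ctrl==DPP_BOUND_ZERO):
//
// $old = IMPLICIT_DEF
// $dpp = V_MOV_B32_dpp $old, $lane_src, dpp_controls..., 0xF, 0xF, DPP_BOUND_ZERO
// $res = V_ADD_U32_e32 $dpp, $src1
//
// to
//
// $res = V_ADD_U32_dpp undef, $lane_src, $src1, dpp_controls..., 0xF, 0xF,
//        DPP_BOUND_ZERO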
//
// The mov_dpp instruction should reside in the same BB as all its uses
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Pass.h"
#include <cassert>
#include <limits> // std::numeric_limits, used in isIdentityValue

using namespace llvm;

#define DEBUG_TYPE "gcn-dpp-combine"

STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");

namespace {

class GCNDPPCombine : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;

  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI,
                              MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              MachineOperand *OldOpnd,
                              bool CombBCZ) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI,
                              MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              bool CombBCZ) const;

  bool hasNoImmOrEqual(MachineInstr &MI,
                       unsigned OpndName,
                       int64_t Value,
                       int64_t Mask = -1) const;

  bool combineDPPMov(MachineInstr &MI) const;

public:
  static char ID;

  GCNDPPCombine() : MachineFunctionPass(ID) {
    initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN DPP Combine"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)

char GCNDPPCombine::ID = 0;

char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;

FunctionPass *llvm::createGCNDPPCombinePass() {
  return new GCNDPPCombine();
}
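
// Returns the DPP form of Op, trying the direct mapping first and then the
// 32-bit (e32) encoding; -1 if no DPP form exists. For example (illustrative,
// the mappings are TableGen-generated), a VOP3 V_ADD_U32_e64 would first be
// shrunk to V_ADD_U32_e32 and then mapped to its DPP variant.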
static int getDPPOp(unsigned Op) {
  auto DPP32 = AMDGPU::getDPPOp32(Op);
  if (DPP32 != -1)
    return DPP32;

  auto E32 = AMDGPU::getVOPe32(Op);
  return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1;
}

// tracks the register operand definition and returns:
// 1. immediate operand used to initialize the register if found
// 2. nullptr if the register operand is undef
// 3. the operand itself otherwise
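// For example (hypothetical virtual registers):
//   %old = V_MOV_B32_e32 0       -> the immediate operand (0)
//   %old = IMPLICIT_DEF          -> nullptr
//   %old = V_ADD_U32_e32 %a, %b  -> &OldOpnd itself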
MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
  auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
  if (!Def)
    return nullptr;

  switch(Def->getOpcode()) {
  default: break;
  case AMDGPU::IMPLICIT_DEF:
    return nullptr;
  case AMDGPU::COPY:
  case AMDGPU::V_MOV_B32_e32: {
    auto &Op1 = Def->getOperand(1);
    if (Op1.isImm())
      return &Op1;
    break;
  }
  }
  return &OldOpnd;
}

MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair CombOldVGPR,
                                           bool CombBCZ) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
  assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
         TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());

  auto OrigOp = OrigMI.getOpcode();
  auto DPPOp = getDPPOp(OrigOp);
  if (DPPOp == -1) {
    LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
    return nullptr;
  }

  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
                         OrigMI.getDebugLoc(), TII->get(DPPOp));
  bool Fail = false;
  do {
    auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
    assert(Dst);
    DPPInst.add(*Dst);
    int NumOperands = 1;

    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
    if (OldIdx != -1) {
      assert(OldIdx == NumOperands);
      assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
      DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg);
      ++NumOperands;
    } else {
      // TODO: this discards MAC/FMA instructions for now, let's add it later
      LLVM_DEBUG(dbgs() << " failed: no old operand in DPP instruction,"
                           " TBD\n");
      Fail = true;
      break;
    }

    if (auto *Mod0 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src0_modifiers)) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src0_modifiers));
      assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
      DPPInst.addImm(Mod0->getImm());
      ++NumOperands;
    }
    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
    assert(Src0);
    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
      LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
      Fail = true;
      break;
    }
    DPPInst.add(*Src0);
    DPPInst->getOperand(NumOperands).setIsKill(false);
    ++NumOperands;

    if (auto *Mod1 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src1_modifiers)) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src1_modifiers));
      assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
      DPPInst.addImm(Mod1->getImm());
      ++NumOperands;
    }
    if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
        LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src1);
      ++NumOperands;
    }

    if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
        LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src2);
    }

    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
    DPPInst.addImm(CombBCZ ? 1 : 0);
  } while (false);

  if (Fail) {
    DPPInst.getInstr()->eraseFromParent();
    return nullptr;
  }
  LLVM_DEBUG(dbgs() << " combined: " << *DPPInst.getInstr());
  return DPPInst.getInstr();
}

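// Checks whether the immediate is an identity for the VALU op, i.e. combining
// it with any value x leaves x unchanged: e.g. 0 for add/or/xor/umax/subrev,
// all-ones for and/umin, INT32_MAX for smin, INT32_MIN for smax, and 1 for
// the 24-bit multiplies (see the second combining rule in the file header).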
static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
  assert(OldOpnd->isImm());
  switch (OrigMIOp) {
  default: break;
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_I32_e32:
  case AMDGPU::V_ADD_I32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_SUBREV_U32_e32:
  case AMDGPU::V_SUBREV_U32_e64:
  case AMDGPU::V_SUBREV_I32_e32:
  case AMDGPU::V_SUBREV_I32_e64:
  case AMDGPU::V_MAX_U32_e32:
  case AMDGPU::V_MAX_U32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::V_XOR_B32_e64:
    if (OldOpnd->getImm() == 0)
      return true;
    break;
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_MIN_U32_e32:
  case AMDGPU::V_MIN_U32_e64:
    if (static_cast<uint32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<uint32_t>::max())
      return true;
    break;
  case AMDGPU::V_MIN_I32_e32:
  case AMDGPU::V_MIN_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::max())
      return true;
    break;
  case AMDGPU::V_MAX_I32_e32:
  case AMDGPU::V_MAX_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::min())
      return true;
    break;
  case AMDGPU::V_MUL_I32_I24_e32:
  case AMDGPU::V_MUL_I32_I24_e64:
  case AMDGPU::V_MUL_U32_U24_e32:
  case AMDGPU::V_MUL_U32_U24_e64:
    if (OldOpnd->getImm() == 1)
      return true;
    break;
  }
  return false;
}

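// Handles the second combining rule from the file header: if old is a known
// identity immediate for the use, src1 can serve as the combined old value.
// In the original sequence, lanes with an invalid DPP source get $old from
// the mov and then compute op(identity, src1) == src1, which is what the
// combined instruction produces by writing $combined_old = src1 to those
// lanes (a reading of the rule; the checks below enforce its preconditions).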
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair CombOldVGPR,
                                           MachineOperand *OldOpndValue,
                                           bool CombBCZ) const {
  assert(CombOldVGPR.Reg);
  if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (!Src1 || !Src1->isReg()) {
      LLVM_DEBUG(dbgs() << " failed: no src1 or it isn't a register\n");
      return nullptr;
    }
    if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
      LLVM_DEBUG(dbgs() << " failed: old immediate isn't an identity\n");
      return nullptr;
    }
    CombOldVGPR = getRegSubRegPair(*Src1);
    if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) {
      LLVM_DEBUG(dbgs() << " failed: src1 isn't a VGPR32 register\n");
      return nullptr;
    }
  }
  return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ);
}

// returns true if MI doesn't have OpndName immediate operand or the operand's
// value (masked with Mask) equals Value
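// e.g. hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) accepts MI that either
// has no clamp operand or has it set to 0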
bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
                                    int64_t Value, int64_t Mask) const {
  auto *Imm = TII->getNamedOperand(MI, OpndName);
  if (!Imm)
    return true;

  assert(Imm->isImm());
  return (Imm->getImm() & Mask) == Value;
}

bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

  auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
  assert(DstOpnd && DstOpnd->isReg());
  auto DPPMovReg = DstOpnd->getReg();
  if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
    LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
                         " for all uses\n");
    return false;
  }

  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
                            BankMaskOpnd->getImm() == 0xF;

  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
  assert(BCZOpnd && BCZOpnd->isImm());
  bool BoundCtrlZero = BCZOpnd->getImm();

  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  assert(OldOpnd && OldOpnd->isReg());

  auto * const OldOpndValue = getOldOpndValue(*OldOpnd);
  // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else
  // We could use: assert(!OldOpndValue || OldOpndValue->isImm())
  // but the third option is used to distinguish undef from non-immediate
  // to reuse the IMPLICIT_DEF instruction later
  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
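
  // Decide the combined bound_ctrl per the rules in the file header:
  //   MaskAllLanes && BoundCtrlZero -> CombBCZ = true, old becomes undef [1]
  //   MaskAllLanes && old == 0      -> CombBCZ = true, old becomes undef
  //   otherwise old must be an identity immediate (checked in createDPPInst)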
  bool CombBCZ = false;

  if (MaskAllLanes && BoundCtrlZero) { // [1]
    CombBCZ = true;
  } else {
    if (!OldOpndValue || !OldOpndValue->isImm()) {
      LLVM_DEBUG(dbgs() << " failed: the DPP mov isn't combinable\n");
      return false;
    }

    if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) {
      LLVM_DEBUG(dbgs() <<
        " failed: old reg def and mov should be in the same BB\n");
      return false;
    }

    if (OldOpndValue->getImm() == 0) {
      if (MaskAllLanes) {
        assert(!BoundCtrlZero); // by check [1]
        CombBCZ = true;
      }
    } else if (BoundCtrlZero) {
      assert(!MaskAllLanes); // by check [1]
      LLVM_DEBUG(dbgs() <<
        " failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << " old=";
    if (!OldOpndValue)
      dbgs() << "undef";
    else
      dbgs() << *OldOpndValue;
    dbgs() << ", bound_ctrl=" << CombBCZ << '\n');

  SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
  auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
  // try to reuse previous old reg if it's undefined (IMPLICIT_DEF)
  if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
    CombOldVGPR = RegSubRegPair(
      MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  }

  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
  for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
    Rollback = true;

    auto &OrigMI = *Use.getParent();
    LLVM_DEBUG(dbgs() << " try: " << OrigMI);

    auto OrigOp = OrigMI.getOpcode();
    if (TII->isVOP3(OrigOp)) {
      if (!TII->hasVALU32BitEncoding(OrigOp)) {
        LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n");
        break;
      }
      // check if other than abs|neg modifiers are set (opsel for example)
      const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
      if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
        LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n");
        break;
      }
    } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
      LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");
      break;
    }

    LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
    if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
                                        OldOpndValue, CombBCZ)) {
        DPPMIs.push_back(DPPInst);
        Rollback = false;
      }
    } else if (OrigMI.isCommutable() &&
               &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
      auto *BB = OrigMI.getParent();
      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        LLVM_DEBUG(dbgs() << " commuted: " << *NewMI);
        if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR,
                                          OldOpndValue, CombBCZ)) {
          DPPMIs.push_back(DPPInst);
          Rollback = false;
        }
      } else
        LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n");
      NewMI->eraseFromParent();
    } else
      LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
    if (Rollback)
      break;
    OrigMIs.push_back(&OrigMI);
  }

  for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
    MI->eraseFromParent();

  return !Rollback;
}

bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
  auto &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.hasDPP() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TII = ST.getInstrInfo();

  assert(MRI->isSSA() && "Must be run on SSA");

  bool Changed = false;
  for (auto &MBB : MF) {
    for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
      auto &MI = *I++;
      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
        Changed = true;
        ++NumDPPMovsCombined;
      }
    }
  }
  return Changed;
}