//=== lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//
14 #include "AArch64GlobalISelUtils.h"
15 #include "AArch64TargetMachine.h"
16 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
17 #include "llvm/CodeGen/GlobalISel/Combiner.h"
18 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
19 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
20 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
21 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/MachineDominators.h"
24 #include "llvm/CodeGen/MachineFunction.h"
25 #include "llvm/CodeGen/MachineFunctionPass.h"
26 #include "llvm/CodeGen/MachineRegisterInfo.h"
27 #include "llvm/CodeGen/TargetPassConfig.h"
28 #include "llvm/IR/Instructions.h"
29 #include "llvm/Support/Debug.h"
31 #define DEBUG_TYPE "aarch64-prelegalizer-combiner"
33 using namespace llvm;
34 using namespace MIPatternMatch;
/// Return true if a G_FCONSTANT instruction is known to be better-represented
/// as a G_CONSTANT.
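///
/// A sketch with made-up register names (illustrative MIR only): a float
/// constant whose only uses are stores can be materialized on the GPR side,
/// e.g.
///
///   %f:_(s32) = G_FCONSTANT float 1.0
///   G_STORE %f(s32), %ptr(p0)
/// ->
///   %f:_(s32) = G_CONSTANT i32 1065353216   (the bit pattern of 1.0f)
///   G_STORE %f(s32), %ptr(p0)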
static bool matchFConstantToConstant(MachineInstr &MI,
                                     MachineRegisterInfo &MRI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  Register DstReg = MI.getOperand(0).getReg();
  const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
  if (DstSize != 32 && DstSize != 64)
    return false;

  // When we're storing a value, it doesn't matter what register bank it's on.
  // Since not all floating point constants can be materialized using a fmov,
  // it makes more sense to just use a GPR.
  return all_of(MRI.use_nodbg_instructions(DstReg),
                [](const MachineInstr &Use) { return Use.mayStore(); });
}
/// Change a G_FCONSTANT into a G_CONSTANT.
static void applyFConstantToConstant(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  MachineIRBuilder MIB(MI);
  const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF();
  MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt());
  MI.eraseFromParent();
}
/// Try to match a G_ICMP of a G_TRUNC against zero, where the truncated bits
/// are known to be sign bits. In this case, we can transform the G_ICMP to
/// compare the wide value directly against zero.
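///
/// A sketch with made-up register names (illustrative MIR only): when the 32
/// truncated-away bits of %wide are all sign bits,
///
///   %lhs:_(s32) = G_TRUNC %wide(s64)
///   %zero:_(s32) = G_CONSTANT i32 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %lhs(s32), %zero
/// can instead compare the wide value directly:
///   %wide_zero:_(s64) = G_CONSTANT i64 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %wide(s64), %wide_zero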
static bool matchICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                                    GISelKnownBits *KB, Register &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP && KB);

  auto Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
  if (!ICmpInst::isEquality(Pred))
    return false;

  Register LHS = MI.getOperand(2).getReg();
  LLT LHSTy = MRI.getType(LHS);
  if (!LHSTy.isScalar())
    return false;

  Register RHS = MI.getOperand(3).getReg();
  Register WideReg;

  if (!mi_match(LHS, MRI, m_GTrunc(m_Reg(WideReg))) ||
      !mi_match(RHS, MRI, m_SpecificICst(0)))
    return false;

  LLT WideTy = MRI.getType(WideReg);
  if (KB->computeNumSignBits(WideReg) <=
      WideTy.getSizeInBits() - LHSTy.getSizeInBits())
    return false;

  MatchInfo = WideReg;
  return true;
}
static bool applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                                    MachineIRBuilder &Builder,
                                    GISelChangeObserver &Observer,
                                    Register &WideReg) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP);

  LLT WideTy = MRI.getType(WideReg);
  // We're going to directly use the wide register as the LHS, and then use an
  // equivalent size zero for RHS.
  Builder.setInstrAndDebugLoc(MI);
  auto WideZero = Builder.buildConstant(WideTy, 0);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(WideReg);
  MI.getOperand(3).setReg(WideZero.getReg(0));
  Observer.changedInstr(MI);
  return true;
}
/// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE.
///
/// e.g.
///
/// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst
static bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  std::pair<uint64_t, uint64_t> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  MachineFunction &MF = *MI.getMF();
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return false;

  // Don't allow anything that could represent offsets etc.
  if (MF.getSubtarget<AArch64Subtarget>().ClassifyGlobalReference(
          GV, MF.getTarget()) != AArch64II::MO_NO_FLAG)
    return false;

  // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Identify the *smallest* constant. We want to be able to form this:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  Register Dst = MI.getOperand(0).getReg();
  uint64_t MinOffset = -1ull;
  for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) {
    if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD)
      return false;
    auto Cst = getIConstantVRegValWithLookThrough(
        UseInstr.getOperand(2).getReg(), MRI);
    if (!Cst)
      return false;
    MinOffset = std::min(MinOffset, Cst->Value.getZExtValue());
  }

  // Require that the new offset is larger than the existing one to avoid
  // infinite loops.
  uint64_t CurrOffset = GlobalOp.getOffset();
  uint64_t NewOffset = MinOffset + CurrOffset;
  if (NewOffset <= CurrOffset)
    return false;

  // Check whether folding this offset is legal. It must not go out of bounds of
  // the referenced object to avoid violating the code model, and must be
  // smaller than 2^21 because this is the largest offset expressible in all
  // object formats.
  //
  // This check also prevents us from folding negative offsets, which will end
  // up being treated in the same way as large positive ones. They could also
  // cause code model violations, and aren't really common enough to matter.
  if (NewOffset >= (1 << 21))
    return false;

  Type *T = GV->getValueType();
  if (!T->isSized() ||
      NewOffset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
    return false;
  MatchInfo = std::make_pair(NewOffset, MinOffset);
  return true;
}
static bool applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  MachineIRBuilder &B,
                                  GISelChangeObserver &Observer,
                                  std::pair<uint64_t, uint64_t> &MatchInfo) {
  // Change:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // To:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Then, the original G_PTR_ADDs should be folded later on so that they look
  // like this:
  //
  //  %ptrN = G_PTR_ADD %offset_g, cstN - min_cst
  uint64_t Offset, MinOffset;
  std::tie(Offset, MinOffset) = MatchInfo;
  B.setInstrAndDebugLoc(MI);
  Observer.changingInstr(MI);
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags());
  Register Dst = MI.getOperand(0).getReg();
  Register NewGVDst = MRI.cloneVirtualRegister(Dst);
  MI.getOperand(0).setReg(NewGVDst);
  Observer.changedInstr(MI);
  B.buildPtrAdd(
      Dst, NewGVDst,
      B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
  return true;
}
static bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                               CombinerHelper &Helper,
                               GISelChangeObserver &Observer) {
  // Try to simplify a G_UADDO with 8 or 16 bit operands to a wide G_ADD and a
  // TBNZ if the result is only used in the no-overflow case. This is
  // restricted to cases where we know that the high bits of the operands are
  // 0. If there's an overflow, then the 9th or 17th bit must be set, which can
  // be checked using TBNZ.
  //
  // Change (for UADDOs on 8 and 16 bits):
  //
  //   %z0 = G_ASSERT_ZEXT _
  //   %op0 = G_TRUNC %z0
  //   %z1 = G_ASSERT_ZEXT _
  //   %op1 = G_TRUNC %z1
  //   %val, %cond = G_UADDO %op0, %op1
  //   G_BRCOND %cond, %error.bb
  //
  // error.bb:
  //   (no successors and no uses of %val)
  //
  // To:
  //
  //   %z0 = G_ASSERT_ZEXT _
  //   %z1 = G_ASSERT_ZEXT _
  //   %add = G_ADD %z0, %z1
  //   %val = G_TRUNC %add
  //   %bit = G_AND %add, 1 << scalar-size-in-bits(%op1)
  //   %cond = G_ICMP NE, %bit, 0
  //   G_BRCOND %cond, %error.bb

255 auto &MRI = *B.getMRI();
257 MachineOperand *DefOp0 = MRI.getOneDef(MI.getOperand(2).getReg());
258 MachineOperand *DefOp1 = MRI.getOneDef(MI.getOperand(3).getReg());
259 Register Op0Wide;
260 Register Op1Wide;
261 if (!mi_match(DefOp0->getParent(), MRI, m_GTrunc(m_Reg(Op0Wide))) ||
262 !mi_match(DefOp1->getParent(), MRI, m_GTrunc(m_Reg(Op1Wide))))
263 return false;
264 LLT WideTy0 = MRI.getType(Op0Wide);
265 LLT WideTy1 = MRI.getType(Op1Wide);
266 Register ResVal = MI.getOperand(0).getReg();
267 LLT OpTy = MRI.getType(ResVal);
268 MachineInstr *Op0WideDef = MRI.getVRegDef(Op0Wide);
269 MachineInstr *Op1WideDef = MRI.getVRegDef(Op1Wide);
271 unsigned OpTySize = OpTy.getScalarSizeInBits();
272 // First check that the G_TRUNC feeding the G_UADDO are no-ops, because the
273 // inputs have been zero-extended.
274 if (Op0WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
275 Op1WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
276 OpTySize != Op0WideDef->getOperand(2).getImm() ||
277 OpTySize != Op1WideDef->getOperand(2).getImm())
278 return false;
280 // Only scalar UADDO with either 8 or 16 bit operands are handled.
281 if (!WideTy0.isScalar() || !WideTy1.isScalar() || WideTy0 != WideTy1 ||
282 OpTySize >= WideTy0.getScalarSizeInBits() ||
283 (OpTySize != 8 && OpTySize != 16))
284 return false;
286 // The overflow-status result must be used by a branch only.
287 Register ResStatus = MI.getOperand(1).getReg();
288 if (!MRI.hasOneNonDBGUse(ResStatus))
289 return false;
290 MachineInstr *CondUser = &*MRI.use_instr_nodbg_begin(ResStatus);
291 if (CondUser->getOpcode() != TargetOpcode::G_BRCOND)
292 return false;
294 // Make sure the computed result is only used in the no-overflow blocks.
295 MachineBasicBlock *CurrentMBB = MI.getParent();
296 MachineBasicBlock *FailMBB = CondUser->getOperand(1).getMBB();
297 if (!FailMBB->succ_empty() || CondUser->getParent() != CurrentMBB)
298 return false;
299 if (any_of(MRI.use_nodbg_instructions(ResVal),
300 [&MI, FailMBB, CurrentMBB](MachineInstr &I) {
301 return &MI != &I &&
302 (I.getParent() == FailMBB || I.getParent() == CurrentMBB);
304 return false;
306 // Remove G_ADDO.
307 B.setInstrAndDebugLoc(*MI.getNextNode());
308 MI.eraseFromParent();
310 // Emit wide add.
311 Register AddDst = MRI.cloneVirtualRegister(Op0Wide);
312 B.buildInstr(TargetOpcode::G_ADD, {AddDst}, {Op0Wide, Op1Wide});
314 // Emit check of the 9th or 17th bit and update users (the branch). This will
315 // later be folded to TBNZ.
316 Register CondBit = MRI.cloneVirtualRegister(Op0Wide);
317 B.buildAnd(
318 CondBit, AddDst,
319 B.buildConstant(LLT::scalar(32), OpTySize == 8 ? 1 << 8 : 1 << 16));
320 B.buildICmp(CmpInst::ICMP_NE, ResStatus, CondBit,
321 B.buildConstant(LLT::scalar(32), 0));
323 // Update ZEXts users of the result value. Because all uses are in the
324 // no-overflow case, we know that the top bits are 0 and we can ignore ZExts.
325 B.buildZExtOrTrunc(ResVal, AddDst);
326 for (MachineOperand &U : make_early_inc_range(MRI.use_operands(ResVal))) {
327 Register WideReg;
328 if (mi_match(U.getParent(), MRI, m_GZExt(m_Reg(WideReg)))) {
329 auto OldR = U.getParent()->getOperand(0).getReg();
330 Observer.erasingInstr(*U.getParent());
331 U.getParent()->eraseFromParent();
332 Helper.replaceRegWith(MRI, OldR, AddDst);
336 return true;
class AArch64PreLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;

public:
  AArch64PreLegalizerCombinerHelperState(CombinerHelper &Helper)
      : Helper(Helper) {}
};
#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
class AArch64PreLegalizerCombinerInfo : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;
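  // Enable/disable configuration for the TableGen-generated combine rules;
  // parseCommandLineOption() below allows individual rules to be toggled from
  // the command line.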
  AArch64GenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

public:
  AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
                       MachineIRBuilder &B) const override;
};
bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);
  AArch64GenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_CONCAT_VECTORS:
    return Helper.tryCombineConcatVectors(MI);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  case TargetOpcode::G_UADDO:
    return tryToSimplifyUADDO(MI, B, Helper, Observer);
  case TargetOpcode::G_MEMCPY_INLINE:
    return Helper.tryEmitMemcpyInline(MI);
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    // If we're at -O0, set a maxlen of 32 to inline; otherwise let the other
    // heuristics decide.
    unsigned MaxLen = EnableOpt ? 0 : 32;
    // Try to inline memcpy-family calls.
    if (Helper.tryCombineMemCpyFamily(MI, MaxLen))
      return true;
    if (Opc == TargetOpcode::G_MEMSET)
      return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, EnableMinSize);
    return false;
  }
  }

  return false;
}
#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AArch64PreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PreLegalizerCombiner();

  StringRef getPassName() const override {
    return "AArch64PreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
};
} // end anonymous namespace
void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  AU.addRequired<MachineDominatorTree>();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner()
    : MachineFunctionPass(ID) {
  initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto &TPC = getAnalysis<TargetPassConfig>();

  // Enable CSE.
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig());

  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
  AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), KB, MDT);
  Combiner C(PCInfo, &TPC);
  return C.combineMachineInstrs(MF, CSEInfo);
}

char AArch64PreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AArch64 machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AArch64 machine instrs before legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PreLegalizerCombiner() {
  return new AArch64PreLegalizerCombiner();
}
} // end namespace llvm