1 //=== lib/CodeGen/GlobalISel/AMDGPURegBankCombiner.cpp ---------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This pass does combining of machine instructions at the generic MI level,
10 // after register banks are known.
12 //===----------------------------------------------------------------------===//
15 #include "AMDGPULegalizerInfo.h"
16 #include "AMDGPURegisterBankInfo.h"
17 #include "GCNSubtarget.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/CodeGen/GlobalISel/Combiner.h"
21 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
23 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
24 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/MachineDominators.h"
27 #include "llvm/CodeGen/TargetPassConfig.h"
28 #include "llvm/Target/TargetMachine.h"
30 #define GET_GICOMBINER_DEPS
31 #include "AMDGPUGenPreLegalizeGICombiner.inc"
32 #undef GET_GICOMBINER_DEPS
34 #define DEBUG_TYPE "amdgpu-regbank-combiner"
37 using namespace MIPatternMatch
;
40 #define GET_GICOMBINER_TYPES
41 #include "AMDGPUGenRegBankGICombiner.inc"
42 #undef GET_GICOMBINER_TYPES
44 class AMDGPURegBankCombinerImpl
: public Combiner
{
46 const AMDGPURegBankCombinerImplRuleConfig
&RuleConfig
;
47 const GCNSubtarget
&STI
;
48 const RegisterBankInfo
&RBI
;
49 const TargetRegisterInfo
&TRI
;
50 const SIInstrInfo
&TII
;
51 // TODO: Make CombinerHelper methods const.
52 mutable CombinerHelper Helper
;
55 AMDGPURegBankCombinerImpl(
56 MachineFunction
&MF
, CombinerInfo
&CInfo
, const TargetPassConfig
*TPC
,
57 GISelKnownBits
&KB
, GISelCSEInfo
*CSEInfo
,
58 const AMDGPURegBankCombinerImplRuleConfig
&RuleConfig
,
59 const GCNSubtarget
&STI
, MachineDominatorTree
*MDT
,
60 const LegalizerInfo
*LI
);
62 static const char *getName() { return "AMDGPURegBankCombinerImpl"; }
64 bool tryCombineAll(MachineInstr
&I
) const override
;
66 bool isVgprRegBank(Register Reg
) const;
67 Register
getAsVgpr(Register Reg
) const;
70 unsigned Min
, Max
, Med
;
73 struct Med3MatchInfo
{
75 Register Val0
, Val1
, Val2
;
78 MinMaxMedOpc
getMinMaxPair(unsigned Opc
) const;
80 template <class m_Cst
, typename CstTy
>
81 bool matchMed(MachineInstr
&MI
, MachineRegisterInfo
&MRI
, MinMaxMedOpc MMMOpc
,
82 Register
&Val
, CstTy
&K0
, CstTy
&K1
) const;
84 bool matchIntMinMaxToMed3(MachineInstr
&MI
, Med3MatchInfo
&MatchInfo
) const;
85 bool matchFPMinMaxToMed3(MachineInstr
&MI
, Med3MatchInfo
&MatchInfo
) const;
86 bool matchFPMinMaxToClamp(MachineInstr
&MI
, Register
&Reg
) const;
87 bool matchFPMed3ToClamp(MachineInstr
&MI
, Register
&Reg
) const;
88 void applyMed3(MachineInstr
&MI
, Med3MatchInfo
&MatchInfo
) const;
89 void applyClamp(MachineInstr
&MI
, Register
&Reg
) const;
92 SIModeRegisterDefaults
getMode() const;
94 bool getDX10Clamp() const;
95 bool isFminnumIeee(const MachineInstr
&MI
) const;
96 bool isFCst(MachineInstr
*MI
) const;
97 bool isClampZeroToOne(MachineInstr
*K0
, MachineInstr
*K1
) const;
99 #define GET_GICOMBINER_CLASS_MEMBERS
100 #define AMDGPUSubtarget GCNSubtarget
101 #include "AMDGPUGenRegBankGICombiner.inc"
102 #undef GET_GICOMBINER_CLASS_MEMBERS
103 #undef AMDGPUSubtarget
106 #define GET_GICOMBINER_IMPL
107 #define AMDGPUSubtarget GCNSubtarget
108 #include "AMDGPUGenRegBankGICombiner.inc"
109 #undef AMDGPUSubtarget
110 #undef GET_GICOMBINER_IMPL
112 AMDGPURegBankCombinerImpl::AMDGPURegBankCombinerImpl(
113 MachineFunction
&MF
, CombinerInfo
&CInfo
, const TargetPassConfig
*TPC
,
114 GISelKnownBits
&KB
, GISelCSEInfo
*CSEInfo
,
115 const AMDGPURegBankCombinerImplRuleConfig
&RuleConfig
,
116 const GCNSubtarget
&STI
, MachineDominatorTree
*MDT
, const LegalizerInfo
*LI
)
117 : Combiner(MF
, CInfo
, TPC
, &KB
, CSEInfo
), RuleConfig(RuleConfig
), STI(STI
),
118 RBI(*STI
.getRegBankInfo()), TRI(*STI
.getRegisterInfo()),
119 TII(*STI
.getInstrInfo()),
120 Helper(Observer
, B
, /*IsPreLegalize*/ false, &KB
, MDT
, LI
),
121 #define GET_GICOMBINER_CONSTRUCTOR_INITS
122 #include "AMDGPUGenRegBankGICombiner.inc"
123 #undef GET_GICOMBINER_CONSTRUCTOR_INITS
127 bool AMDGPURegBankCombinerImpl::isVgprRegBank(Register Reg
) const {
128 return RBI
.getRegBank(Reg
, MRI
, TRI
)->getID() == AMDGPU::VGPRRegBankID
;
131 Register
AMDGPURegBankCombinerImpl::getAsVgpr(Register Reg
) const {
132 if (isVgprRegBank(Reg
))
135 // Search for existing copy of Reg to vgpr.
136 for (MachineInstr
&Use
: MRI
.use_instructions(Reg
)) {
137 Register Def
= Use
.getOperand(0).getReg();
138 if (Use
.getOpcode() == AMDGPU::COPY
&& isVgprRegBank(Def
))
143 Register VgprReg
= B
.buildCopy(MRI
.getType(Reg
), Reg
).getReg(0);
144 MRI
.setRegBank(VgprReg
, RBI
.getRegBank(AMDGPU::VGPRRegBankID
));
148 AMDGPURegBankCombinerImpl::MinMaxMedOpc
149 AMDGPURegBankCombinerImpl::getMinMaxPair(unsigned Opc
) const {
152 llvm_unreachable("Unsupported opcode");
155 return {AMDGPU::G_SMIN
, AMDGPU::G_SMAX
, AMDGPU::G_AMDGPU_SMED3
};
158 return {AMDGPU::G_UMIN
, AMDGPU::G_UMAX
, AMDGPU::G_AMDGPU_UMED3
};
159 case AMDGPU::G_FMAXNUM
:
160 case AMDGPU::G_FMINNUM
:
161 return {AMDGPU::G_FMINNUM
, AMDGPU::G_FMAXNUM
, AMDGPU::G_AMDGPU_FMED3
};
162 case AMDGPU::G_FMAXNUM_IEEE
:
163 case AMDGPU::G_FMINNUM_IEEE
:
164 return {AMDGPU::G_FMINNUM_IEEE
, AMDGPU::G_FMAXNUM_IEEE
,
165 AMDGPU::G_AMDGPU_FMED3
};
169 template <class m_Cst
, typename CstTy
>
170 bool AMDGPURegBankCombinerImpl::matchMed(MachineInstr
&MI
,
171 MachineRegisterInfo
&MRI
,
172 MinMaxMedOpc MMMOpc
, Register
&Val
,
173 CstTy
&K0
, CstTy
&K1
) const {
174 // 4 operand commutes of: min(max(Val, K0), K1).
175 // Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)).
176 // Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0).
177 // 4 operand commutes of: max(min(Val, K1), K0).
178 // Find K0 from outer instr: max(min(...), K0) or max(K0, min(...)).
179 // Find K1 and Val from inner instr: min(K1, Val) or min(Val, K1).
184 MMMOpc
.Min
, m_CommutativeBinOp(MMMOpc
.Max
, m_Reg(Val
), m_Cst(K0
)),
187 MMMOpc
.Max
, m_CommutativeBinOp(MMMOpc
.Min
, m_Reg(Val
), m_Cst(K1
)),
191 bool AMDGPURegBankCombinerImpl::matchIntMinMaxToMed3(
192 MachineInstr
&MI
, Med3MatchInfo
&MatchInfo
) const {
193 Register Dst
= MI
.getOperand(0).getReg();
194 if (!isVgprRegBank(Dst
))
197 // med3 for i16 is only available on gfx9+, and not available for v2i16.
198 LLT Ty
= MRI
.getType(Dst
);
199 if ((Ty
!= LLT::scalar(16) || !STI
.hasMed3_16()) && Ty
!= LLT::scalar(32))
202 MinMaxMedOpc OpcodeTriple
= getMinMaxPair(MI
.getOpcode());
204 std::optional
<ValueAndVReg
> K0
, K1
;
205 // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
206 if (!matchMed
<GCstAndRegMatch
>(MI
, MRI
, OpcodeTriple
, Val
, K0
, K1
))
209 if (OpcodeTriple
.Med
== AMDGPU::G_AMDGPU_SMED3
&& K0
->Value
.sgt(K1
->Value
))
211 if (OpcodeTriple
.Med
== AMDGPU::G_AMDGPU_UMED3
&& K0
->Value
.ugt(K1
->Value
))
214 MatchInfo
= {OpcodeTriple
.Med
, Val
, K0
->VReg
, K1
->VReg
};
// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1)
// ieee = true  : min/max(SNaN, K) = QNaN, min/max(QNaN, K) = K
// ieee = false : min/max(NaN, K) = K
// clamp(NaN) = dx10_clamp ? 0.0 : NaN
// Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input.
// Other operand commutes (see matchMed) give same result since min and max are
// commutative.

// Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), K0<=K1
// with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0.
// Val = SNaN only for ieee = true
// fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1
// min(max(SNaN, K0), K1) = min(QNaN, K1) = K1
// max(min(SNaN, K1), K0) = max(K1, K0) = K1
// Val = NaN,ieee = false or Val = QNaN,ieee = true
// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0
// min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true)
// max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0
236 bool AMDGPURegBankCombinerImpl::matchFPMinMaxToMed3(
237 MachineInstr
&MI
, Med3MatchInfo
&MatchInfo
) const {
238 Register Dst
= MI
.getOperand(0).getReg();
239 LLT Ty
= MRI
.getType(Dst
);
241 // med3 for f16 is only available on gfx9+, and not available for v2f16.
242 if ((Ty
!= LLT::scalar(16) || !STI
.hasMed3_16()) && Ty
!= LLT::scalar(32))
245 auto OpcodeTriple
= getMinMaxPair(MI
.getOpcode());
248 std::optional
<FPValueAndVReg
> K0
, K1
;
249 // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
250 if (!matchMed
<GFCstAndRegMatch
>(MI
, MRI
, OpcodeTriple
, Val
, K0
, K1
))
253 if (K0
->Value
> K1
->Value
)
256 // For IEEE=false perform combine only when it's safe to assume that there are
257 // no NaN inputs. Most often MI is marked with nnan fast math flag.
258 // For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to
259 // min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner
260 // nodes(max/min) have same behavior when one input is NaN and other isn't.
261 // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN,
262 // also post-legalizer inputs to min/max are fcanonicalized (never SNaN).
263 if ((getIEEE() && isFminnumIeee(MI
)) || isKnownNeverNaN(Dst
, MRI
)) {
264 // Don't fold single use constant that can't be inlined.
265 if ((!MRI
.hasOneNonDBGUse(K0
->VReg
) || TII
.isInlineConstant(K0
->Value
)) &&
266 (!MRI
.hasOneNonDBGUse(K1
->VReg
) || TII
.isInlineConstant(K1
->Value
))) {
267 MatchInfo
= {OpcodeTriple
.Med
, Val
, K0
->VReg
, K1
->VReg
};
275 bool AMDGPURegBankCombinerImpl::matchFPMinMaxToClamp(MachineInstr
&MI
,
276 Register
&Reg
) const {
277 // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16).
278 auto OpcodeTriple
= getMinMaxPair(MI
.getOpcode());
280 std::optional
<FPValueAndVReg
> K0
, K1
;
281 // Match min(max(Val, K0), K1) or max(min(Val, K1), K0).
282 if (!matchMed
<GFCstOrSplatGFCstMatch
>(MI
, MRI
, OpcodeTriple
, Val
, K0
, K1
))
285 if (!K0
->Value
.isExactlyValue(0.0) || !K1
->Value
.isExactlyValue(1.0))
288 // For IEEE=false perform combine only when it's safe to assume that there are
289 // no NaN inputs. Most often MI is marked with nnan fast math flag.
290 // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates
291 // to 0.0 requires dx10_clamp = true.
292 if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI
) &&
293 isKnownNeverSNaN(Val
, MRI
)) ||
294 isKnownNeverNaN(MI
.getOperand(0).getReg(), MRI
)) {
302 // Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true.
303 // Val = SNaN only for ieee = true. It is important which operand is NaN.
304 // min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0
305 // min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0
306 // min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN
307 // Val = NaN,ieee = false or Val = QNaN,ieee = true
308 // min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0
309 // min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0
310 // min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0
311 bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr
&MI
,
312 Register
&Reg
) const {
313 // In llvm-ir, clamp is often represented as an intrinsic call to
314 // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders.
315 MachineInstr
*Src0
= getDefIgnoringCopies(MI
.getOperand(1).getReg(), MRI
);
316 MachineInstr
*Src1
= getDefIgnoringCopies(MI
.getOperand(2).getReg(), MRI
);
317 MachineInstr
*Src2
= getDefIgnoringCopies(MI
.getOperand(3).getReg(), MRI
);
319 if (isFCst(Src0
) && !isFCst(Src1
))
320 std::swap(Src0
, Src1
);
321 if (isFCst(Src1
) && !isFCst(Src2
))
322 std::swap(Src1
, Src2
);
323 if (isFCst(Src0
) && !isFCst(Src1
))
324 std::swap(Src0
, Src1
);
325 if (!isClampZeroToOne(Src1
, Src2
))
328 Register Val
= Src0
->getOperand(0).getReg();
330 auto isOp3Zero
= [&]() {
331 MachineInstr
*Op3
= getDefIgnoringCopies(MI
.getOperand(4).getReg(), MRI
);
332 if (Op3
->getOpcode() == TargetOpcode::G_FCONSTANT
)
333 return Op3
->getOperand(1).getFPImm()->isExactlyValue(0.0);
336 // For IEEE=false perform combine only when it's safe to assume that there are
337 // no NaN inputs. Most often MI is marked with nnan fast math flag.
338 // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold
339 // when Val could be QNaN. If Val can also be SNaN third input should be 0.0.
340 if (isKnownNeverNaN(MI
.getOperand(0).getReg(), MRI
) ||
341 (getIEEE() && getDX10Clamp() &&
342 (isKnownNeverSNaN(Val
, MRI
) || isOp3Zero()))) {
350 void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr
&MI
,
351 Register
&Reg
) const {
352 B
.buildInstr(AMDGPU::G_AMDGPU_CLAMP
, {MI
.getOperand(0)}, {Reg
},
354 MI
.eraseFromParent();
357 void AMDGPURegBankCombinerImpl::applyMed3(MachineInstr
&MI
,
358 Med3MatchInfo
&MatchInfo
) const {
359 B
.buildInstr(MatchInfo
.Opc
, {MI
.getOperand(0)},
360 {getAsVgpr(MatchInfo
.Val0
), getAsVgpr(MatchInfo
.Val1
),
361 getAsVgpr(MatchInfo
.Val2
)},
363 MI
.eraseFromParent();
366 SIModeRegisterDefaults
AMDGPURegBankCombinerImpl::getMode() const {
367 return MF
.getInfo
<SIMachineFunctionInfo
>()->getMode();
370 bool AMDGPURegBankCombinerImpl::getIEEE() const { return getMode().IEEE
; }
372 bool AMDGPURegBankCombinerImpl::getDX10Clamp() const {
373 return getMode().DX10Clamp
;
376 bool AMDGPURegBankCombinerImpl::isFminnumIeee(const MachineInstr
&MI
) const {
377 return MI
.getOpcode() == AMDGPU::G_FMINNUM_IEEE
;
380 bool AMDGPURegBankCombinerImpl::isFCst(MachineInstr
*MI
) const {
381 return MI
->getOpcode() == AMDGPU::G_FCONSTANT
;
384 bool AMDGPURegBankCombinerImpl::isClampZeroToOne(MachineInstr
*K0
,
385 MachineInstr
*K1
) const {
386 if (isFCst(K0
) && isFCst(K1
)) {
387 const ConstantFP
*KO_FPImm
= K0
->getOperand(1).getFPImm();
388 const ConstantFP
*K1_FPImm
= K1
->getOperand(1).getFPImm();
389 return (KO_FPImm
->isExactlyValue(0.0) && K1_FPImm
->isExactlyValue(1.0)) ||
390 (KO_FPImm
->isExactlyValue(1.0) && K1_FPImm
->isExactlyValue(0.0));
398 class AMDGPURegBankCombiner
: public MachineFunctionPass
{
402 AMDGPURegBankCombiner(bool IsOptNone
= false);
404 StringRef
getPassName() const override
{ return "AMDGPURegBankCombiner"; }
406 bool runOnMachineFunction(MachineFunction
&MF
) override
;
408 void getAnalysisUsage(AnalysisUsage
&AU
) const override
;
412 AMDGPURegBankCombinerImplRuleConfig RuleConfig
;
414 } // end anonymous namespace
416 void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage
&AU
) const {
417 AU
.addRequired
<TargetPassConfig
>();
418 AU
.setPreservesCFG();
419 getSelectionDAGFallbackAnalysisUsage(AU
);
420 AU
.addRequired
<GISelKnownBitsAnalysis
>();
421 AU
.addPreserved
<GISelKnownBitsAnalysis
>();
423 AU
.addRequired
<MachineDominatorTreeWrapperPass
>();
424 AU
.addPreserved
<MachineDominatorTreeWrapperPass
>();
426 MachineFunctionPass::getAnalysisUsage(AU
);
429 AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone
)
430 : MachineFunctionPass(ID
), IsOptNone(IsOptNone
) {
431 initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry());
433 if (!RuleConfig
.parseCommandLineOption())
434 report_fatal_error("Invalid rule identifier");
437 bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction
&MF
) {
438 if (MF
.getProperties().hasProperty(
439 MachineFunctionProperties::Property::FailedISel
))
441 auto *TPC
= &getAnalysis
<TargetPassConfig
>();
442 const Function
&F
= MF
.getFunction();
444 MF
.getTarget().getOptLevel() != CodeGenOptLevel::None
&& !skipFunction(F
);
446 const GCNSubtarget
&ST
= MF
.getSubtarget
<GCNSubtarget
>();
447 GISelKnownBits
*KB
= &getAnalysis
<GISelKnownBitsAnalysis
>().get(MF
);
449 const auto *LI
= ST
.getLegalizerInfo();
450 MachineDominatorTree
*MDT
=
452 : &getAnalysis
<MachineDominatorTreeWrapperPass
>().getDomTree();
454 CombinerInfo
CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
455 LI
, EnableOpt
, F
.hasOptSize(), F
.hasMinSize());
456 // Disable fixed-point iteration to reduce compile-time
457 CInfo
.MaxIterations
= 1;
458 CInfo
.ObserverLvl
= CombinerInfo::ObserverLevel::SinglePass
;
459 // RegBankSelect seems not to leave dead instructions, so a full DCE pass is
461 CInfo
.EnableFullDCE
= false;
462 AMDGPURegBankCombinerImpl
Impl(MF
, CInfo
, TPC
, *KB
, /*CSEInfo*/ nullptr,
463 RuleConfig
, ST
, MDT
, LI
);
464 return Impl
.combineMachineInstrs();
467 char AMDGPURegBankCombiner::ID
= 0;
468 INITIALIZE_PASS_BEGIN(AMDGPURegBankCombiner
, DEBUG_TYPE
,
469 "Combine AMDGPU machine instrs after regbankselect",
471 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig
)
472 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis
)
473 INITIALIZE_PASS_END(AMDGPURegBankCombiner
, DEBUG_TYPE
,
474 "Combine AMDGPU machine instrs after regbankselect", false,
478 FunctionPass
*createAMDGPURegBankCombiner(bool IsOptNone
) {
479 return new AMDGPURegBankCombiner(IsOptNone
);
481 } // end namespace llvm