1 //=== lib/CodeGen/GlobalISel/AMDGPURegBankCombiner.cpp ---------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This pass does combining of machine instructions at the generic MI level,
10 // after register banks are known.
12 //===----------------------------------------------------------------------===//
15 #include "AMDGPULegalizerInfo.h"
16 #include "AMDGPURegisterBankInfo.h"
17 #include "GCNSubtarget.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/CodeGen/GlobalISel/Combiner.h"
21 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
23 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
24 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
25 #include "llvm/CodeGen/MachineDominators.h"
26 #include "llvm/CodeGen/TargetPassConfig.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/Target/TargetMachine.h"
29 #define DEBUG_TYPE "amdgpu-regbank-combiner"
32 using namespace MIPatternMatch
;
34 class AMDGPURegBankCombinerHelper
{
38 MachineRegisterInfo
&MRI
;
39 const GCNSubtarget
&Subtarget
;
40 const RegisterBankInfo
&RBI
;
41 const TargetRegisterInfo
&TRI
;
42 const SIInstrInfo
&TII
;
43 CombinerHelper
&Helper
;
46 AMDGPURegBankCombinerHelper(MachineIRBuilder
&B
, CombinerHelper
&Helper
)
47 : B(B
), MF(B
.getMF()), MRI(*B
.getMRI()),
48 Subtarget(MF
.getSubtarget
<GCNSubtarget
>()),
49 RBI(*Subtarget
.getRegBankInfo()), TRI(*Subtarget
.getRegisterInfo()),
50 TII(*Subtarget
.getInstrInfo()), Helper(Helper
){};
52 bool isVgprRegBank(Register Reg
);
53 Register
getAsVgpr(Register Reg
);
56 unsigned Min
, Max
, Med
;
59 struct Med3MatchInfo
{
61 Register Val0
, Val1
, Val2
;
64 MinMaxMedOpc
getMinMaxPair(unsigned Opc
);
66 template <class m_Cst
, typename CstTy
>
67 bool matchMed(MachineInstr
&MI
, MachineRegisterInfo
&MRI
, MinMaxMedOpc MMMOpc
,
68 Register
&Val
, CstTy
&K0
, CstTy
&K1
);
70 bool matchIntMinMaxToMed3(MachineInstr
&MI
, Med3MatchInfo
&MatchInfo
);
71 bool matchFPMinMaxToMed3(MachineInstr
&MI
, Med3MatchInfo
&MatchInfo
);
72 bool matchFPMinMaxToClamp(MachineInstr
&MI
, Register
&Reg
);
73 bool matchFPMed3ToClamp(MachineInstr
&MI
, Register
&Reg
);
74 void applyMed3(MachineInstr
&MI
, Med3MatchInfo
&MatchInfo
);
75 void applyClamp(MachineInstr
&MI
, Register
&Reg
);
78 AMDGPU::SIModeRegisterDefaults
getMode();
81 bool isFminnumIeee(const MachineInstr
&MI
);
82 bool isFCst(MachineInstr
*MI
);
83 bool isClampZeroToOne(MachineInstr
*K0
, MachineInstr
*K1
);
86 bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg
) {
87 return RBI
.getRegBank(Reg
, MRI
, TRI
)->getID() == AMDGPU::VGPRRegBankID
;
90 Register
AMDGPURegBankCombinerHelper::getAsVgpr(Register Reg
) {
91 if (isVgprRegBank(Reg
))
94 // Search for existing copy of Reg to vgpr.
95 for (MachineInstr
&Use
: MRI
.use_instructions(Reg
)) {
96 Register Def
= Use
.getOperand(0).getReg();
97 if (Use
.getOpcode() == AMDGPU::COPY
&& isVgprRegBank(Def
))
102 Register VgprReg
= B
.buildCopy(MRI
.getType(Reg
), Reg
).getReg(0);
103 MRI
.setRegBank(VgprReg
, RBI
.getRegBank(AMDGPU::VGPRRegBankID
));
107 AMDGPURegBankCombinerHelper::MinMaxMedOpc
108 AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc
) {
111 llvm_unreachable("Unsupported opcode");
114 return {AMDGPU::G_SMIN
, AMDGPU::G_SMAX
, AMDGPU::G_AMDGPU_SMED3
};
117 return {AMDGPU::G_UMIN
, AMDGPU::G_UMAX
, AMDGPU::G_AMDGPU_UMED3
};
118 case AMDGPU::G_FMAXNUM
:
119 case AMDGPU::G_FMINNUM
:
120 return {AMDGPU::G_FMINNUM
, AMDGPU::G_FMAXNUM
, AMDGPU::G_AMDGPU_FMED3
};
121 case AMDGPU::G_FMAXNUM_IEEE
:
122 case AMDGPU::G_FMINNUM_IEEE
:
123 return {AMDGPU::G_FMINNUM_IEEE
, AMDGPU::G_FMAXNUM_IEEE
,
124 AMDGPU::G_AMDGPU_FMED3
};
128 template <class m_Cst
, typename CstTy
>
129 bool AMDGPURegBankCombinerHelper::matchMed(MachineInstr
&MI
,
130 MachineRegisterInfo
&MRI
,
131 MinMaxMedOpc MMMOpc
, Register
&Val
,
132 CstTy
&K0
, CstTy
&K1
) {
133 // 4 operand commutes of: min(max(Val, K0), K1).
134 // Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)).
135 // Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0).
136 // 4 operand commutes of: max(min(Val, K1), K0).
137 // Find K0 from outer instr: max(min(...), K0) or max(K0, min(...)).
138 // Find K1 and Val from inner instr: min(K1, Val) or min(Val, K1).
143 MMMOpc
.Min
, m_CommutativeBinOp(MMMOpc
.Max
, m_Reg(Val
), m_Cst(K0
)),
146 MMMOpc
.Max
, m_CommutativeBinOp(MMMOpc
.Min
, m_Reg(Val
), m_Cst(K1
)),
150 bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3(
151 MachineInstr
&MI
, Med3MatchInfo
&MatchInfo
) {
152 Register Dst
= MI
.getOperand(0).getReg();
153 if (!isVgprRegBank(Dst
))
156 if (MRI
.getType(Dst
).isVector())
159 MinMaxMedOpc OpcodeTriple
= getMinMaxPair(MI
.getOpcode());
161 Optional
<ValueAndVReg
> K0
, K1
;
162 // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
163 if (!matchMed
<GCstAndRegMatch
>(MI
, MRI
, OpcodeTriple
, Val
, K0
, K1
))
166 if (OpcodeTriple
.Med
== AMDGPU::G_AMDGPU_SMED3
&& K0
->Value
.sgt(K1
->Value
))
168 if (OpcodeTriple
.Med
== AMDGPU::G_AMDGPU_UMED3
&& K0
->Value
.ugt(K1
->Value
))
171 MatchInfo
= {OpcodeTriple
.Med
, Val
, K0
->VReg
, K1
->VReg
};
// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1)
// ieee = true  : min/max(SNaN, K) = QNaN, min/max(QNaN, K) = K
// ieee = false : min/max(NaN, K) = K
// clamp(NaN) = dx10_clamp ? 0.0 : NaN
// Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input.
// Other operand commutes (see matchMed) give same result since min and max are
// commutative.

// Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), K0 <= K1
// with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0.
// Val = SNaN only for ieee = true
// fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1
// min(max(SNaN, K0), K1) = min(QNaN, K1) = K1
// max(min(SNaN, K1), K0) = max(K1, K0) = K1
// Val = NaN,ieee = false or Val = QNaN,ieee = true
// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0
// min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true)
// max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0
193 bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3(
194 MachineInstr
&MI
, Med3MatchInfo
&MatchInfo
) {
195 Register Dst
= MI
.getOperand(0).getReg();
196 LLT Ty
= MRI
.getType(Dst
);
198 // med3 for f16 is only available on gfx9+, and not available for v2f16.
199 if ((Ty
!= LLT::scalar(16) || !Subtarget
.hasMed3_16()) &&
200 Ty
!= LLT::scalar(32))
203 auto OpcodeTriple
= getMinMaxPair(MI
.getOpcode());
206 Optional
<FPValueAndVReg
> K0
, K1
;
207 // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
208 if (!matchMed
<GFCstAndRegMatch
>(MI
, MRI
, OpcodeTriple
, Val
, K0
, K1
))
211 if (K0
->Value
> K1
->Value
)
214 // For IEEE=false perform combine only when it's safe to assume that there are
215 // no NaN inputs. Most often MI is marked with nnan fast math flag.
216 // For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to
217 // min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner
218 // nodes(max/min) have same behavior when one input is NaN and other isn't.
219 // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN,
220 // also post-legalizer inputs to min/max are fcanonicalized (never SNaN).
221 if ((getIEEE() && isFminnumIeee(MI
)) || isKnownNeverNaN(Dst
, MRI
)) {
222 // Don't fold single use constant that can't be inlined.
223 if ((!MRI
.hasOneNonDBGUse(K0
->VReg
) || TII
.isInlineConstant(K0
->Value
)) &&
224 (!MRI
.hasOneNonDBGUse(K1
->VReg
) || TII
.isInlineConstant(K1
->Value
))) {
225 MatchInfo
= {OpcodeTriple
.Med
, Val
, K0
->VReg
, K1
->VReg
};
233 bool AMDGPURegBankCombinerHelper::matchFPMinMaxToClamp(MachineInstr
&MI
,
235 // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16).
236 auto OpcodeTriple
= getMinMaxPair(MI
.getOpcode());
238 Optional
<FPValueAndVReg
> K0
, K1
;
239 // Match min(max(Val, K0), K1) or max(min(Val, K1), K0).
240 if (!matchMed
<GFCstOrSplatGFCstMatch
>(MI
, MRI
, OpcodeTriple
, Val
, K0
, K1
))
243 if (!K0
->Value
.isExactlyValue(0.0) || !K1
->Value
.isExactlyValue(1.0))
246 // For IEEE=false perform combine only when it's safe to assume that there are
247 // no NaN inputs. Most often MI is marked with nnan fast math flag.
248 // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates
249 // to 0.0 requires dx10_clamp = true.
250 if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI
) &&
251 isKnownNeverSNaN(Val
, MRI
)) ||
252 isKnownNeverNaN(MI
.getOperand(0).getReg(), MRI
)) {
260 // Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true.
261 // Val = SNaN only for ieee = true. It is important which operand is NaN.
262 // min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0
263 // min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0
264 // min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN
265 // Val = NaN,ieee = false or Val = QNaN,ieee = true
266 // min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0
267 // min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0
268 // min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0
269 bool AMDGPURegBankCombinerHelper::matchFPMed3ToClamp(MachineInstr
&MI
,
271 if (MI
.getIntrinsicID() != Intrinsic::amdgcn_fmed3
)
274 // In llvm-ir, clamp is often represented as an intrinsic call to
275 // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders.
276 MachineInstr
*Src0
= getDefIgnoringCopies(MI
.getOperand(2).getReg(), MRI
);
277 MachineInstr
*Src1
= getDefIgnoringCopies(MI
.getOperand(3).getReg(), MRI
);
278 MachineInstr
*Src2
= getDefIgnoringCopies(MI
.getOperand(4).getReg(), MRI
);
280 if (isFCst(Src0
) && !isFCst(Src1
))
281 std::swap(Src0
, Src1
);
282 if (isFCst(Src1
) && !isFCst(Src2
))
283 std::swap(Src1
, Src2
);
284 if (isFCst(Src0
) && !isFCst(Src1
))
285 std::swap(Src0
, Src1
);
286 if (!isClampZeroToOne(Src1
, Src2
))
289 Register Val
= Src0
->getOperand(0).getReg();
291 auto isOp3Zero
= [&]() {
292 MachineInstr
*Op3
= getDefIgnoringCopies(MI
.getOperand(4).getReg(), MRI
);
293 if (Op3
->getOpcode() == TargetOpcode::G_FCONSTANT
)
294 return Op3
->getOperand(1).getFPImm()->isExactlyValue(0.0);
297 // For IEEE=false perform combine only when it's safe to assume that there are
298 // no NaN inputs. Most often MI is marked with nnan fast math flag.
299 // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold
300 // when Val could be QNaN. If Val can also be SNaN third input should be 0.0.
301 if (isKnownNeverNaN(MI
.getOperand(0).getReg(), MRI
) ||
302 (getIEEE() && getDX10Clamp() &&
303 (isKnownNeverSNaN(Val
, MRI
) || isOp3Zero()))) {
311 void AMDGPURegBankCombinerHelper::applyClamp(MachineInstr
&MI
, Register
&Reg
) {
312 B
.setInstrAndDebugLoc(MI
);
313 B
.buildInstr(AMDGPU::G_AMDGPU_CLAMP
, {MI
.getOperand(0)}, {Reg
},
315 MI
.eraseFromParent();
318 void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr
&MI
,
319 Med3MatchInfo
&MatchInfo
) {
320 B
.setInstrAndDebugLoc(MI
);
321 B
.buildInstr(MatchInfo
.Opc
, {MI
.getOperand(0)},
322 {getAsVgpr(MatchInfo
.Val0
), getAsVgpr(MatchInfo
.Val1
),
323 getAsVgpr(MatchInfo
.Val2
)},
325 MI
.eraseFromParent();
328 AMDGPU::SIModeRegisterDefaults
AMDGPURegBankCombinerHelper::getMode() {
329 return MF
.getInfo
<SIMachineFunctionInfo
>()->getMode();
332 bool AMDGPURegBankCombinerHelper::getIEEE() { return getMode().IEEE
; }
334 bool AMDGPURegBankCombinerHelper::getDX10Clamp() { return getMode().DX10Clamp
; }
336 bool AMDGPURegBankCombinerHelper::isFminnumIeee(const MachineInstr
&MI
) {
337 return MI
.getOpcode() == AMDGPU::G_FMINNUM_IEEE
;
340 bool AMDGPURegBankCombinerHelper::isFCst(MachineInstr
*MI
) {
341 return MI
->getOpcode() == AMDGPU::G_FCONSTANT
;
344 bool AMDGPURegBankCombinerHelper::isClampZeroToOne(MachineInstr
*K0
,
346 if (isFCst(K0
) && isFCst(K1
)) {
347 const ConstantFP
*KO_FPImm
= K0
->getOperand(1).getFPImm();
348 const ConstantFP
*K1_FPImm
= K1
->getOperand(1).getFPImm();
349 return (KO_FPImm
->isExactlyValue(0.0) && K1_FPImm
->isExactlyValue(1.0)) ||
350 (KO_FPImm
->isExactlyValue(1.0) && K1_FPImm
->isExactlyValue(0.0));
355 class AMDGPURegBankCombinerHelperState
{
357 CombinerHelper
&Helper
;
358 AMDGPURegBankCombinerHelper
&RegBankHelper
;
361 AMDGPURegBankCombinerHelperState(CombinerHelper
&Helper
,
362 AMDGPURegBankCombinerHelper
&RegBankHelper
)
363 : Helper(Helper
), RegBankHelper(RegBankHelper
) {}
366 #define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS
367 #include "AMDGPUGenRegBankGICombiner.inc"
368 #undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS
371 #define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H
372 #include "AMDGPUGenRegBankGICombiner.inc"
373 #undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H
375 class AMDGPURegBankCombinerInfo final
: public CombinerInfo
{
377 MachineDominatorTree
*MDT
;
380 AMDGPUGenRegBankCombinerHelperRuleConfig GeneratedRuleCfg
;
382 AMDGPURegBankCombinerInfo(bool EnableOpt
, bool OptSize
, bool MinSize
,
383 const AMDGPULegalizerInfo
*LI
,
384 GISelKnownBits
*KB
, MachineDominatorTree
*MDT
)
385 : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
386 /*LegalizerInfo*/ LI
, EnableOpt
, OptSize
, MinSize
),
388 if (!GeneratedRuleCfg
.parseCommandLineOption())
389 report_fatal_error("Invalid rule identifier");
392 bool combine(GISelChangeObserver
&Observer
, MachineInstr
&MI
,
393 MachineIRBuilder
&B
) const override
;
396 bool AMDGPURegBankCombinerInfo::combine(GISelChangeObserver
&Observer
,
398 MachineIRBuilder
&B
) const {
399 CombinerHelper
Helper(Observer
, B
, KB
, MDT
);
400 AMDGPURegBankCombinerHelper
RegBankHelper(B
, Helper
);
401 AMDGPUGenRegBankCombinerHelper
Generated(GeneratedRuleCfg
, Helper
,
404 if (Generated
.tryCombineAll(Observer
, MI
, B
))
410 #define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP
411 #include "AMDGPUGenRegBankGICombiner.inc"
412 #undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP
417 class AMDGPURegBankCombiner
: public MachineFunctionPass
{
421 AMDGPURegBankCombiner(bool IsOptNone
= false);
423 StringRef
getPassName() const override
{
424 return "AMDGPURegBankCombiner";
427 bool runOnMachineFunction(MachineFunction
&MF
) override
;
429 void getAnalysisUsage(AnalysisUsage
&AU
) const override
;
433 } // end anonymous namespace
435 void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage
&AU
) const {
436 AU
.addRequired
<TargetPassConfig
>();
437 AU
.setPreservesCFG();
438 getSelectionDAGFallbackAnalysisUsage(AU
);
439 AU
.addRequired
<GISelKnownBitsAnalysis
>();
440 AU
.addPreserved
<GISelKnownBitsAnalysis
>();
442 AU
.addRequired
<MachineDominatorTree
>();
443 AU
.addPreserved
<MachineDominatorTree
>();
445 MachineFunctionPass::getAnalysisUsage(AU
);
448 AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone
)
449 : MachineFunctionPass(ID
), IsOptNone(IsOptNone
) {
450 initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry());
453 bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction
&MF
) {
454 if (MF
.getProperties().hasProperty(
455 MachineFunctionProperties::Property::FailedISel
))
457 auto *TPC
= &getAnalysis
<TargetPassConfig
>();
458 const Function
&F
= MF
.getFunction();
460 MF
.getTarget().getOptLevel() != CodeGenOpt::None
&& !skipFunction(F
);
462 const GCNSubtarget
&ST
= MF
.getSubtarget
<GCNSubtarget
>();
463 const AMDGPULegalizerInfo
*LI
464 = static_cast<const AMDGPULegalizerInfo
*>(ST
.getLegalizerInfo());
466 GISelKnownBits
*KB
= &getAnalysis
<GISelKnownBitsAnalysis
>().get(MF
);
467 MachineDominatorTree
*MDT
=
468 IsOptNone
? nullptr : &getAnalysis
<MachineDominatorTree
>();
469 AMDGPURegBankCombinerInfo
PCInfo(EnableOpt
, F
.hasOptSize(),
470 F
.hasMinSize(), LI
, KB
, MDT
);
471 Combiner
C(PCInfo
, TPC
);
472 return C
.combineMachineInstrs(MF
, /*CSEInfo*/ nullptr);
475 char AMDGPURegBankCombiner::ID
= 0;
476 INITIALIZE_PASS_BEGIN(AMDGPURegBankCombiner
, DEBUG_TYPE
,
477 "Combine AMDGPU machine instrs after regbankselect",
479 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig
)
480 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis
)
481 INITIALIZE_PASS_END(AMDGPURegBankCombiner
, DEBUG_TYPE
,
482 "Combine AMDGPU machine instrs after regbankselect", false,
486 FunctionPass
*createAMDGPURegBankCombiner(bool IsOptNone
) {
487 return new AMDGPURegBankCombiner(IsOptNone
);
489 } // end namespace llvm