//===- AArch64PostSelectOptimize.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does post-instruction-selection optimizations in the GlobalISel
// pipeline, before the rest of codegen runs.
//
//===----------------------------------------------------------------------===//
#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"

#define DEBUG_TYPE "aarch64-post-select-optimize"

using namespace llvm;
32 class AArch64PostSelectOptimize
: public MachineFunctionPass
{
36 AArch64PostSelectOptimize();
38 StringRef
getPassName() const override
{
39 return "AArch64 Post Select Optimizer";
42 bool runOnMachineFunction(MachineFunction
&MF
) override
;
44 void getAnalysisUsage(AnalysisUsage
&AU
) const override
;
47 bool optimizeNZCVDefs(MachineBasicBlock
&MBB
);
48 bool doPeepholeOpts(MachineBasicBlock
&MBB
);
49 /// Look for cross regclass copies that can be trivially eliminated.
50 bool foldSimpleCrossClassCopies(MachineInstr
&MI
);
51 bool foldCopyDup(MachineInstr
&MI
);
53 } // end anonymous namespace
55 void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage
&AU
) const {
56 AU
.addRequired
<TargetPassConfig
>();
58 getSelectionDAGFallbackAnalysisUsage(AU
);
59 MachineFunctionPass::getAnalysisUsage(AU
);
62 AArch64PostSelectOptimize::AArch64PostSelectOptimize()
63 : MachineFunctionPass(ID
) {
64 initializeAArch64PostSelectOptimizePass(*PassRegistry::getPassRegistry());
67 unsigned getNonFlagSettingVariant(unsigned Opc
) {
71 case AArch64::SUBSXrr
:
72 return AArch64::SUBXrr
;
73 case AArch64::SUBSWrr
:
74 return AArch64::SUBWrr
;
75 case AArch64::SUBSXrs
:
76 return AArch64::SUBXrs
;
77 case AArch64::SUBSWrs
:
78 return AArch64::SUBWrs
;
79 case AArch64::SUBSXri
:
80 return AArch64::SUBXri
;
81 case AArch64::SUBSWri
:
82 return AArch64::SUBWri
;
83 case AArch64::ADDSXrr
:
84 return AArch64::ADDXrr
;
85 case AArch64::ADDSWrr
:
86 return AArch64::ADDWrr
;
87 case AArch64::ADDSXrs
:
88 return AArch64::ADDXrs
;
89 case AArch64::ADDSWrs
:
90 return AArch64::ADDWrs
;
91 case AArch64::ADDSXri
:
92 return AArch64::ADDXri
;
93 case AArch64::ADDSWri
:
94 return AArch64::ADDWri
;
96 return AArch64::SBCXr
;
98 return AArch64::SBCWr
;
100 return AArch64::ADCXr
;
101 case AArch64::ADCSWr
:
102 return AArch64::ADCWr
;
106 bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock
&MBB
) {
107 bool Changed
= false;
108 for (auto &MI
: make_early_inc_range(make_range(MBB
.begin(), MBB
.end()))) {
109 bool CurrentIterChanged
= foldSimpleCrossClassCopies(MI
);
110 if (!CurrentIterChanged
)
111 CurrentIterChanged
|= foldCopyDup(MI
);
112 Changed
|= CurrentIterChanged
;
117 bool AArch64PostSelectOptimize::foldSimpleCrossClassCopies(MachineInstr
&MI
) {
118 auto *MF
= MI
.getMF();
119 auto &MRI
= MF
->getRegInfo();
124 if (MI
.getOperand(1).getSubReg())
125 return false; // Don't deal with subreg copies
127 Register Src
= MI
.getOperand(1).getReg();
128 Register Dst
= MI
.getOperand(0).getReg();
130 if (Src
.isPhysical() || Dst
.isPhysical())
133 const TargetRegisterClass
*SrcRC
= MRI
.getRegClass(Src
);
134 const TargetRegisterClass
*DstRC
= MRI
.getRegClass(Dst
);
140 if (SrcRC
->hasSubClass(DstRC
)) {
141 // This is the case where the source class is a superclass of the dest, so
142 // if the copy is the only user of the source, we can just constrain the
143 // source reg to the dest class.
145 if (!MRI
.hasOneNonDBGUse(Src
))
146 return false; // Only constrain single uses of the source.
148 // Constrain to dst reg class as long as it's not a weird class that only
149 // has a few registers.
150 if (!MRI
.constrainRegClass(Src
, DstRC
, /* MinNumRegs */ 25))
152 } else if (DstRC
->hasSubClass(SrcRC
)) {
153 // This is the inverse case, where the destination class is a superclass of
154 // the source. Here, if the copy is the only user, we can just constrain
155 // the user of the copy to use the smaller class of the source.
160 MRI
.replaceRegWith(Dst
, Src
);
161 MI
.eraseFromParent();
165 bool AArch64PostSelectOptimize::foldCopyDup(MachineInstr
&MI
) {
169 auto *MF
= MI
.getMF();
170 auto &MRI
= MF
->getRegInfo();
171 auto *TII
= MF
->getSubtarget().getInstrInfo();
173 // Optimize COPY(y:GPR, DUP(x:FPR, i)) -> UMOV(y:GPR, x:FPR, i).
174 // Here Dst is y and Src is the result of DUP.
175 Register Dst
= MI
.getOperand(0).getReg();
176 Register Src
= MI
.getOperand(1).getReg();
178 if (!Dst
.isVirtual() || !Src
.isVirtual())
181 auto TryMatchDUP
= [&](const TargetRegisterClass
*GPRRegClass
,
182 const TargetRegisterClass
*FPRRegClass
, unsigned DUP
,
184 if (MRI
.getRegClassOrNull(Dst
) != GPRRegClass
||
185 MRI
.getRegClassOrNull(Src
) != FPRRegClass
)
188 // There is a special case when one of the uses is COPY(z:FPR, y:GPR).
189 // In this case, we get COPY(z:FPR, COPY(y:GPR, DUP(x:FPR, i))), which can
190 // be folded by peephole-opt into just DUP(z:FPR, i), so this transform is
191 // not worthwhile in that case.
192 for (auto &Use
: MRI
.use_nodbg_instructions(Dst
)) {
196 Register UseOp0
= Use
.getOperand(0).getReg();
197 Register UseOp1
= Use
.getOperand(1).getReg();
198 if (UseOp0
.isPhysical() || UseOp1
.isPhysical())
201 if (MRI
.getRegClassOrNull(UseOp0
) == FPRRegClass
&&
202 MRI
.getRegClassOrNull(UseOp1
) == GPRRegClass
)
206 MachineInstr
*SrcMI
= MRI
.getUniqueVRegDef(Src
);
207 if (!SrcMI
|| SrcMI
->getOpcode() != DUP
|| !MRI
.hasOneNonDBGUse(Src
))
210 Register DupSrc
= SrcMI
->getOperand(1).getReg();
211 int64_t DupImm
= SrcMI
->getOperand(2).getImm();
213 BuildMI(*MI
.getParent(), MI
, MI
.getDebugLoc(), TII
->get(UMOV
), Dst
)
216 SrcMI
->eraseFromParent();
217 MI
.eraseFromParent();
221 return TryMatchDUP(&AArch64::GPR32RegClass
, &AArch64::FPR32RegClass
,
222 AArch64::DUPi32
, AArch64::UMOVvi32
) ||
223 TryMatchDUP(&AArch64::GPR64RegClass
, &AArch64::FPR64RegClass
,
224 AArch64::DUPi64
, AArch64::UMOVvi64
);
227 bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock
&MBB
) {
228 // If we find a dead NZCV implicit-def, we
229 // - try to convert the operation to a non-flag-setting equivalent
230 // - or mark the def as dead to aid later peephole optimizations.
234 // Consider the following code:
235 // FCMPSrr %0, %1, implicit-def $nzcv
236 // %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
237 // %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv
238 // FCMPSrr %0, %1, implicit-def $nzcv
239 // %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
240 // This kind of code where we have 2 FCMPs each feeding a CSEL can happen
241 // when we have a single IR fcmp being used by two selects. During selection,
242 // to ensure that there can be no clobbering of nzcv between the fcmp and the
243 // csel, we have to generate an fcmp immediately before each csel is
245 // However, often we can essentially CSE these together later in MachineCSE.
246 // This doesn't work though if there are unrelated flag-setting instructions
247 // in between the two FCMPs. In this case, the SUBS defines NZCV
248 // but it doesn't have any users, being overwritten by the second FCMP.
251 // The instruction selector always emits the flag-setting variant of ADC/SBC
252 // while selecting G_UADDE/G_SADDE/G_USUBE/G_SSUBE. If the carry-out of these
253 // instructions is never used, we can switch to the non-flag-setting variant.
255 bool Changed
= false;
256 auto &MF
= *MBB
.getParent();
257 auto &Subtarget
= MF
.getSubtarget();
258 const auto &TII
= Subtarget
.getInstrInfo();
259 auto TRI
= Subtarget
.getRegisterInfo();
260 auto RBI
= Subtarget
.getRegBankInfo();
261 auto &MRI
= MF
.getRegInfo();
263 LiveRegUnits
LRU(*MBB
.getParent()->getSubtarget().getRegisterInfo());
264 LRU
.addLiveOuts(MBB
);
266 for (auto &II
: instructionsWithoutDebug(MBB
.rbegin(), MBB
.rend())) {
267 bool NZCVDead
= LRU
.available(AArch64::NZCV
);
268 if (NZCVDead
&& II
.definesRegister(AArch64::NZCV
, /*TRI=*/nullptr)) {
269 // The instruction defines NZCV, but NZCV is dead.
270 unsigned NewOpc
= getNonFlagSettingVariant(II
.getOpcode());
272 II
.findRegisterDefOperandIdx(AArch64::NZCV
, /*TRI=*/nullptr);
273 if (DeadNZCVIdx
!= -1) {
275 // If there is an equivalent non-flag-setting op, we convert.
276 LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting "
279 II
.setDesc(TII
->get(NewOpc
));
280 II
.removeOperand(DeadNZCVIdx
);
281 // Changing the opcode can result in differing regclass requirements,
282 // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp.
283 // Constrain the regclasses, possibly introducing a copy.
284 constrainOperandRegClass(MF
, *TRI
, MRI
, *TII
, *RBI
, II
, II
.getDesc(),
285 II
.getOperand(0), 0);
288 // Otherwise, we just set the nzcv imp-def operand to be dead, so the
289 // peephole optimizations can optimize them further.
290 II
.getOperand(DeadNZCVIdx
).setIsDead();
294 LRU
.stepBackward(II
);
299 bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction
&MF
) {
300 if (MF
.getProperties().hasProperty(
301 MachineFunctionProperties::Property::FailedISel
))
303 assert(MF
.getProperties().hasProperty(
304 MachineFunctionProperties::Property::Selected
) &&
305 "Expected a selected MF");
307 bool Changed
= false;
308 for (auto &BB
: MF
) {
309 Changed
|= optimizeNZCVDefs(BB
);
310 Changed
|= doPeepholeOpts(BB
);
315 char AArch64PostSelectOptimize::ID
= 0;
316 INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize
, DEBUG_TYPE
,
317 "Optimize AArch64 selected instructions",
319 INITIALIZE_PASS_END(AArch64PostSelectOptimize
, DEBUG_TYPE
,
320 "Optimize AArch64 selected instructions", false,
324 FunctionPass
*createAArch64PostSelectOptimize() {
325 return new AArch64PostSelectOptimize();
327 } // end namespace llvm