//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

#define DEBUG_TYPE "machine-combiner"

STATISTIC(NumInstCombined, "Number of machineinst combined");
static cl::opt<unsigned>
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
              cl::desc("Incremental depth computation will be used for basic "
                       "blocks with more instructions."), cl::init(500));
static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted intrs"),
                                cl::init(false));
#ifdef EXPENSIVE_CHECKS
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(true));
#else
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(false));
#endif
namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetSubtargetInfo *STI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI = nullptr;
  MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
  MachineTraceMetrics *Traces = nullptr;
  MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
  MachineBlockFrequencyInfo *MBFI = nullptr;
  ProfileSummaryInfo *PSI = nullptr;
  RegisterClassInfo RegClassInfo;

  TargetSchedModel TSchedModel;
  /// True if optimizing for code size.
  bool OptSize = false;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }
private:
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  bool isTransientMI(const MachineInstr *MI);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace,
                    const MachineBasicBlock &MBB);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern, bool SlackIsAccurate);
  bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              SmallVectorImpl<MachineInstr *> &DelInstrs,
                              MachineCombinerPattern Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  std::pair<unsigned, unsigned>
  getLatenciesForInstrSequences(MachineInstr &MI,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                MachineTraceMetrics::Trace BlockTrace);

  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
                          SmallVector<MachineCombinerPattern, 16> &Patterns);
};
} // end anonymous namespace
char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;
INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)
void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<MachineLoopInfo>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}
MachineInstr *
MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && MO.getReg().isVirtual())
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHI's have no depth etc.
  if (DefInstr && DefInstr->isPHI())
    return nullptr;
  return DefInstr;
}
/// Return true if MI is unlikely to generate an actual target instruction.
bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
  if (!MI->isCopy())
    return MI->isTransient();

  // If MI is a COPY, check if its src and dst registers can be coalesced.
  Register Dst = MI->getOperand(0).getReg();
  Register Src = MI->getOperand(1).getReg();

  if (!MI->isFullCopy()) {
    // If src RC contains super registers of dst RC, it can also be coalesced.
    if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
      return false;

    auto SrcSub = MI->getOperand(1).getSubReg();
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
  }

  if (Src.isPhysical() && Dst.isPhysical())
    return Src == Dst;

  if (Src.isVirtual() && Dst.isVirtual()) {
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
  }

  if (Src.isVirtual())
    std::swap(Src, Dst);

  // Now Src is physical register, Dst is virtual register.
  auto DstRC = MRI->getRegClass(Dst);
  return DstRC->contains(Src);
}
/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace,
                          const MachineBasicBlock &MBB) {
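  // In sketch form, for each new instruction I the loop below computes
  //   Depth(I) = max over virtual-register uses R of
  //                (Depth(Def(R)) + Latency(Def(R), I)),
  // where Def(R) is found either in InsInstrs (for freshly created vregs,
  // via InstrIdxForVirtReg) or in the surrounding block trace.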
  SmallVector<unsigned, 16> InstrDepth;
  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands which
  // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    for (const MachineOperand &MO : InstrPtr->all_uses()) {
      // Check for virtual register operand.
      if (!MO.getReg().isVirtual())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is new virtual register not in trace
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
        int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
                             MachineTraceStrategy::TS_Local ||
                         DefInstr->getParent() == &MBB)) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          if (!isTransientMI(DefInstr))
            LatencyOp = TSchedModel.computeOperandLatency(
                DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
                InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}
/// Computes instruction latency as max of latency of defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute a more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
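  // Note: for each def only the first user on the register's use list is
  // examined below; this samples the dependence heuristically rather than
  // scanning all users of the def.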
  // Check each definition in NewRoot and compute the latency
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->all_defs()) {
    // Check for virtual register operand.
    if (!MO.getReg().isVirtual())
      continue;
    // Get the first instruction that uses MO
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}
/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth,            // The data dependency chain must be improved.
  MustReduceRegisterPressure, // The register pressure must be reduced.
  Default                     // The critical path must not be lengthened.
};

static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
  case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
  case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
  case MachineCombinerPattern::SUBADD_OP1:
  case MachineCombinerPattern::SUBADD_OP2:
  case MachineCombinerPattern::FMADD_AX:
  case MachineCombinerPattern::FMADD_XA:
  case MachineCombinerPattern::FMSUB:
  case MachineCombinerPattern::FNMSUB:
    return CombinerObjective::MustReduceDepth;
  case MachineCombinerPattern::REASSOC_XY_BCA:
  case MachineCombinerPattern::REASSOC_XY_BAC:
    return CombinerObjective::MustReduceRegisterPressure;
  default:
    return CombinerObjective::Default;
  }
}
/// Estimate the latency of the new and original instruction sequence by summing
/// up the latencies of the inserted and deleted instructions. This assumes
/// that the inserted and deleted instructions are dependent instruction chains,
/// which might not hold in all cases.
std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Trace BlockTrace) {
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
  unsigned NewRootLatency = 0;
  // NewRoot is the last instruction in the \p InsInstrs vector.
  MachineInstr *NewRoot = InsInstrs.back();
  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);

  unsigned RootLatency = 0;
  for (auto *I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  return {NewRootLatency, RootLatency};
}
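
// Worked example (a sketch, not taken from the source): if {MUL, ADD} is
// replaced by {MADD}, the pair returned above is roughly
// (lat(MADD), lat(MUL) + lat(ADD)). As the comment above notes, the sums are
// exact only when each sequence forms a single dependence chain.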
bool MachineCombiner::reduceRegisterPressure(
    MachineInstr &Root, MachineBasicBlock *MBB,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineCombinerPattern Pattern) {
  // FIXME: for now, we don't do any check for the register pressure patterns.
  // We treat them as always profitable. But we can do better if we make
  // RegPressureTracker class be aware of TIE attribute. Then we can get an
  // accurate compare of register pressure with DelInstrs or InsInstrs.
  return true;
}
/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern,
    bool SlackIsAccurate) {
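  // The acceptance test below boils down to, for the Default objective,
  //   NewRootDepth + NewRootLatency <= RootDepth + RootLatency + RootSlack,
  // with RootSlack counted only when SlackIsAccurate holds; MustReduceDepth
  // patterns use the stricter NewRootDepth < RootDepth instead.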
  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth =
      getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  LLVM_DEBUG(dbgs() << "  Dependence data for " << *Root << "\tNewRootDepth: "
                    << NewRootDepth << "\tRootDepth: " << RootDepth);

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
    LLVM_DEBUG(NewRootDepth < RootDepth
                   ? dbgs() << "\t and it does it\n"
                   : dbgs() << "\t but it does NOT do it\n");
    return NewRootDepth < RootDepth;
  }

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.

  // Account for the latency of the inserted and deleted instructions.
  unsigned NewRootLatency, RootLatency;
  if (TII->accumulateInstrSeqToRootLatency(*Root)) {
    std::tie(NewRootLatency, RootLatency) =
        getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
  } else {
    NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
    RootLatency = TSchedModel.computeInstrLatency(Root);
  }

  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount =
      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
                    << "\n\tRootDepth + RootLatency + RootSlack = "
                    << OldCycleCount;);
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t It IMPROVES PathLen because"
                 : dbgs() << "\n\t It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");

  return NewCycleCount <= OldCycleCount;
}
/// helper routine to convert instructions into SC
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}
/// True when the new instructions do not increase resource length
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute current resource length

  //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
  ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};

  // Compute new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
                    << ResLenBeforeCombine
                    << " and after: " << ResLenAfterCombine << "\n";);
  LLVM_DEBUG(
      ResLenAfterCombine <=
              ResLenBeforeCombine + TII->getExtendResourceLenLimit()
          ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
          : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
                      "Length\n");

  return ResLenAfterCombine <=
         ResLenBeforeCombine + TII->getExtendResourceLenLimit();
}
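
// Note (an assumption about the TargetInstrInfo hook, not stated here):
// getExtendResourceLenLimit() returns 0 unless a target overrides it, so by
// default the check above requires the new sequence to be no longer, in
// resource length, than the old one.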
/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
/// depths if requested.
///
/// \param MBB basic block to insert instructions in
/// \param MI current machine instruction
/// \param InsInstrs new instructions to insert in \p MBB
/// \param DelInstrs instruction to delete from \p MBB
/// \param TraceEnsemble is a pointer to the machine trace information
/// \param RegUnits set of live registers, needed to compute instruction depths
/// \param TII is target instruction info, used to call target hook
/// \param Pattern is used to call target hook finalizeInsInstrs
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
///        otherwise invalidate the trace
static void insertDeleteInstructions(
    MachineBasicBlock *MBB, MachineInstr &MI,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Ensemble *TraceEnsemble,
    SparseSet<LiveRegUnit> &RegUnits, const TargetInstrInfo *TII,
    MachineCombinerPattern Pattern, bool IncrementalUpdate) {
  // If we want to fix up some placeholder for some target, do it now.
  // We need this because in genAlternativeCodeSequence, we have not decided the
  // better pattern InsInstrs or DelInstrs, so we don't want to generate any
  // side effect in the function. For example, we need to delay the constant
  // pool entry creation here after InsInstrs is selected as the better pattern.
  // Otherwise the constant pool entry created for InsInstrs will not be deleted
  // even if InsInstrs is not the better pattern.
  TII->finalizeInsInstrs(MI, Pattern, InsInstrs);

  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);

  for (auto *InstrPtr : DelInstrs) {
    InstrPtr->eraseFromParent();
    // Erase all LiveRegs defined by the removed instruction
    for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
      if (I->MI == InstrPtr)
        I = RegUnits.erase(I);
      else
        I++;
    }
  }

  if (IncrementalUpdate)
    for (auto *InstrPtr : InsInstrs)
      TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
  else
    TraceEnsemble->invalidate(MBB);

  NumInstCombined++;
}
// Check that the difference between original and new latency is decreasing for
// later patterns. This helps to discover sub-optimal pattern orderings.
void MachineCombiner::verifyPatternOrder(
    MachineBasicBlock *MBB, MachineInstr &Root,
    SmallVector<MachineCombinerPattern, 16> &Patterns) {
  long PrevLatencyDiff = std::numeric_limits<long>::max();
  (void)PrevLatencyDiff; // Variable is used in assert only.
  for (auto P : Patterns) {
    SmallVector<MachineInstr *, 16> InsInstrs;
    SmallVector<MachineInstr *, 16> DelInstrs;
    DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg);
    // Found pattern, but did not generate alternative sequence.
    // This can happen e.g. when an immediate could not be materialized
    // in a single instruction.
    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
      continue;

    unsigned NewRootLatency, RootLatency;
    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
        Root, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
           "Current pattern is better than previous pattern.");
    PrevLatencyDiff = CurrentLatencyDiff;
  }
}
/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining pattern.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  bool IncrementalUpdate = false;
  auto BlockIter = MBB->begin();
  decltype(BlockIter) LastUpdate;
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);
  if (!TraceEnsemble)
    TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());

  SparseSet<LiveRegUnit> RegUnits;
  RegUnits.setUniverse(TRI->getNumRegUnits());

  bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);

  bool DoRegPressureReduce =
      TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);
  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)
    //
    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) is met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic. If
    // machine-combiner-verify-pattern-order is enabled, all patterns are
    // checked to ensure later patterns do not provide better latency savings.

    if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
      continue;

    if (VerifyPatternOrder)
      verifyPatternOrder(MBB, MI, Patterns);
    for (const auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (InsInstrs.empty())
        continue;

      LLVM_DEBUG(if (dump_intrs) {
        dbgs() << "\tFor the Pattern (" << (int)P
               << ") these instructions could be removed\n";
        for (auto const *InstrPtr : DelInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
                          /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
        dbgs() << "\tThese instructions could replace the removed ones\n";
        for (auto const *InstrPtr : InsInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
                          /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
      });
      if (IncrementalUpdate && LastUpdate != BlockIter) {
        // Update depths since the last incremental update.
        TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
        LastUpdate = BlockIter;
      }

      if (DoRegPressureReduce &&
          getCombinerObjective(P) ==
              CombinerObjective::MustReduceRegisterPressure) {
        if (MBB->size() > inc_threshold) {
          // Use incremental depth updates for basic blocks above threshold
          IncrementalUpdate = true;
          LastUpdate = BlockIter;
        }
        if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
          // Replace DelInstrs with InsInstrs.
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                   RegUnits, TII, P, IncrementalUpdate);
          Changed |= true;

          // Go back to previous instruction as it may have ILP reassociation
          // opportunity.
          BlockIter--;
          break;
        }
      }
      if (ML && TII->isThroughputPattern(P)) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
                          << InsInstrs.size() << " < "
                          << DelInstrs.size() << ")\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else {
        // For big basic blocks, we only compute the full trace the first time
        // we hit this. We do not invalidate the trace, but instead update the
        // instruction depths incrementally.
        // NOTE: Only the instruction depths up to MI are accurate. All other
        // trace information is not updated.
= TraceEnsemble
->getTrace(MBB
);
704 Traces
->verifyAnalysis();
705 if (improvesCriticalPathLen(MBB
, &MI
, BlockTrace
, InsInstrs
, DelInstrs
,
706 InstrIdxForVirtReg
, P
,
707 !IncrementalUpdate
) &&
708 preservesResourceLen(MBB
, BlockTrace
, InsInstrs
, DelInstrs
)) {
709 if (MBB
->size() > inc_threshold
) {
710 // Use incremental depth updates for basic blocks above treshold
711 IncrementalUpdate
= true;
712 LastUpdate
= BlockIter
;
715 insertDeleteInstructions(MBB
, MI
, InsInstrs
, DelInstrs
, TraceEnsemble
,
716 RegUnits
, TII
, P
, IncrementalUpdate
);
718 // Eagerly stop after the first pattern fires.
722 // Cleanup instructions of the alternative code sequence. There is no
724 MachineFunction
*MF
= MBB
->getParent();
725 for (auto *InstrPtr
: InsInstrs
)
726 MF
->deleteMachineInstr(InstrPtr
);
728 InstrIdxForVirtReg
.clear();
732 if (Changed
&& IncrementalUpdate
)
733 Traces
->invalidate(MBB
);
bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget();
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();
  SchedModel = STI->getSchedModel();
  TSchedModel.init(STI);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  Traces = &getAnalysis<MachineTraceMetrics>();
  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  MBFI = (PSI && PSI->hasProfileSummary()) ?
         &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
         nullptr;
  TraceEnsemble = nullptr;
  OptSize = MF.getFunction().hasOptSize();
  RegClassInfo.runOnMachineFunction(MF);

  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    LLVM_DEBUG(
        dbgs()
        << " Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}