1 //===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // The machine combiner pass uses machine trace metrics to ensure the combined
10 // instructions do not lengthen the critical path or the resource depth.
11 //===----------------------------------------------------------------------===//
13 #include "llvm/ADT/DenseMap.h"
14 #include "llvm/ADT/Statistic.h"
15 #include "llvm/Analysis/ProfileSummaryInfo.h"
16 #include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
17 #include "llvm/CodeGen/MachineDominators.h"
18 #include "llvm/CodeGen/MachineFunction.h"
19 #include "llvm/CodeGen/MachineFunctionPass.h"
20 #include "llvm/CodeGen/MachineLoopInfo.h"
21 #include "llvm/CodeGen/MachineRegisterInfo.h"
22 #include "llvm/CodeGen/MachineSizeOpts.h"
23 #include "llvm/CodeGen/MachineTraceMetrics.h"
24 #include "llvm/CodeGen/Passes.h"
25 #include "llvm/CodeGen/RegisterClassInfo.h"
26 #include "llvm/CodeGen/TargetInstrInfo.h"
27 #include "llvm/CodeGen/TargetRegisterInfo.h"
28 #include "llvm/CodeGen/TargetSchedule.h"
29 #include "llvm/CodeGen/TargetSubtargetInfo.h"
30 #include "llvm/InitializePasses.h"
31 #include "llvm/Support/CommandLine.h"
32 #include "llvm/Support/Debug.h"
33 #include "llvm/Support/raw_ostream.h"
37 #define DEBUG_TYPE "machine-combiner"
39 STATISTIC(NumInstCombined
, "Number of machineinst combined");
41 static cl::opt
<unsigned>
42 inc_threshold("machine-combiner-inc-threshold", cl::Hidden
,
43 cl::desc("Incremental depth computation will be used for basic "
44 "blocks with more instructions."), cl::init(500));
46 static cl::opt
<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden
,
47 cl::desc("Dump all substituted intrs"),
50 #ifdef EXPENSIVE_CHECKS
51 static cl::opt
<bool> VerifyPatternOrder(
52 "machine-combiner-verify-pattern-order", cl::Hidden
,
54 "Verify that the generated patterns are ordered by increasing latency"),
57 static cl::opt
<bool> VerifyPatternOrder(
58 "machine-combiner-verify-pattern-order", cl::Hidden
,
60 "Verify that the generated patterns are ordered by increasing latency"),
65 class MachineCombiner
: public MachineFunctionPass
{
66 const TargetSubtargetInfo
*STI
;
67 const TargetInstrInfo
*TII
;
68 const TargetRegisterInfo
*TRI
;
69 MCSchedModel SchedModel
;
70 MachineRegisterInfo
*MRI
;
71 MachineLoopInfo
*MLI
; // Current MachineLoopInfo
72 MachineTraceMetrics
*Traces
;
73 MachineTraceMetrics::Ensemble
*MinInstr
;
74 MachineBlockFrequencyInfo
*MBFI
;
75 ProfileSummaryInfo
*PSI
;
76 RegisterClassInfo RegClassInfo
;
78 TargetSchedModel TSchedModel
;
80 /// True if optimizing for code size.
85 MachineCombiner() : MachineFunctionPass(ID
) {
86 initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
88 void getAnalysisUsage(AnalysisUsage
&AU
) const override
;
89 bool runOnMachineFunction(MachineFunction
&MF
) override
;
90 StringRef
getPassName() const override
{ return "Machine InstCombiner"; }
93 bool doSubstitute(unsigned NewSize
, unsigned OldSize
, bool OptForSize
);
94 bool combineInstructions(MachineBasicBlock
*);
95 MachineInstr
*getOperandDef(const MachineOperand
&MO
);
96 unsigned getDepth(SmallVectorImpl
<MachineInstr
*> &InsInstrs
,
97 DenseMap
<unsigned, unsigned> &InstrIdxForVirtReg
,
98 MachineTraceMetrics::Trace BlockTrace
);
99 unsigned getLatency(MachineInstr
*Root
, MachineInstr
*NewRoot
,
100 MachineTraceMetrics::Trace BlockTrace
);
102 improvesCriticalPathLen(MachineBasicBlock
*MBB
, MachineInstr
*Root
,
103 MachineTraceMetrics::Trace BlockTrace
,
104 SmallVectorImpl
<MachineInstr
*> &InsInstrs
,
105 SmallVectorImpl
<MachineInstr
*> &DelInstrs
,
106 DenseMap
<unsigned, unsigned> &InstrIdxForVirtReg
,
107 MachineCombinerPattern Pattern
, bool SlackIsAccurate
);
108 bool reduceRegisterPressure(MachineInstr
&Root
, MachineBasicBlock
*MBB
,
109 SmallVectorImpl
<MachineInstr
*> &InsInstrs
,
110 SmallVectorImpl
<MachineInstr
*> &DelInstrs
,
111 MachineCombinerPattern Pattern
);
112 bool preservesResourceLen(MachineBasicBlock
*MBB
,
113 MachineTraceMetrics::Trace BlockTrace
,
114 SmallVectorImpl
<MachineInstr
*> &InsInstrs
,
115 SmallVectorImpl
<MachineInstr
*> &DelInstrs
);
116 void instr2instrSC(SmallVectorImpl
<MachineInstr
*> &Instrs
,
117 SmallVectorImpl
<const MCSchedClassDesc
*> &InstrsSC
);
118 std::pair
<unsigned, unsigned>
119 getLatenciesForInstrSequences(MachineInstr
&MI
,
120 SmallVectorImpl
<MachineInstr
*> &InsInstrs
,
121 SmallVectorImpl
<MachineInstr
*> &DelInstrs
,
122 MachineTraceMetrics::Trace BlockTrace
);
124 void verifyPatternOrder(MachineBasicBlock
*MBB
, MachineInstr
&Root
,
125 SmallVector
<MachineCombinerPattern
, 16> &Patterns
);
129 char MachineCombiner::ID
= 0;
130 char &llvm::MachineCombinerID
= MachineCombiner::ID
;
132 INITIALIZE_PASS_BEGIN(MachineCombiner
, DEBUG_TYPE
,
133 "Machine InstCombiner", false, false)
134 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo
)
135 INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics
)
136 INITIALIZE_PASS_END(MachineCombiner
, DEBUG_TYPE
, "Machine InstCombiner",
139 void MachineCombiner::getAnalysisUsage(AnalysisUsage
&AU
) const {
140 AU
.setPreservesCFG();
141 AU
.addPreserved
<MachineDominatorTree
>();
142 AU
.addRequired
<MachineLoopInfo
>();
143 AU
.addPreserved
<MachineLoopInfo
>();
144 AU
.addRequired
<MachineTraceMetrics
>();
145 AU
.addPreserved
<MachineTraceMetrics
>();
146 AU
.addRequired
<LazyMachineBlockFrequencyInfoPass
>();
147 AU
.addRequired
<ProfileSummaryInfoWrapperPass
>();
148 MachineFunctionPass::getAnalysisUsage(AU
);
151 MachineInstr
*MachineCombiner::getOperandDef(const MachineOperand
&MO
) {
152 MachineInstr
*DefInstr
= nullptr;
153 // We need a virtual register definition.
154 if (MO
.isReg() && Register::isVirtualRegister(MO
.getReg()))
155 DefInstr
= MRI
->getUniqueVRegDef(MO
.getReg());
156 // PHI's have no depth etc.
157 if (DefInstr
&& DefInstr
->isPHI())
162 /// Computes depth of instructions in vector \InsInstr.
164 /// \param InsInstrs is a vector of machine instructions
165 /// \param InstrIdxForVirtReg is a dense map of virtual register to index
166 /// of defining machine instruction in \p InsInstrs
167 /// \param BlockTrace is a trace of machine instructions
169 /// \returns Depth of last instruction in \InsInstrs ("NewRoot")
171 MachineCombiner::getDepth(SmallVectorImpl
<MachineInstr
*> &InsInstrs
,
172 DenseMap
<unsigned, unsigned> &InstrIdxForVirtReg
,
173 MachineTraceMetrics::Trace BlockTrace
) {
174 SmallVector
<unsigned, 16> InstrDepth
;
175 assert(TSchedModel
.hasInstrSchedModelOrItineraries() &&
176 "Missing machine model\n");
178 // For each instruction in the new sequence compute the depth based on the
179 // operands. Use the trace information when possible. For new operands which
180 // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
181 for (auto *InstrPtr
: InsInstrs
) { // for each Use
183 for (const MachineOperand
&MO
: InstrPtr
->operands()) {
184 // Check for virtual register operand.
185 if (!(MO
.isReg() && Register::isVirtualRegister(MO
.getReg())))
189 unsigned DepthOp
= 0;
190 unsigned LatencyOp
= 0;
191 DenseMap
<unsigned, unsigned>::iterator II
=
192 InstrIdxForVirtReg
.find(MO
.getReg());
193 if (II
!= InstrIdxForVirtReg
.end()) {
194 // Operand is new virtual register not in trace
195 assert(II
->second
< InstrDepth
.size() && "Bad Index");
196 MachineInstr
*DefInstr
= InsInstrs
[II
->second
];
198 "There must be a definition for a new virtual register");
199 DepthOp
= InstrDepth
[II
->second
];
200 int DefIdx
= DefInstr
->findRegisterDefOperandIdx(MO
.getReg());
201 int UseIdx
= InstrPtr
->findRegisterUseOperandIdx(MO
.getReg());
202 LatencyOp
= TSchedModel
.computeOperandLatency(DefInstr
, DefIdx
,
205 MachineInstr
*DefInstr
= getOperandDef(MO
);
207 DepthOp
= BlockTrace
.getInstrCycles(*DefInstr
).Depth
;
208 LatencyOp
= TSchedModel
.computeOperandLatency(
209 DefInstr
, DefInstr
->findRegisterDefOperandIdx(MO
.getReg()),
210 InstrPtr
, InstrPtr
->findRegisterUseOperandIdx(MO
.getReg()));
213 IDepth
= std::max(IDepth
, DepthOp
+ LatencyOp
);
215 InstrDepth
.push_back(IDepth
);
217 unsigned NewRootIdx
= InsInstrs
.size() - 1;
218 return InstrDepth
[NewRootIdx
];
221 /// Computes instruction latency as max of latency of defined operands.
223 /// \param Root is a machine instruction that could be replaced by NewRoot.
224 /// It is used to compute a more accurate latency information for NewRoot in
225 /// case there is a dependent instruction in the same trace (\p BlockTrace)
226 /// \param NewRoot is the instruction for which the latency is computed
227 /// \param BlockTrace is a trace of machine instructions
229 /// \returns Latency of \p NewRoot
230 unsigned MachineCombiner::getLatency(MachineInstr
*Root
, MachineInstr
*NewRoot
,
231 MachineTraceMetrics::Trace BlockTrace
) {
232 assert(TSchedModel
.hasInstrSchedModelOrItineraries() &&
233 "Missing machine model\n");
235 // Check each definition in NewRoot and compute the latency
236 unsigned NewRootLatency
= 0;
238 for (const MachineOperand
&MO
: NewRoot
->operands()) {
239 // Check for virtual register operand.
240 if (!(MO
.isReg() && Register::isVirtualRegister(MO
.getReg())))
244 // Get the first instruction that uses MO
245 MachineRegisterInfo::reg_iterator RI
= MRI
->reg_begin(MO
.getReg());
247 if (RI
== MRI
->reg_end())
249 MachineInstr
*UseMO
= RI
->getParent();
250 unsigned LatencyOp
= 0;
251 if (UseMO
&& BlockTrace
.isDepInTrace(*Root
, *UseMO
)) {
252 LatencyOp
= TSchedModel
.computeOperandLatency(
253 NewRoot
, NewRoot
->findRegisterDefOperandIdx(MO
.getReg()), UseMO
,
254 UseMO
->findRegisterUseOperandIdx(MO
.getReg()));
256 LatencyOp
= TSchedModel
.computeInstrLatency(NewRoot
);
258 NewRootLatency
= std::max(NewRootLatency
, LatencyOp
);
260 return NewRootLatency
;
263 /// The combiner's goal may differ based on which pattern it is attempting
265 enum class CombinerObjective
{
266 MustReduceDepth
, // The data dependency chain must be improved.
267 MustReduceRegisterPressure
, // The register pressure must be reduced.
268 Default
// The critical path must not be lengthened.
271 static CombinerObjective
getCombinerObjective(MachineCombinerPattern P
) {
272 // TODO: If C++ ever gets a real enum class, make this part of the
273 // MachineCombinerPattern class.
275 case MachineCombinerPattern::REASSOC_AX_BY
:
276 case MachineCombinerPattern::REASSOC_AX_YB
:
277 case MachineCombinerPattern::REASSOC_XA_BY
:
278 case MachineCombinerPattern::REASSOC_XA_YB
:
279 case MachineCombinerPattern::REASSOC_XY_AMM_BMM
:
280 case MachineCombinerPattern::REASSOC_XMM_AMM_BMM
:
281 return CombinerObjective::MustReduceDepth
;
282 case MachineCombinerPattern::REASSOC_XY_BCA
:
283 case MachineCombinerPattern::REASSOC_XY_BAC
:
284 return CombinerObjective::MustReduceRegisterPressure
;
286 return CombinerObjective::Default
;
290 /// Estimate the latency of the new and original instruction sequence by summing
291 /// up the latencies of the inserted and deleted instructions. This assumes
292 /// that the inserted and deleted instructions are dependent instruction chains,
293 /// which might not hold in all cases.
294 std::pair
<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
295 MachineInstr
&MI
, SmallVectorImpl
<MachineInstr
*> &InsInstrs
,
296 SmallVectorImpl
<MachineInstr
*> &DelInstrs
,
297 MachineTraceMetrics::Trace BlockTrace
) {
298 assert(!InsInstrs
.empty() && "Only support sequences that insert instrs.");
299 unsigned NewRootLatency
= 0;
300 // NewRoot is the last instruction in the \p InsInstrs vector.
301 MachineInstr
*NewRoot
= InsInstrs
.back();
302 for (unsigned i
= 0; i
< InsInstrs
.size() - 1; i
++)
303 NewRootLatency
+= TSchedModel
.computeInstrLatency(InsInstrs
[i
]);
304 NewRootLatency
+= getLatency(&MI
, NewRoot
, BlockTrace
);
306 unsigned RootLatency
= 0;
307 for (auto I
: DelInstrs
)
308 RootLatency
+= TSchedModel
.computeInstrLatency(I
);
310 return {NewRootLatency
, RootLatency
};
313 bool MachineCombiner::reduceRegisterPressure(
314 MachineInstr
&Root
, MachineBasicBlock
*MBB
,
315 SmallVectorImpl
<MachineInstr
*> &InsInstrs
,
316 SmallVectorImpl
<MachineInstr
*> &DelInstrs
,
317 MachineCombinerPattern Pattern
) {
318 // FIXME: for now, we don't do any check for the register pressure patterns.
319 // We treat them as always profitable. But we can do better if we make
320 // RegPressureTracker class be aware of TIE attribute. Then we can get an
321 // accurate compare of register pressure with DelInstrs or InsInstrs.
325 /// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
326 /// The new code sequence ends in MI NewRoot. A necessary condition for the new
327 /// sequence to replace the old sequence is that it cannot lengthen the critical
328 /// path. The definition of "improve" may be restricted by specifying that the
329 /// new path improves the data dependency chain (MustReduceDepth).
330 bool MachineCombiner::improvesCriticalPathLen(
331 MachineBasicBlock
*MBB
, MachineInstr
*Root
,
332 MachineTraceMetrics::Trace BlockTrace
,
333 SmallVectorImpl
<MachineInstr
*> &InsInstrs
,
334 SmallVectorImpl
<MachineInstr
*> &DelInstrs
,
335 DenseMap
<unsigned, unsigned> &InstrIdxForVirtReg
,
336 MachineCombinerPattern Pattern
,
337 bool SlackIsAccurate
) {
338 assert(TSchedModel
.hasInstrSchedModelOrItineraries() &&
339 "Missing machine model\n");
340 // Get depth and latency of NewRoot and Root.
341 unsigned NewRootDepth
= getDepth(InsInstrs
, InstrIdxForVirtReg
, BlockTrace
);
342 unsigned RootDepth
= BlockTrace
.getInstrCycles(*Root
).Depth
;
344 LLVM_DEBUG(dbgs() << " Dependence data for " << *Root
<< "\tNewRootDepth: "
345 << NewRootDepth
<< "\tRootDepth: " << RootDepth
);
347 // For a transform such as reassociation, the cost equation is
348 // conservatively calculated so that we must improve the depth (data
349 // dependency cycles) in the critical path to proceed with the transform.
350 // Being conservative also protects against inaccuracies in the underlying
351 // machine trace metrics and CPU models.
352 if (getCombinerObjective(Pattern
) == CombinerObjective::MustReduceDepth
) {
353 LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
354 LLVM_DEBUG(NewRootDepth
< RootDepth
355 ? dbgs() << "\t and it does it\n"
356 : dbgs() << "\t but it does NOT do it\n");
357 return NewRootDepth
< RootDepth
;
360 // A more flexible cost calculation for the critical path includes the slack
361 // of the original code sequence. This may allow the transform to proceed
362 // even if the instruction depths (data dependency cycles) become worse.
364 // Account for the latency of the inserted and deleted instructions by
365 unsigned NewRootLatency
, RootLatency
;
366 std::tie(NewRootLatency
, RootLatency
) =
367 getLatenciesForInstrSequences(*Root
, InsInstrs
, DelInstrs
, BlockTrace
);
369 unsigned RootSlack
= BlockTrace
.getInstrSlack(*Root
);
370 unsigned NewCycleCount
= NewRootDepth
+ NewRootLatency
;
371 unsigned OldCycleCount
=
372 RootDepth
+ RootLatency
+ (SlackIsAccurate
? RootSlack
: 0);
373 LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
374 << "\tRootLatency: " << RootLatency
<< "\n\tRootSlack: "
375 << RootSlack
<< " SlackIsAccurate=" << SlackIsAccurate
376 << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
377 << "\n\tRootDepth + RootLatency + RootSlack = "
379 LLVM_DEBUG(NewCycleCount
<= OldCycleCount
380 ? dbgs() << "\n\t It IMPROVES PathLen because"
381 : dbgs() << "\n\t It DOES NOT improve PathLen because");
382 LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
383 << ", OldCycleCount = " << OldCycleCount
<< "\n");
385 return NewCycleCount
<= OldCycleCount
;
388 /// helper routine to convert instructions into SC
389 void MachineCombiner::instr2instrSC(
390 SmallVectorImpl
<MachineInstr
*> &Instrs
,
391 SmallVectorImpl
<const MCSchedClassDesc
*> &InstrsSC
) {
392 for (auto *InstrPtr
: Instrs
) {
393 unsigned Opc
= InstrPtr
->getOpcode();
394 unsigned Idx
= TII
->get(Opc
).getSchedClass();
395 const MCSchedClassDesc
*SC
= SchedModel
.getSchedClassDesc(Idx
);
396 InstrsSC
.push_back(SC
);
400 /// True when the new instructions do not increase resource length
401 bool MachineCombiner::preservesResourceLen(
402 MachineBasicBlock
*MBB
, MachineTraceMetrics::Trace BlockTrace
,
403 SmallVectorImpl
<MachineInstr
*> &InsInstrs
,
404 SmallVectorImpl
<MachineInstr
*> &DelInstrs
) {
405 if (!TSchedModel
.hasInstrSchedModel())
408 // Compute current resource length
410 //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
411 SmallVector
<const MachineBasicBlock
*, 1> MBBarr
;
412 MBBarr
.push_back(MBB
);
413 unsigned ResLenBeforeCombine
= BlockTrace
.getResourceLength(MBBarr
);
415 // Deal with SC rather than Instructions.
416 SmallVector
<const MCSchedClassDesc
*, 16> InsInstrsSC
;
417 SmallVector
<const MCSchedClassDesc
*, 16> DelInstrsSC
;
419 instr2instrSC(InsInstrs
, InsInstrsSC
);
420 instr2instrSC(DelInstrs
, DelInstrsSC
);
422 ArrayRef
<const MCSchedClassDesc
*> MSCInsArr
= makeArrayRef(InsInstrsSC
);
423 ArrayRef
<const MCSchedClassDesc
*> MSCDelArr
= makeArrayRef(DelInstrsSC
);
425 // Compute new resource length.
426 unsigned ResLenAfterCombine
=
427 BlockTrace
.getResourceLength(MBBarr
, MSCInsArr
, MSCDelArr
);
429 LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
430 << ResLenBeforeCombine
431 << " and after: " << ResLenAfterCombine
<< "\n";);
433 ResLenAfterCombine
<=
434 ResLenBeforeCombine
+ TII
->getExtendResourceLenLimit()
435 ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
436 : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
439 return ResLenAfterCombine
<=
440 ResLenBeforeCombine
+ TII
->getExtendResourceLenLimit();
443 /// \returns true when new instruction sequence should be generated
444 /// independent if it lengthens critical path or not
445 bool MachineCombiner::doSubstitute(unsigned NewSize
, unsigned OldSize
,
447 if (OptForSize
&& (NewSize
< OldSize
))
449 if (!TSchedModel
.hasInstrSchedModelOrItineraries())
454 /// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
455 /// depths if requested.
457 /// \param MBB basic block to insert instructions in
458 /// \param MI current machine instruction
459 /// \param InsInstrs new instructions to insert in \p MBB
460 /// \param DelInstrs instruction to delete from \p MBB
461 /// \param MinInstr is a pointer to the machine trace information
462 /// \param RegUnits set of live registers, needed to compute instruction depths
463 /// \param TII is target instruction info, used to call target hook
464 /// \param Pattern is used to call target hook finalizeInsInstrs
465 /// \param IncrementalUpdate if true, compute instruction depths incrementally,
466 /// otherwise invalidate the trace
467 static void insertDeleteInstructions(MachineBasicBlock
*MBB
, MachineInstr
&MI
,
468 SmallVector
<MachineInstr
*, 16> InsInstrs
,
469 SmallVector
<MachineInstr
*, 16> DelInstrs
,
470 MachineTraceMetrics::Ensemble
*MinInstr
,
471 SparseSet
<LiveRegUnit
> &RegUnits
,
472 const TargetInstrInfo
*TII
,
473 MachineCombinerPattern Pattern
,
474 bool IncrementalUpdate
) {
475 // If we want to fix up some placeholder for some target, do it now.
476 // We need this because in genAlternativeCodeSequence, we have not decided the
477 // better pattern InsInstrs or DelInstrs, so we don't want generate some
478 // sideeffect to the function. For example we need to delay the constant pool
479 // entry creation here after InsInstrs is selected as better pattern.
480 // Otherwise the constant pool entry created for InsInstrs will not be deleted
481 // even if InsInstrs is not the better pattern.
482 TII
->finalizeInsInstrs(MI
, Pattern
, InsInstrs
);
484 for (auto *InstrPtr
: InsInstrs
)
485 MBB
->insert((MachineBasicBlock::iterator
)&MI
, InstrPtr
);
487 for (auto *InstrPtr
: DelInstrs
) {
488 InstrPtr
->eraseFromParent();
489 // Erase all LiveRegs defined by the removed instruction
490 for (auto I
= RegUnits
.begin(); I
!= RegUnits
.end(); ) {
491 if (I
->MI
== InstrPtr
)
492 I
= RegUnits
.erase(I
);
498 if (IncrementalUpdate
)
499 for (auto *InstrPtr
: InsInstrs
)
500 MinInstr
->updateDepth(MBB
, *InstrPtr
, RegUnits
);
502 MinInstr
->invalidate(MBB
);
507 // Check that the difference between original and new latency is decreasing for
508 // later patterns. This helps to discover sub-optimal pattern orderings.
509 void MachineCombiner::verifyPatternOrder(
510 MachineBasicBlock
*MBB
, MachineInstr
&Root
,
511 SmallVector
<MachineCombinerPattern
, 16> &Patterns
) {
512 long PrevLatencyDiff
= std::numeric_limits
<long>::max();
513 (void)PrevLatencyDiff
; // Variable is used in assert only.
514 for (auto P
: Patterns
) {
515 SmallVector
<MachineInstr
*, 16> InsInstrs
;
516 SmallVector
<MachineInstr
*, 16> DelInstrs
;
517 DenseMap
<unsigned, unsigned> InstrIdxForVirtReg
;
518 TII
->genAlternativeCodeSequence(Root
, P
, InsInstrs
, DelInstrs
,
520 // Found pattern, but did not generate alternative sequence.
521 // This can happen e.g. when an immediate could not be materialized
522 // in a single instruction.
523 if (InsInstrs
.empty() || !TSchedModel
.hasInstrSchedModelOrItineraries())
526 unsigned NewRootLatency
, RootLatency
;
527 std::tie(NewRootLatency
, RootLatency
) = getLatenciesForInstrSequences(
528 Root
, InsInstrs
, DelInstrs
, MinInstr
->getTrace(MBB
));
529 long CurrentLatencyDiff
= ((long)RootLatency
) - ((long)NewRootLatency
);
530 assert(CurrentLatencyDiff
<= PrevLatencyDiff
&&
531 "Current pattern is better than previous pattern.");
532 PrevLatencyDiff
= CurrentLatencyDiff
;
536 /// Substitute a slow code sequence with a faster one by
537 /// evaluating instruction combining pattern.
538 /// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
539 /// combining based on machine trace metrics. Only combine a sequence of
540 /// instructions when this neither lengthens the critical path nor increases
541 /// resource pressure. When optimizing for codesize always combine when the new
542 /// sequence is shorter.
543 bool MachineCombiner::combineInstructions(MachineBasicBlock
*MBB
) {
544 bool Changed
= false;
545 LLVM_DEBUG(dbgs() << "Combining MBB " << MBB
->getName() << "\n");
547 bool IncrementalUpdate
= false;
548 auto BlockIter
= MBB
->begin();
549 decltype(BlockIter
) LastUpdate
;
550 // Check if the block is in a loop.
551 const MachineLoop
*ML
= MLI
->getLoopFor(MBB
);
553 MinInstr
= Traces
->getEnsemble(MachineTraceMetrics::TS_MinInstrCount
);
555 SparseSet
<LiveRegUnit
> RegUnits
;
556 RegUnits
.setUniverse(TRI
->getNumRegUnits());
558 bool OptForSize
= OptSize
|| llvm::shouldOptimizeForSize(MBB
, PSI
, MBFI
);
560 bool DoRegPressureReduce
=
561 TII
->shouldReduceRegisterPressure(MBB
, &RegClassInfo
);
563 while (BlockIter
!= MBB
->end()) {
564 auto &MI
= *BlockIter
++;
565 SmallVector
<MachineCombinerPattern
, 16> Patterns
;
566 // The motivating example is:
568 // MUL Other MUL_op1 MUL_op2 Other
570 // ADD/SUB => MADD/MSUB
571 // (=Root) (=NewRoot)
573 // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
574 // usually beneficial for code size it unfortunately can hurt performance
575 // when the ADD is on the critical path, but the MUL is not. With the
576 // substitution the MUL becomes part of the critical path (in form of the
577 // MADD) and can lengthen it on architectures where the MADD latency is
578 // longer than the ADD latency.
580 // For each instruction we check if it can be the root of a combiner
581 // pattern. Then for each pattern the new code sequence in form of MI is
582 // generated and evaluated. When the efficiency criteria (don't lengthen
583 // critical path, don't use more resources) is met the new sequence gets
584 // hooked up into the basic block before the old sequence is removed.
586 // The algorithm does not try to evaluate all patterns and pick the best.
587 // This is only an artificial restriction though. In practice there is
588 // mostly one pattern, and getMachineCombinerPatterns() can order patterns
589 // based on an internal cost heuristic. If
590 // machine-combiner-verify-pattern-order is enabled, all patterns are
591 // checked to ensure later patterns do not provide better latency savings.
593 if (!TII
->getMachineCombinerPatterns(MI
, Patterns
, DoRegPressureReduce
))
596 if (VerifyPatternOrder
)
597 verifyPatternOrder(MBB
, MI
, Patterns
);
599 for (auto P
: Patterns
) {
600 SmallVector
<MachineInstr
*, 16> InsInstrs
;
601 SmallVector
<MachineInstr
*, 16> DelInstrs
;
602 DenseMap
<unsigned, unsigned> InstrIdxForVirtReg
;
603 TII
->genAlternativeCodeSequence(MI
, P
, InsInstrs
, DelInstrs
,
605 unsigned NewInstCount
= InsInstrs
.size();
606 unsigned OldInstCount
= DelInstrs
.size();
607 // Found pattern, but did not generate alternative sequence.
608 // This can happen e.g. when an immediate could not be materialized
609 // in a single instruction.
613 LLVM_DEBUG(if (dump_intrs
) {
614 dbgs() << "\tFor the Pattern (" << (int)P
615 << ") these instructions could be removed\n";
616 for (auto const *InstrPtr
: DelInstrs
)
617 InstrPtr
->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
618 /*SkipDebugLoc*/false, /*AddNewLine*/true, TII
);
619 dbgs() << "\tThese instructions could replace the removed ones\n";
620 for (auto const *InstrPtr
: InsInstrs
)
621 InstrPtr
->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
622 /*SkipDebugLoc*/false, /*AddNewLine*/true, TII
);
625 bool SubstituteAlways
= false;
626 if (ML
&& TII
->isThroughputPattern(P
))
627 SubstituteAlways
= true;
629 if (IncrementalUpdate
&& LastUpdate
!= BlockIter
) {
630 // Update depths since the last incremental update.
631 MinInstr
->updateDepths(LastUpdate
, BlockIter
, RegUnits
);
632 LastUpdate
= BlockIter
;
635 if (DoRegPressureReduce
&&
636 getCombinerObjective(P
) ==
637 CombinerObjective::MustReduceRegisterPressure
) {
638 if (MBB
->size() > inc_threshold
) {
639 // Use incremental depth updates for basic blocks above threshold
640 IncrementalUpdate
= true;
641 LastUpdate
= BlockIter
;
643 if (reduceRegisterPressure(MI
, MBB
, InsInstrs
, DelInstrs
, P
)) {
644 // Replace DelInstrs with InsInstrs.
645 insertDeleteInstructions(MBB
, MI
, InsInstrs
, DelInstrs
, MinInstr
,
646 RegUnits
, TII
, P
, IncrementalUpdate
);
649 // Go back to previous instruction as it may have ILP reassociation
656 // Substitute when we optimize for codesize and the new sequence has
657 // fewer instructions OR
658 // the new sequence neither lengthens the critical path nor increases
659 // resource pressure.
660 if (SubstituteAlways
||
661 doSubstitute(NewInstCount
, OldInstCount
, OptForSize
)) {
662 insertDeleteInstructions(MBB
, MI
, InsInstrs
, DelInstrs
, MinInstr
,
663 RegUnits
, TII
, P
, IncrementalUpdate
);
664 // Eagerly stop after the first pattern fires.
668 // For big basic blocks, we only compute the full trace the first time
669 // we hit this. We do not invalidate the trace, but instead update the
670 // instruction depths incrementally.
671 // NOTE: Only the instruction depths up to MI are accurate. All other
672 // trace information is not updated.
673 MachineTraceMetrics::Trace BlockTrace
= MinInstr
->getTrace(MBB
);
674 Traces
->verifyAnalysis();
675 if (improvesCriticalPathLen(MBB
, &MI
, BlockTrace
, InsInstrs
, DelInstrs
,
676 InstrIdxForVirtReg
, P
,
677 !IncrementalUpdate
) &&
678 preservesResourceLen(MBB
, BlockTrace
, InsInstrs
, DelInstrs
)) {
679 if (MBB
->size() > inc_threshold
) {
680 // Use incremental depth updates for basic blocks above treshold
681 IncrementalUpdate
= true;
682 LastUpdate
= BlockIter
;
685 insertDeleteInstructions(MBB
, MI
, InsInstrs
, DelInstrs
, MinInstr
,
686 RegUnits
, TII
, P
, IncrementalUpdate
);
688 // Eagerly stop after the first pattern fires.
692 // Cleanup instructions of the alternative code sequence. There is no
694 MachineFunction
*MF
= MBB
->getParent();
695 for (auto *InstrPtr
: InsInstrs
)
696 MF
->deleteMachineInstr(InstrPtr
);
698 InstrIdxForVirtReg
.clear();
702 if (Changed
&& IncrementalUpdate
)
703 Traces
->invalidate(MBB
);
707 bool MachineCombiner::runOnMachineFunction(MachineFunction
&MF
) {
708 STI
= &MF
.getSubtarget();
709 TII
= STI
->getInstrInfo();
710 TRI
= STI
->getRegisterInfo();
711 SchedModel
= STI
->getSchedModel();
712 TSchedModel
.init(STI
);
713 MRI
= &MF
.getRegInfo();
714 MLI
= &getAnalysis
<MachineLoopInfo
>();
715 Traces
= &getAnalysis
<MachineTraceMetrics
>();
716 PSI
= &getAnalysis
<ProfileSummaryInfoWrapperPass
>().getPSI();
717 MBFI
= (PSI
&& PSI
->hasProfileSummary()) ?
718 &getAnalysis
<LazyMachineBlockFrequencyInfoPass
>().getBFI() :
721 OptSize
= MF
.getFunction().hasOptSize();
722 RegClassInfo
.runOnMachineFunction(MF
);
724 LLVM_DEBUG(dbgs() << getPassName() << ": " << MF
.getName() << '\n');
725 if (!TII
->useMachineCombiner()) {
728 << " Skipping pass: Target does not support machine combiner\n");
732 bool Changed
= false;
734 // Try to combine instructions.
736 Changed
|= combineInstructions(&MBB
);