1 //===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This pass converts selects to conditional jumps when profitable.
11 //===----------------------------------------------------------------------===//
13 #include "llvm/CodeGen/SelectOptimize.h"
14 #include "llvm/ADT/SmallVector.h"
15 #include "llvm/ADT/Statistic.h"
16 #include "llvm/Analysis/BlockFrequencyInfo.h"
17 #include "llvm/Analysis/BranchProbabilityInfo.h"
18 #include "llvm/Analysis/LoopInfo.h"
19 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
20 #include "llvm/Analysis/ProfileSummaryInfo.h"
21 #include "llvm/Analysis/TargetTransformInfo.h"
22 #include "llvm/CodeGen/Passes.h"
23 #include "llvm/CodeGen/TargetLowering.h"
24 #include "llvm/CodeGen/TargetPassConfig.h"
25 #include "llvm/CodeGen/TargetSchedule.h"
26 #include "llvm/CodeGen/TargetSubtargetInfo.h"
27 #include "llvm/IR/BasicBlock.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Function.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/Instruction.h"
32 #include "llvm/IR/PatternMatch.h"
33 #include "llvm/IR/ProfDataUtils.h"
34 #include "llvm/InitializePasses.h"
35 #include "llvm/Pass.h"
36 #include "llvm/Support/ScaledNumber.h"
37 #include "llvm/Target/TargetMachine.h"
38 #include "llvm/Transforms/Utils/SizeOpts.h"
45 using namespace llvm::PatternMatch
;
47 #define DEBUG_TYPE "select-optimize"
49 STATISTIC(NumSelectOptAnalyzed
,
50 "Number of select groups considered for conversion to branch");
51 STATISTIC(NumSelectConvertedExpColdOperand
,
52 "Number of select groups converted due to expensive cold operand");
53 STATISTIC(NumSelectConvertedHighPred
,
54 "Number of select groups converted due to high-predictability");
55 STATISTIC(NumSelectUnPred
,
56 "Number of select groups not converted due to unpredictability");
57 STATISTIC(NumSelectColdBB
,
58 "Number of select groups not converted due to cold basic block");
59 STATISTIC(NumSelectConvertedLoop
,
60 "Number of select groups converted due to loop-level analysis");
61 STATISTIC(NumSelectsConverted
, "Number of selects converted");
63 static cl::opt
<unsigned> ColdOperandThreshold(
64 "cold-operand-threshold",
65 cl::desc("Maximum frequency of path for an operand to be considered cold."),
66 cl::init(20), cl::Hidden
);
68 static cl::opt
<unsigned> ColdOperandMaxCostMultiplier(
69 "cold-operand-max-cost-multiplier",
70 cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
71 "slice of a cold operand to be considered inexpensive."),
72 cl::init(1), cl::Hidden
);
74 static cl::opt
<unsigned>
75 GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
76 cl::desc("Gradient gain threshold (%)."),
77 cl::init(25), cl::Hidden
);
79 static cl::opt
<unsigned>
80 GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
81 cl::desc("Minimum gain per loop (in cycles) threshold."),
82 cl::init(4), cl::Hidden
);
84 static cl::opt
<unsigned> GainRelativeThreshold(
85 "select-opti-loop-relative-gain-threshold",
87 "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
88 cl::init(8), cl::Hidden
);
90 static cl::opt
<unsigned> MispredictDefaultRate(
91 "mispredict-default-rate", cl::Hidden
, cl::init(25),
92 cl::desc("Default mispredict rate (initialized to 25%)."));
95 DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden
,
97 cl::desc("Disable loop-level heuristics."));
101 class SelectOptimizeImpl
{
102 const TargetMachine
*TM
= nullptr;
103 const TargetSubtargetInfo
*TSI
= nullptr;
104 const TargetLowering
*TLI
= nullptr;
105 const TargetTransformInfo
*TTI
= nullptr;
106 const LoopInfo
*LI
= nullptr;
107 BlockFrequencyInfo
*BFI
;
108 ProfileSummaryInfo
*PSI
= nullptr;
109 OptimizationRemarkEmitter
*ORE
= nullptr;
110 TargetSchedModel TSchedModel
;
113 SelectOptimizeImpl() = default;
114 SelectOptimizeImpl(const TargetMachine
*TM
) : TM(TM
){};
115 PreservedAnalyses
run(Function
&F
, FunctionAnalysisManager
&FAM
);
116 bool runOnFunction(Function
&F
, Pass
&P
);
118 using Scaled64
= ScaledNumber
<uint64_t>;
121 /// Predicated cost (with selects as conditional moves).
123 /// Non-predicated cost (with selects converted to branches).
124 Scaled64 NonPredCost
;
127 /// SelectLike is an abstraction over SelectInst and other operations that can
128 /// act like selects. For example Or(Zext(icmp), X) can be treated like
129 /// select(icmp, X|1, X).
131 SelectLike(Instruction
*I
) : I(I
) {}
136 /// Match a select or select-like instruction, returning a SelectLike.
137 static SelectLike
match(Instruction
*I
) {
138 // Select instruction are what we are usually looking for.
139 if (isa
<SelectInst
>(I
))
140 return SelectLike(I
);
142 // An Or(zext(i1 X), Y) can also be treated like a select, with condition
143 // C and values Y|1 and Y.
145 if (PatternMatch::match(
146 I
, m_c_Or(m_OneUse(m_ZExt(m_Value(X
))), m_Value())) &&
147 X
->getType()->isIntegerTy(1))
148 return SelectLike(I
);
150 return SelectLike(nullptr);
153 bool isValid() { return I
; }
154 operator bool() { return isValid(); }
156 Instruction
*getI() { return I
; }
157 const Instruction
*getI() const { return I
; }
159 Type
*getType() const { return I
->getType(); }
161 /// Return the condition for the SelectLike instruction. For example the
162 /// condition of a select or c in `or(zext(c), x)`
163 Value
*getCondition() const {
164 if (auto *Sel
= dyn_cast
<SelectInst
>(I
))
165 return Sel
->getCondition();
167 if (auto *BO
= dyn_cast
<BinaryOperator
>(I
)) {
169 if (PatternMatch::match(BO
->getOperand(0),
170 m_OneUse(m_ZExt(m_Value(X
)))))
172 if (PatternMatch::match(BO
->getOperand(1),
173 m_OneUse(m_ZExt(m_Value(X
)))))
177 llvm_unreachable("Unhandled case in getCondition");
180 /// Return the true value for the SelectLike instruction. Note this may not
181 /// exist for all SelectLike instructions. For example, for `or(zext(c), x)`
182 /// the true value would be `or(x,1)`. As this value does not exist, nullptr
184 Value
*getTrueValue() const {
185 if (auto *Sel
= dyn_cast
<SelectInst
>(I
))
186 return Sel
->getTrueValue();
187 // Or(zext) case - The true value is Or(X), so return nullptr as the value
188 // does not yet exist.
189 if (isa
<BinaryOperator
>(I
))
192 llvm_unreachable("Unhandled case in getTrueValue");
195 /// Return the false value for the SelectLike instruction. For example the
196 /// getFalseValue of a select or `x` in `or(zext(c), x)` (which is
197 /// `select(c, x|1, x)`)
198 Value
*getFalseValue() const {
199 if (auto *Sel
= dyn_cast
<SelectInst
>(I
))
200 return Sel
->getFalseValue();
201 // Or(zext) case - return the operand which is not the zext.
202 if (auto *BO
= dyn_cast
<BinaryOperator
>(I
)) {
204 if (PatternMatch::match(BO
->getOperand(0),
205 m_OneUse(m_ZExt(m_Value(X
)))))
206 return BO
->getOperand(1);
207 if (PatternMatch::match(BO
->getOperand(1),
208 m_OneUse(m_ZExt(m_Value(X
)))))
209 return BO
->getOperand(0);
212 llvm_unreachable("Unhandled case in getFalseValue");
215 /// Return the NonPredCost cost of the true op, given the costs in
216 /// InstCostMap. This may need to be generated for select-like instructions.
217 Scaled64
getTrueOpCost(DenseMap
<const Instruction
*, CostInfo
> &InstCostMap
,
218 const TargetTransformInfo
*TTI
) {
219 if (auto *Sel
= dyn_cast
<SelectInst
>(I
))
220 if (auto *I
= dyn_cast
<Instruction
>(Sel
->getTrueValue()))
221 return InstCostMap
.contains(I
) ? InstCostMap
[I
].NonPredCost
222 : Scaled64::getZero();
224 // Or case - add the cost of an extra Or to the cost of the False case.
225 if (isa
<BinaryOperator
>(I
))
226 if (auto I
= dyn_cast
<Instruction
>(getFalseValue()))
227 if (InstCostMap
.contains(I
)) {
228 InstructionCost OrCost
= TTI
->getArithmeticInstrCost(
229 Instruction::Or
, I
->getType(), TargetTransformInfo::TCK_Latency
,
230 {TargetTransformInfo::OK_AnyValue
,
231 TargetTransformInfo::OP_None
},
232 {TTI::OK_UniformConstantValue
, TTI::OP_PowerOf2
});
233 return InstCostMap
[I
].NonPredCost
+
234 Scaled64::get(*OrCost
.getValue());
237 return Scaled64::getZero();
240 /// Return the NonPredCost cost of the false op, given the costs in
241 /// InstCostMap. This may need to be generated for select-like instructions.
243 getFalseOpCost(DenseMap
<const Instruction
*, CostInfo
> &InstCostMap
,
244 const TargetTransformInfo
*TTI
) {
245 if (auto *Sel
= dyn_cast
<SelectInst
>(I
))
246 if (auto *I
= dyn_cast
<Instruction
>(Sel
->getFalseValue()))
247 return InstCostMap
.contains(I
) ? InstCostMap
[I
].NonPredCost
248 : Scaled64::getZero();
250 // Or case - return the cost of the false case
251 if (isa
<BinaryOperator
>(I
))
252 if (auto I
= dyn_cast
<Instruction
>(getFalseValue()))
253 if (InstCostMap
.contains(I
))
254 return InstCostMap
[I
].NonPredCost
;
256 return Scaled64::getZero();
261 // Select groups consist of consecutive select instructions with the same
263 using SelectGroup
= SmallVector
<SelectLike
, 2>;
264 using SelectGroups
= SmallVector
<SelectGroup
, 2>;
266 // Converts select instructions of a function to conditional jumps when deemed
267 // profitable. Returns true if at least one select was converted.
268 bool optimizeSelects(Function
&F
);
270 // Heuristics for determining which select instructions can be profitably
271 // conveted to branches. Separate heuristics for selects in inner-most loops
272 // and the rest of code regions (base heuristics for non-inner-most loop
274 void optimizeSelectsBase(Function
&F
, SelectGroups
&ProfSIGroups
);
275 void optimizeSelectsInnerLoops(Function
&F
, SelectGroups
&ProfSIGroups
);
277 // Converts to branches the select groups that were deemed
278 // profitable-to-convert.
279 void convertProfitableSIGroups(SelectGroups
&ProfSIGroups
);
281 // Splits selects of a given basic block into select groups.
282 void collectSelectGroups(BasicBlock
&BB
, SelectGroups
&SIGroups
);
284 // Determines for which select groups it is profitable converting to branches
285 // (base and inner-most-loop heuristics).
286 void findProfitableSIGroupsBase(SelectGroups
&SIGroups
,
287 SelectGroups
&ProfSIGroups
);
288 void findProfitableSIGroupsInnerLoops(const Loop
*L
, SelectGroups
&SIGroups
,
289 SelectGroups
&ProfSIGroups
);
291 // Determines if a select group should be converted to a branch (base
293 bool isConvertToBranchProfitableBase(const SelectGroup
&ASI
);
295 // Returns true if there are expensive instructions in the cold value
296 // operand's (if any) dependence slice of any of the selects of the given
298 bool hasExpensiveColdOperand(const SelectGroup
&ASI
);
300 // For a given source instruction, collect its backwards dependence slice
301 // consisting of instructions exclusively computed for producing the operands
302 // of the source instruction.
303 void getExclBackwardsSlice(Instruction
*I
, std::stack
<Instruction
*> &Slice
,
304 Instruction
*SI
, bool ForSinking
= false);
306 // Returns true if the condition of the select is highly predictable.
307 bool isSelectHighlyPredictable(const SelectLike SI
);
309 // Loop-level checks to determine if a non-predicated version (with branches)
310 // of the given loop is more profitable than its predicated version.
311 bool checkLoopHeuristics(const Loop
*L
, const CostInfo LoopDepth
[2]);
313 // Computes instruction and loop-critical-path costs for both the predicated
314 // and non-predicated version of the given loop.
315 bool computeLoopCosts(const Loop
*L
, const SelectGroups
&SIGroups
,
316 DenseMap
<const Instruction
*, CostInfo
> &InstCostMap
,
319 // Returns a set of all the select instructions in the given select groups.
320 SmallDenseMap
<const Instruction
*, SelectLike
, 2>
321 getSImap(const SelectGroups
&SIGroups
);
323 // Returns the latency cost of a given instruction.
324 std::optional
<uint64_t> computeInstCost(const Instruction
*I
);
326 // Returns the misprediction cost of a given select when converted to branch.
327 Scaled64
getMispredictionCost(const SelectLike SI
, const Scaled64 CondCost
);
329 // Returns the cost of a branch when the prediction is correct.
330 Scaled64
getPredictedPathCost(Scaled64 TrueCost
, Scaled64 FalseCost
,
331 const SelectLike SI
);
333 // Returns true if the target architecture supports lowering a given select.
334 bool isSelectKindSupported(const SelectLike SI
);
337 class SelectOptimize
: public FunctionPass
{
338 SelectOptimizeImpl Impl
;
343 SelectOptimize() : FunctionPass(ID
) {
344 initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
347 bool runOnFunction(Function
&F
) override
{
348 return Impl
.runOnFunction(F
, *this);
351 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
352 AU
.addRequired
<ProfileSummaryInfoWrapperPass
>();
353 AU
.addRequired
<TargetPassConfig
>();
354 AU
.addRequired
<TargetTransformInfoWrapperPass
>();
355 AU
.addRequired
<LoopInfoWrapperPass
>();
356 AU
.addRequired
<BlockFrequencyInfoWrapperPass
>();
357 AU
.addRequired
<OptimizationRemarkEmitterWrapperPass
>();
363 PreservedAnalyses
SelectOptimizePass::run(Function
&F
,
364 FunctionAnalysisManager
&FAM
) {
365 SelectOptimizeImpl
Impl(TM
);
366 return Impl
.run(F
, FAM
);
369 char SelectOptimize::ID
= 0;
371 INITIALIZE_PASS_BEGIN(SelectOptimize
, DEBUG_TYPE
, "Optimize selects", false,
373 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass
)
374 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass
)
375 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig
)
376 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass
)
377 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass
)
378 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass
)
379 INITIALIZE_PASS_END(SelectOptimize
, DEBUG_TYPE
, "Optimize selects", false,
382 FunctionPass
*llvm::createSelectOptimizePass() { return new SelectOptimize(); }
384 PreservedAnalyses
SelectOptimizeImpl::run(Function
&F
,
385 FunctionAnalysisManager
&FAM
) {
386 TSI
= TM
->getSubtargetImpl(F
);
387 TLI
= TSI
->getTargetLowering();
389 // If none of the select types are supported then skip this pass.
390 // This is an optimization pass. Legality issues will be handled by
391 // instruction selection.
392 if (!TLI
->isSelectSupported(TargetLowering::ScalarValSelect
) &&
393 !TLI
->isSelectSupported(TargetLowering::ScalarCondVectorVal
) &&
394 !TLI
->isSelectSupported(TargetLowering::VectorMaskSelect
))
395 return PreservedAnalyses::all();
397 TTI
= &FAM
.getResult
<TargetIRAnalysis
>(F
);
398 if (!TTI
->enableSelectOptimize())
399 return PreservedAnalyses::all();
401 PSI
= FAM
.getResult
<ModuleAnalysisManagerFunctionProxy
>(F
)
402 .getCachedResult
<ProfileSummaryAnalysis
>(*F
.getParent());
403 assert(PSI
&& "This pass requires module analysis pass `profile-summary`!");
404 BFI
= &FAM
.getResult
<BlockFrequencyAnalysis
>(F
);
406 // When optimizing for size, selects are preferable over branches.
407 if (F
.hasOptSize() || llvm::shouldOptimizeForSize(&F
, PSI
, BFI
))
408 return PreservedAnalyses::all();
410 LI
= &FAM
.getResult
<LoopAnalysis
>(F
);
411 ORE
= &FAM
.getResult
<OptimizationRemarkEmitterAnalysis
>(F
);
412 TSchedModel
.init(TSI
);
414 bool Changed
= optimizeSelects(F
);
415 return Changed
? PreservedAnalyses::none() : PreservedAnalyses::all();
418 bool SelectOptimizeImpl::runOnFunction(Function
&F
, Pass
&P
) {
419 TM
= &P
.getAnalysis
<TargetPassConfig
>().getTM
<TargetMachine
>();
420 TSI
= TM
->getSubtargetImpl(F
);
421 TLI
= TSI
->getTargetLowering();
423 // If none of the select types are supported then skip this pass.
424 // This is an optimization pass. Legality issues will be handled by
425 // instruction selection.
426 if (!TLI
->isSelectSupported(TargetLowering::ScalarValSelect
) &&
427 !TLI
->isSelectSupported(TargetLowering::ScalarCondVectorVal
) &&
428 !TLI
->isSelectSupported(TargetLowering::VectorMaskSelect
))
431 TTI
= &P
.getAnalysis
<TargetTransformInfoWrapperPass
>().getTTI(F
);
433 if (!TTI
->enableSelectOptimize())
436 LI
= &P
.getAnalysis
<LoopInfoWrapperPass
>().getLoopInfo();
437 BFI
= &P
.getAnalysis
<BlockFrequencyInfoWrapperPass
>().getBFI();
438 PSI
= &P
.getAnalysis
<ProfileSummaryInfoWrapperPass
>().getPSI();
439 ORE
= &P
.getAnalysis
<OptimizationRemarkEmitterWrapperPass
>().getORE();
440 TSchedModel
.init(TSI
);
442 // When optimizing for size, selects are preferable over branches.
443 if (F
.hasOptSize() || llvm::shouldOptimizeForSize(&F
, PSI
, BFI
))
446 return optimizeSelects(F
);
449 bool SelectOptimizeImpl::optimizeSelects(Function
&F
) {
450 // Determine for which select groups it is profitable converting to branches.
451 SelectGroups ProfSIGroups
;
452 // Base heuristics apply only to non-loops and outer loops.
453 optimizeSelectsBase(F
, ProfSIGroups
);
454 // Separate heuristics for inner-most loops.
455 optimizeSelectsInnerLoops(F
, ProfSIGroups
);
457 // Convert to branches the select groups that were deemed
458 // profitable-to-convert.
459 convertProfitableSIGroups(ProfSIGroups
);
461 // Code modified if at least one select group was converted.
462 return !ProfSIGroups
.empty();
465 void SelectOptimizeImpl::optimizeSelectsBase(Function
&F
,
466 SelectGroups
&ProfSIGroups
) {
467 // Collect all the select groups.
468 SelectGroups SIGroups
;
469 for (BasicBlock
&BB
: F
) {
470 // Base heuristics apply only to non-loops and outer loops.
471 Loop
*L
= LI
->getLoopFor(&BB
);
472 if (L
&& L
->isInnermost())
474 collectSelectGroups(BB
, SIGroups
);
477 // Determine for which select groups it is profitable converting to branches.
478 findProfitableSIGroupsBase(SIGroups
, ProfSIGroups
);
481 void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function
&F
,
482 SelectGroups
&ProfSIGroups
) {
483 SmallVector
<Loop
*, 4> Loops(LI
->begin(), LI
->end());
484 // Need to check size on each iteration as we accumulate child loops.
485 for (unsigned long i
= 0; i
< Loops
.size(); ++i
)
486 for (Loop
*ChildL
: Loops
[i
]->getSubLoops())
487 Loops
.push_back(ChildL
);
489 for (Loop
*L
: Loops
) {
490 if (!L
->isInnermost())
493 SelectGroups SIGroups
;
494 for (BasicBlock
*BB
: L
->getBlocks())
495 collectSelectGroups(*BB
, SIGroups
);
497 findProfitableSIGroupsInnerLoops(L
, SIGroups
, ProfSIGroups
);
501 /// If \p isTrue is true, return the true value of \p SI, otherwise return
502 /// false value of \p SI. If the true/false value of \p SI is defined by any
503 /// select instructions in \p Selects, look through the defining select
504 /// instruction until the true/false value is not defined in \p Selects.
506 getTrueOrFalseValue(SelectOptimizeImpl::SelectLike SI
, bool isTrue
,
507 const SmallPtrSet
<const Instruction
*, 2> &Selects
,
510 for (SelectInst
*DefSI
= dyn_cast
<SelectInst
>(SI
.getI());
511 DefSI
!= nullptr && Selects
.count(DefSI
);
512 DefSI
= dyn_cast
<SelectInst
>(V
)) {
513 assert(DefSI
->getCondition() == SI
.getCondition() &&
514 "The condition of DefSI does not match with SI");
515 V
= (isTrue
? DefSI
->getTrueValue() : DefSI
->getFalseValue());
518 if (isa
<BinaryOperator
>(SI
.getI())) {
519 assert(SI
.getI()->getOpcode() == Instruction::Or
&&
520 "Only currently handling Or instructions.");
521 V
= SI
.getFalseValue();
523 V
= IB
.CreateOr(V
, ConstantInt::get(V
->getType(), 1));
526 assert(V
&& "Failed to get select true/false value");
530 void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups
&ProfSIGroups
) {
531 for (SelectGroup
&ASI
: ProfSIGroups
) {
532 // The code transformation here is a modified version of the sinking
533 // transformation in CodeGenPrepare::optimizeSelectInst with a more
534 // aggressive strategy of which instructions to sink.
536 // TODO: eliminate the redundancy of logic transforming selects to branches
537 // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
538 // selects for all cases (with and without profile information).
540 // Transform a sequence like this:
542 // %cmp = cmp uge i32 %a, %b
543 // %sel = select i1 %cmp, i32 %c, i32 %d
547 // %cmp = cmp uge i32 %a, %b
548 // %cmp.frozen = freeze %cmp
549 // br i1 %cmp.frozen, label %select.true, label %select.false
551 // br label %select.end
553 // br label %select.end
555 // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
557 // %cmp should be frozen, otherwise it may introduce undefined behavior.
558 // In addition, we may sink instructions that produce %c or %d into the
559 // destination(s) of the new branch.
560 // If the true or false blocks do not contain a sunken instruction, that
561 // block and its branch may be optimized away. In that case, one side of the
562 // first branch will point directly to select.end, and the corresponding PHI
563 // predecessor block will be the start block.
565 // Find all the instructions that can be soundly sunk to the true/false
566 // blocks. These are instructions that are computed solely for producing the
567 // operands of the select instructions in the group and can be sunk without
568 // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
569 // with side effects).
570 SmallVector
<std::stack
<Instruction
*>, 2> TrueSlices
, FalseSlices
;
571 typedef std::stack
<Instruction
*>::size_type StackSizeType
;
572 StackSizeType maxTrueSliceLen
= 0, maxFalseSliceLen
= 0;
573 for (SelectLike SI
: ASI
) {
574 // For each select, compute the sinkable dependence chains of the true and
576 if (auto *TI
= dyn_cast_or_null
<Instruction
>(SI
.getTrueValue())) {
577 std::stack
<Instruction
*> TrueSlice
;
578 getExclBackwardsSlice(TI
, TrueSlice
, SI
.getI(), true);
579 maxTrueSliceLen
= std::max(maxTrueSliceLen
, TrueSlice
.size());
580 TrueSlices
.push_back(TrueSlice
);
582 if (auto *FI
= dyn_cast_or_null
<Instruction
>(SI
.getFalseValue())) {
583 if (isa
<SelectInst
>(SI
.getI()) || !FI
->hasOneUse()) {
584 std::stack
<Instruction
*> FalseSlice
;
585 getExclBackwardsSlice(FI
, FalseSlice
, SI
.getI(), true);
586 maxFalseSliceLen
= std::max(maxFalseSliceLen
, FalseSlice
.size());
587 FalseSlices
.push_back(FalseSlice
);
591 // In the case of multiple select instructions in the same group, the order
592 // of non-dependent instructions (instructions of different dependence
593 // slices) in the true/false blocks appears to affect performance.
594 // Interleaving the slices seems to experimentally be the optimal approach.
595 // This interleaving scheduling allows for more ILP (with a natural downside
596 // of increasing a bit register pressure) compared to a simple ordering of
597 // one whole chain after another. One would expect that this ordering would
598 // not matter since the scheduling in the backend of the compiler would
599 // take care of it, but apparently the scheduler fails to deliver optimal
600 // ILP with a naive ordering here.
601 SmallVector
<Instruction
*, 2> TrueSlicesInterleaved
, FalseSlicesInterleaved
;
602 for (StackSizeType IS
= 0; IS
< maxTrueSliceLen
; ++IS
) {
603 for (auto &S
: TrueSlices
) {
605 TrueSlicesInterleaved
.push_back(S
.top());
610 for (StackSizeType IS
= 0; IS
< maxFalseSliceLen
; ++IS
) {
611 for (auto &S
: FalseSlices
) {
613 FalseSlicesInterleaved
.push_back(S
.top());
619 // We split the block containing the select(s) into two blocks.
620 SelectLike SI
= ASI
.front();
621 SelectLike LastSI
= ASI
.back();
622 BasicBlock
*StartBlock
= SI
.getI()->getParent();
623 BasicBlock::iterator SplitPt
= ++(BasicBlock::iterator(LastSI
.getI()));
624 BasicBlock
*EndBlock
= StartBlock
->splitBasicBlock(SplitPt
, "select.end");
625 BFI
->setBlockFreq(EndBlock
, BFI
->getBlockFreq(StartBlock
));
626 // Delete the unconditional branch that was just created by the split.
627 StartBlock
->getTerminator()->eraseFromParent();
629 // Move any debug/pseudo instructions that were in-between the select
630 // group to the newly-created end block.
631 SmallVector
<Instruction
*, 2> DebugPseudoINS
;
632 auto DIt
= SI
.getI()->getIterator();
633 while (&*DIt
!= LastSI
.getI()) {
634 if (DIt
->isDebugOrPseudoInst())
635 DebugPseudoINS
.push_back(&*DIt
);
638 for (auto *DI
: DebugPseudoINS
) {
639 DI
->moveBeforePreserving(&*EndBlock
->getFirstInsertionPt());
642 // Duplicate implementation for DPValues, the non-instruction debug-info
643 // record. Helper lambda for moving DPValues to the end block.
644 auto TransferDPValues
= [&](Instruction
&I
) {
645 for (auto &DPValue
: llvm::make_early_inc_range(I
.getDbgValueRange())) {
646 DPValue
.removeFromParent();
647 EndBlock
->insertDPValueBefore(&DPValue
,
648 EndBlock
->getFirstInsertionPt());
652 // Iterate over all instructions in between SI and LastSI, not including
653 // SI itself. These are all the variable assignments that happen "in the
654 // middle" of the select group.
655 auto R
= make_range(std::next(SI
.getI()->getIterator()),
656 std::next(LastSI
.getI()->getIterator()));
657 llvm::for_each(R
, TransferDPValues
);
659 // These are the new basic blocks for the conditional branch.
660 // At least one will become an actual new basic block.
661 BasicBlock
*TrueBlock
= nullptr, *FalseBlock
= nullptr;
662 BranchInst
*TrueBranch
= nullptr, *FalseBranch
= nullptr;
663 if (!TrueSlicesInterleaved
.empty()) {
664 TrueBlock
= BasicBlock::Create(EndBlock
->getContext(), "select.true.sink",
665 EndBlock
->getParent(), EndBlock
);
666 TrueBranch
= BranchInst::Create(EndBlock
, TrueBlock
);
667 TrueBranch
->setDebugLoc(LastSI
.getI()->getDebugLoc());
668 for (Instruction
*TrueInst
: TrueSlicesInterleaved
)
669 TrueInst
->moveBefore(TrueBranch
);
671 if (!FalseSlicesInterleaved
.empty()) {
673 BasicBlock::Create(EndBlock
->getContext(), "select.false.sink",
674 EndBlock
->getParent(), EndBlock
);
675 FalseBranch
= BranchInst::Create(EndBlock
, FalseBlock
);
676 FalseBranch
->setDebugLoc(LastSI
.getI()->getDebugLoc());
677 for (Instruction
*FalseInst
: FalseSlicesInterleaved
)
678 FalseInst
->moveBefore(FalseBranch
);
680 // If there was nothing to sink, then arbitrarily choose the 'false' side
681 // for a new input value to the PHI.
682 if (TrueBlock
== FalseBlock
) {
683 assert(TrueBlock
== nullptr &&
684 "Unexpected basic block transform while optimizing select");
686 FalseBlock
= BasicBlock::Create(StartBlock
->getContext(), "select.false",
687 EndBlock
->getParent(), EndBlock
);
688 auto *FalseBranch
= BranchInst::Create(EndBlock
, FalseBlock
);
689 FalseBranch
->setDebugLoc(SI
.getI()->getDebugLoc());
692 // Insert the real conditional branch based on the original condition.
693 // If we did not create a new block for one of the 'true' or 'false' paths
694 // of the condition, it means that side of the branch goes to the end block
695 // directly and the path originates from the start block from the point of
696 // view of the new PHI.
698 if (TrueBlock
== nullptr) {
701 TrueBlock
= StartBlock
;
702 } else if (FalseBlock
== nullptr) {
705 FalseBlock
= StartBlock
;
710 IRBuilder
<> IB(SI
.getI());
711 auto *CondFr
= IB
.CreateFreeze(SI
.getCondition(),
712 SI
.getCondition()->getName() + ".frozen");
714 SmallPtrSet
<const Instruction
*, 2> INS
;
716 INS
.insert(SI
.getI());
718 // Use reverse iterator because later select may use the value of the
719 // earlier select, and we need to propagate value through earlier select
720 // to get the PHI operand.
721 for (auto It
= ASI
.rbegin(); It
!= ASI
.rend(); ++It
) {
723 // The select itself is replaced with a PHI Node.
724 PHINode
*PN
= PHINode::Create(SI
.getType(), 2, "");
725 PN
->insertBefore(EndBlock
->begin());
726 PN
->takeName(SI
.getI());
727 PN
->addIncoming(getTrueOrFalseValue(SI
, true, INS
, IB
), TrueBlock
);
728 PN
->addIncoming(getTrueOrFalseValue(SI
, false, INS
, IB
), FalseBlock
);
729 PN
->setDebugLoc(SI
.getI()->getDebugLoc());
730 SI
.getI()->replaceAllUsesWith(PN
);
731 INS
.erase(SI
.getI());
732 ++NumSelectsConverted
;
734 IB
.CreateCondBr(CondFr
, TT
, FT
, SI
.getI());
736 // Remove the old select instructions, now that they are not longer used.
738 SI
.getI()->eraseFromParent();
742 void SelectOptimizeImpl::collectSelectGroups(BasicBlock
&BB
,
743 SelectGroups
&SIGroups
) {
744 BasicBlock::iterator BBIt
= BB
.begin();
745 while (BBIt
!= BB
.end()) {
746 Instruction
*I
= &*BBIt
++;
747 if (SelectLike SI
= SelectLike::match(I
)) {
748 if (!TTI
->shouldTreatInstructionLikeSelect(I
))
752 SIGroup
.push_back(SI
);
753 while (BBIt
!= BB
.end()) {
754 Instruction
*NI
= &*BBIt
;
755 // Debug/pseudo instructions should be skipped and not prevent the
756 // formation of a select group.
757 if (NI
->isDebugOrPseudoInst()) {
761 // We only allow selects in the same group, not other select-like
763 if (!isa
<SelectInst
>(NI
))
766 SelectLike NSI
= SelectLike::match(NI
);
767 if (NSI
&& SI
.getCondition() == NSI
.getCondition()) {
768 SIGroup
.push_back(NSI
);
774 // If the select type is not supported, no point optimizing it.
775 // Instruction selection will take care of it.
776 if (!isSelectKindSupported(SI
))
779 SIGroups
.push_back(SIGroup
);
784 void SelectOptimizeImpl::findProfitableSIGroupsBase(
785 SelectGroups
&SIGroups
, SelectGroups
&ProfSIGroups
) {
786 for (SelectGroup
&ASI
: SIGroups
) {
787 ++NumSelectOptAnalyzed
;
788 if (isConvertToBranchProfitableBase(ASI
))
789 ProfSIGroups
.push_back(ASI
);
793 static void EmitAndPrintRemark(OptimizationRemarkEmitter
*ORE
,
794 DiagnosticInfoOptimizationBase
&Rem
) {
795 LLVM_DEBUG(dbgs() << Rem
.getMsg() << "\n");
799 void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops(
800 const Loop
*L
, SelectGroups
&SIGroups
, SelectGroups
&ProfSIGroups
) {
801 NumSelectOptAnalyzed
+= SIGroups
.size();
802 // For each select group in an inner-most loop,
803 // a branch is more preferable than a select/conditional-move if:
804 // i) conversion to branches for all the select groups of the loop satisfies
805 // loop-level heuristics including reducing the loop's critical path by
806 // some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and
807 // ii) the total cost of the select group is cheaper with a branch compared
808 // to its predicated version. The cost is in terms of latency and the cost
809 // of a select group is the cost of its most expensive select instruction
810 // (assuming infinite resources and thus fully leveraging available ILP).
812 DenseMap
<const Instruction
*, CostInfo
> InstCostMap
;
813 CostInfo LoopCost
[2] = {{Scaled64::getZero(), Scaled64::getZero()},
814 {Scaled64::getZero(), Scaled64::getZero()}};
815 if (!computeLoopCosts(L
, SIGroups
, InstCostMap
, LoopCost
) ||
816 !checkLoopHeuristics(L
, LoopCost
)) {
820 for (SelectGroup
&ASI
: SIGroups
) {
821 // Assuming infinite resources, the cost of a group of instructions is the
822 // cost of the most expensive instruction of the group.
823 Scaled64 SelectCost
= Scaled64::getZero(), BranchCost
= Scaled64::getZero();
824 for (SelectLike SI
: ASI
) {
825 SelectCost
= std::max(SelectCost
, InstCostMap
[SI
.getI()].PredCost
);
826 BranchCost
= std::max(BranchCost
, InstCostMap
[SI
.getI()].NonPredCost
);
828 if (BranchCost
< SelectCost
) {
829 OptimizationRemark
OR(DEBUG_TYPE
, "SelectOpti", ASI
.front().getI());
830 OR
<< "Profitable to convert to branch (loop analysis). BranchCost="
831 << BranchCost
.toString() << ", SelectCost=" << SelectCost
.toString()
833 EmitAndPrintRemark(ORE
, OR
);
834 ++NumSelectConvertedLoop
;
835 ProfSIGroups
.push_back(ASI
);
837 OptimizationRemarkMissed
ORmiss(DEBUG_TYPE
, "SelectOpti",
839 ORmiss
<< "Select is more profitable (loop analysis). BranchCost="
840 << BranchCost
.toString()
841 << ", SelectCost=" << SelectCost
.toString() << ". ";
842 EmitAndPrintRemark(ORE
, ORmiss
);
847 bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
848 const SelectGroup
&ASI
) {
849 SelectLike SI
= ASI
.front();
850 LLVM_DEBUG(dbgs() << "Analyzing select group containing " << SI
.getI()
852 OptimizationRemark
OR(DEBUG_TYPE
, "SelectOpti", SI
.getI());
853 OptimizationRemarkMissed
ORmiss(DEBUG_TYPE
, "SelectOpti", SI
.getI());
855 // Skip cold basic blocks. Better to optimize for size for cold blocks.
856 if (PSI
->isColdBlock(SI
.getI()->getParent(), BFI
)) {
858 ORmiss
<< "Not converted to branch because of cold basic block. ";
859 EmitAndPrintRemark(ORE
, ORmiss
);
863 // If unpredictable, branch form is less profitable.
864 if (SI
.getI()->getMetadata(LLVMContext::MD_unpredictable
)) {
866 ORmiss
<< "Not converted to branch because of unpredictable branch. ";
867 EmitAndPrintRemark(ORE
, ORmiss
);
871 // If highly predictable, branch form is more profitable, unless a
872 // predictable select is inexpensive in the target architecture.
873 if (isSelectHighlyPredictable(SI
) && TLI
->isPredictableSelectExpensive()) {
874 ++NumSelectConvertedHighPred
;
875 OR
<< "Converted to branch because of highly predictable branch. ";
876 EmitAndPrintRemark(ORE
, OR
);
880 // Look for expensive instructions in the cold operand's (if any) dependence
881 // slice of any of the selects in the group.
882 if (hasExpensiveColdOperand(ASI
)) {
883 ++NumSelectConvertedExpColdOperand
;
884 OR
<< "Converted to branch because of expensive cold operand.";
885 EmitAndPrintRemark(ORE
, OR
);
889 ORmiss
<< "Not profitable to convert to branch (base heuristic).";
890 EmitAndPrintRemark(ORE
, ORmiss
);
894 static InstructionCost
divideNearest(InstructionCost Numerator
,
895 uint64_t Denominator
) {
896 return (Numerator
+ (Denominator
/ 2)) / Denominator
;
899 static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI
,
900 uint64_t &TrueVal
, uint64_t &FalseVal
) {
901 if (isa
<SelectInst
>(SI
.getI()))
902 return extractBranchWeights(*SI
.getI(), TrueVal
, FalseVal
);
906 bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup
&ASI
) {
907 bool ColdOperand
= false;
908 uint64_t TrueWeight
, FalseWeight
, TotalWeight
;
909 if (extractBranchWeights(ASI
.front(), TrueWeight
, FalseWeight
)) {
910 uint64_t MinWeight
= std::min(TrueWeight
, FalseWeight
);
911 TotalWeight
= TrueWeight
+ FalseWeight
;
912 // Is there a path with frequency <ColdOperandThreshold% (default:20%) ?
913 ColdOperand
= TotalWeight
* ColdOperandThreshold
> 100 * MinWeight
;
914 } else if (PSI
->hasProfileSummary()) {
915 OptimizationRemarkMissed
ORmiss(DEBUG_TYPE
, "SelectOpti",
917 ORmiss
<< "Profile data available but missing branch-weights metadata for "
918 "select instruction. ";
919 EmitAndPrintRemark(ORE
, ORmiss
);
923 // Check if the cold path's dependence slice is expensive for any of the
924 // selects of the group.
925 for (SelectLike SI
: ASI
) {
926 Instruction
*ColdI
= nullptr;
928 if (TrueWeight
< FalseWeight
) {
929 ColdI
= dyn_cast_or_null
<Instruction
>(SI
.getTrueValue());
930 HotWeight
= FalseWeight
;
932 ColdI
= dyn_cast_or_null
<Instruction
>(SI
.getFalseValue());
933 HotWeight
= TrueWeight
;
936 std::stack
<Instruction
*> ColdSlice
;
937 getExclBackwardsSlice(ColdI
, ColdSlice
, SI
.getI());
938 InstructionCost SliceCost
= 0;
939 while (!ColdSlice
.empty()) {
940 SliceCost
+= TTI
->getInstructionCost(ColdSlice
.top(),
941 TargetTransformInfo::TCK_Latency
);
944 // The colder the cold value operand of the select is the more expensive
945 // the cmov becomes for computing the cold value operand every time. Thus,
946 // the colder the cold operand is the more its cost counts.
947 // Get nearest integer cost adjusted for coldness.
948 InstructionCost AdjSliceCost
=
949 divideNearest(SliceCost
* HotWeight
, TotalWeight
);
951 ColdOperandMaxCostMultiplier
* TargetTransformInfo::TCC_Expensive
)
958 // Check if it is safe to move LoadI next to the SI.
959 // Conservatively assume it is safe only if there is no instruction
960 // modifying memory in-between the load and the select instruction.
961 static bool isSafeToSinkLoad(Instruction
*LoadI
, Instruction
*SI
) {
962 // Assume loads from different basic blocks are unsafe to move.
963 if (LoadI
->getParent() != SI
->getParent())
965 auto It
= LoadI
->getIterator();
967 if (It
->mayWriteToMemory())
974 // For a given source instruction, collect its backwards dependence slice
975 // consisting of instructions exclusively computed for the purpose of producing
976 // the operands of the source instruction. As an approximation
977 // (sufficiently-accurate in practice), we populate this set with the
978 // instructions of the backwards dependence slice that only have one-use and
979 // form an one-use chain that leads to the source instruction.
980 void SelectOptimizeImpl::getExclBackwardsSlice(Instruction
*I
,
981 std::stack
<Instruction
*> &Slice
,
984 SmallPtrSet
<Instruction
*, 2> Visited
;
985 std::queue
<Instruction
*> Worklist
;
987 while (!Worklist
.empty()) {
988 Instruction
*II
= Worklist
.front();
992 if (!Visited
.insert(II
).second
)
995 if (!II
->hasOneUse())
998 // Cannot soundly sink instructions with side-effects.
999 // Terminator or phi instructions cannot be sunk.
1000 // Avoid sinking other select instructions (should be handled separetely).
1001 if (ForSinking
&& (II
->isTerminator() || II
->mayHaveSideEffects() ||
1002 isa
<SelectInst
>(II
) || isa
<PHINode
>(II
)))
1005 // Avoid sinking loads in order not to skip state-modifying instructions,
1006 // that may alias with the loaded address.
1007 // Only allow sinking of loads within the same basic block that are
1008 // conservatively proven to be safe.
1009 if (ForSinking
&& II
->mayReadFromMemory() && !isSafeToSinkLoad(II
, SI
))
1012 // Avoid considering instructions with less frequency than the source
1013 // instruction (i.e., avoid colder code regions of the dependence slice).
1014 if (BFI
->getBlockFreq(II
->getParent()) < BFI
->getBlockFreq(I
->getParent()))
1017 // Eligible one-use instruction added to the dependence slice.
1020 // Explore all the operands of the current instruction to expand the slice.
1021 for (unsigned k
= 0; k
< II
->getNumOperands(); ++k
)
1022 if (auto *OpI
= dyn_cast
<Instruction
>(II
->getOperand(k
)))
1027 bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI
) {
1028 uint64_t TrueWeight
, FalseWeight
;
1029 if (extractBranchWeights(SI
, TrueWeight
, FalseWeight
)) {
1030 uint64_t Max
= std::max(TrueWeight
, FalseWeight
);
1031 uint64_t Sum
= TrueWeight
+ FalseWeight
;
1033 auto Probability
= BranchProbability::getBranchProbability(Max
, Sum
);
1034 if (Probability
> TTI
->getPredictableBranchThreshold())
1041 bool SelectOptimizeImpl::checkLoopHeuristics(const Loop
*L
,
1042 const CostInfo LoopCost
[2]) {
1043 // Loop-level checks to determine if a non-predicated version (with branches)
1044 // of the loop is more profitable than its predicated version.
1046 if (DisableLoopLevelHeuristics
)
1049 OptimizationRemarkMissed
ORmissL(DEBUG_TYPE
, "SelectOpti",
1050 L
->getHeader()->getFirstNonPHI());
1052 if (LoopCost
[0].NonPredCost
> LoopCost
[0].PredCost
||
1053 LoopCost
[1].NonPredCost
>= LoopCost
[1].PredCost
) {
1054 ORmissL
<< "No select conversion in the loop due to no reduction of loop's "
1056 EmitAndPrintRemark(ORE
, ORmissL
);
1060 Scaled64 Gain
[2] = {LoopCost
[0].PredCost
- LoopCost
[0].NonPredCost
,
1061 LoopCost
[1].PredCost
- LoopCost
[1].NonPredCost
};
1063 // Profitably converting to branches need to reduce the loop's critical path
1064 // by at least some threshold (absolute gain of GainCycleThreshold cycles and
1065 // relative gain of 12.5%).
1066 if (Gain
[1] < Scaled64::get(GainCycleThreshold
) ||
1067 Gain
[1] * Scaled64::get(GainRelativeThreshold
) < LoopCost
[1].PredCost
) {
1068 Scaled64 RelativeGain
= Scaled64::get(100) * Gain
[1] / LoopCost
[1].PredCost
;
1069 ORmissL
<< "No select conversion in the loop due to small reduction of "
1070 "loop's critical path. Gain="
1071 << Gain
[1].toString()
1072 << ", RelativeGain=" << RelativeGain
.toString() << "%. ";
1073 EmitAndPrintRemark(ORE
, ORmissL
);
1077 // If the loop's critical path involves loop-carried dependences, the gradient
1078 // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
1079 // This check ensures that the latency reduction for the loop's critical path
1080 // keeps decreasing with sufficient rate beyond the two analyzed loop
1082 if (Gain
[1] > Gain
[0]) {
1083 Scaled64 GradientGain
= Scaled64::get(100) * (Gain
[1] - Gain
[0]) /
1084 (LoopCost
[1].PredCost
- LoopCost
[0].PredCost
);
1085 if (GradientGain
< Scaled64::get(GainGradientThreshold
)) {
1086 ORmissL
<< "No select conversion in the loop due to small gradient gain. "
1088 << GradientGain
.toString() << "%. ";
1089 EmitAndPrintRemark(ORE
, ORmissL
);
1093 // If the gain decreases it is not profitable to convert.
1094 else if (Gain
[1] < Gain
[0]) {
1096 << "No select conversion in the loop due to negative gradient gain. ";
1097 EmitAndPrintRemark(ORE
, ORmissL
);
1101 // Non-predicated version of the loop is more profitable than its
1102 // predicated version.
1106 // Computes instruction and loop-critical-path costs for both the predicated
1107 // and non-predicated version of the given loop.
1108 // Returns false if unable to compute these costs due to invalid cost of loop
1110 bool SelectOptimizeImpl::computeLoopCosts(
1111 const Loop
*L
, const SelectGroups
&SIGroups
,
1112 DenseMap
<const Instruction
*, CostInfo
> &InstCostMap
, CostInfo
*LoopCost
) {
1113 LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop "
1114 << L
->getHeader()->getName() << "\n");
1115 const auto &SImap
= getSImap(SIGroups
);
1116 // Compute instruction and loop-critical-path costs across two iterations for
1117 // both predicated and non-predicated version.
1118 const unsigned Iterations
= 2;
1119 for (unsigned Iter
= 0; Iter
< Iterations
; ++Iter
) {
1120 // Cost of the loop's critical path.
1121 CostInfo
&MaxCost
= LoopCost
[Iter
];
1122 for (BasicBlock
*BB
: L
->getBlocks()) {
1123 for (const Instruction
&I
: *BB
) {
1124 if (I
.isDebugOrPseudoInst())
1126 // Compute the predicated and non-predicated cost of the instruction.
1127 Scaled64 IPredCost
= Scaled64::getZero(),
1128 INonPredCost
= Scaled64::getZero();
1130 // Assume infinite resources that allow to fully exploit the available
1131 // instruction-level parallelism.
1132 // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
1133 for (const Use
&U
: I
.operands()) {
1134 auto UI
= dyn_cast
<Instruction
>(U
.get());
1137 if (InstCostMap
.count(UI
)) {
1138 IPredCost
= std::max(IPredCost
, InstCostMap
[UI
].PredCost
);
1139 INonPredCost
= std::max(INonPredCost
, InstCostMap
[UI
].NonPredCost
);
1142 auto ILatency
= computeInstCost(&I
);
1144 OptimizationRemarkMissed
ORmissL(DEBUG_TYPE
, "SelectOpti", &I
);
1145 ORmissL
<< "Invalid instruction cost preventing analysis and "
1146 "optimization of the inner-most loop containing this "
1148 EmitAndPrintRemark(ORE
, ORmissL
);
1151 IPredCost
+= Scaled64::get(*ILatency
);
1152 INonPredCost
+= Scaled64::get(*ILatency
);
1154 // For a select that can be converted to branch,
1155 // compute its cost as a branch (non-predicated cost).
1157 // BranchCost = PredictedPathCost + MispredictCost
1158 // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
1159 // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
1160 if (SImap
.contains(&I
)) {
1161 auto SI
= SImap
.at(&I
);
1162 Scaled64 TrueOpCost
= SI
.getTrueOpCost(InstCostMap
, TTI
);
1163 Scaled64 FalseOpCost
= SI
.getFalseOpCost(InstCostMap
, TTI
);
1164 Scaled64 PredictedPathCost
=
1165 getPredictedPathCost(TrueOpCost
, FalseOpCost
, SI
);
1167 Scaled64 CondCost
= Scaled64::getZero();
1168 if (auto *CI
= dyn_cast
<Instruction
>(SI
.getCondition()))
1169 if (InstCostMap
.count(CI
))
1170 CondCost
= InstCostMap
[CI
].NonPredCost
;
1171 Scaled64 MispredictCost
= getMispredictionCost(SI
, CondCost
);
1173 INonPredCost
= PredictedPathCost
+ MispredictCost
;
1175 LLVM_DEBUG(dbgs() << " " << ILatency
<< "/" << IPredCost
<< "/"
1176 << INonPredCost
<< " for " << I
<< "\n");
1178 InstCostMap
[&I
] = {IPredCost
, INonPredCost
};
1179 MaxCost
.PredCost
= std::max(MaxCost
.PredCost
, IPredCost
);
1180 MaxCost
.NonPredCost
= std::max(MaxCost
.NonPredCost
, INonPredCost
);
1183 LLVM_DEBUG(dbgs() << "Iteration " << Iter
+ 1
1184 << " MaxCost = " << MaxCost
.PredCost
<< " "
1185 << MaxCost
.NonPredCost
<< "\n");
1190 SmallDenseMap
<const Instruction
*, SelectOptimizeImpl::SelectLike
, 2>
1191 SelectOptimizeImpl::getSImap(const SelectGroups
&SIGroups
) {
1192 SmallDenseMap
<const Instruction
*, SelectLike
, 2> SImap
;
1193 for (const SelectGroup
&ASI
: SIGroups
)
1194 for (SelectLike SI
: ASI
)
1195 SImap
.try_emplace(SI
.getI(), SI
);
1199 std::optional
<uint64_t>
1200 SelectOptimizeImpl::computeInstCost(const Instruction
*I
) {
1201 InstructionCost ICost
=
1202 TTI
->getInstructionCost(I
, TargetTransformInfo::TCK_Latency
);
1203 if (auto OC
= ICost
.getValue())
1204 return std::optional
<uint64_t>(*OC
);
1205 return std::nullopt
;
1208 ScaledNumber
<uint64_t>
1209 SelectOptimizeImpl::getMispredictionCost(const SelectLike SI
,
1210 const Scaled64 CondCost
) {
1211 uint64_t MispredictPenalty
= TSchedModel
.getMCSchedModel()->MispredictPenalty
;
1213 // Account for the default misprediction rate when using a branch
1214 // (conservatively set to 25% by default).
1215 uint64_t MispredictRate
= MispredictDefaultRate
;
1216 // If the select condition is obviously predictable, then the misprediction
1218 if (isSelectHighlyPredictable(SI
))
1221 // CondCost is included to account for cases where the computation of the
1222 // condition is part of a long dependence chain (potentially loop-carried)
1223 // that would delay detection of a misprediction and increase its cost.
1224 Scaled64 MispredictCost
=
1225 std::max(Scaled64::get(MispredictPenalty
), CondCost
) *
1226 Scaled64::get(MispredictRate
);
1227 MispredictCost
/= Scaled64::get(100);
1229 return MispredictCost
;
1232 // Returns the cost of a branch when the prediction is correct.
1233 // TrueCost * TrueProbability + FalseCost * FalseProbability.
1234 ScaledNumber
<uint64_t>
1235 SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost
, Scaled64 FalseCost
,
1236 const SelectLike SI
) {
1237 Scaled64 PredPathCost
;
1238 uint64_t TrueWeight
, FalseWeight
;
1239 if (extractBranchWeights(SI
, TrueWeight
, FalseWeight
)) {
1240 uint64_t SumWeight
= TrueWeight
+ FalseWeight
;
1241 if (SumWeight
!= 0) {
1242 PredPathCost
= TrueCost
* Scaled64::get(TrueWeight
) +
1243 FalseCost
* Scaled64::get(FalseWeight
);
1244 PredPathCost
/= Scaled64::get(SumWeight
);
1245 return PredPathCost
;
1248 // Without branch weight metadata, we assume 75% for the one path and 25% for
1249 // the other, and pick the result with the biggest cost.
1250 PredPathCost
= std::max(TrueCost
* Scaled64::get(3) + FalseCost
,
1251 FalseCost
* Scaled64::get(3) + TrueCost
);
1252 PredPathCost
/= Scaled64::get(4);
1253 return PredPathCost
;
1256 bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI
) {
1257 bool VectorCond
= !SI
.getCondition()->getType()->isIntegerTy(1);
1260 TargetLowering::SelectSupportKind SelectKind
;
1261 if (SI
.getType()->isVectorTy())
1262 SelectKind
= TargetLowering::ScalarCondVectorVal
;
1264 SelectKind
= TargetLowering::ScalarValSelect
;
1265 return TLI
->isSelectSupported(SelectKind
);