llvm/lib/CodeGen/SelectOptimize.cpp

   1 //===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This pass converts selects to conditional jumps when profitable.
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #include "llvm/CodeGen/SelectOptimize.h"
  14 #include "llvm/ADT/SmallVector.h"
  15 #include "llvm/ADT/Statistic.h"
  16 #include "llvm/Analysis/BlockFrequencyInfo.h"
  17 #include "llvm/Analysis/BranchProbabilityInfo.h"
  18 #include "llvm/Analysis/LoopInfo.h"
  19 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  20 #include "llvm/Analysis/ProfileSummaryInfo.h"
  21 #include "llvm/Analysis/TargetTransformInfo.h"
  22 #include "llvm/CodeGen/Passes.h"
  23 #include "llvm/CodeGen/TargetLowering.h"
  24 #include "llvm/CodeGen/TargetPassConfig.h"
  25 #include "llvm/CodeGen/TargetSchedule.h"
  26 #include "llvm/CodeGen/TargetSubtargetInfo.h"
  27 #include "llvm/IR/BasicBlock.h"
  28 #include "llvm/IR/Dominators.h"
  29 #include "llvm/IR/Function.h"
  30 #include "llvm/IR/IRBuilder.h"
  31 #include "llvm/IR/Instruction.h"
  32 #include "llvm/IR/PatternMatch.h"
  33 #include "llvm/IR/ProfDataUtils.h"
  34 #include "llvm/InitializePasses.h"
  35 #include "llvm/Pass.h"
  36 #include "llvm/Support/ScaledNumber.h"
  37 #include "llvm/Target/TargetMachine.h"
  38 #include "llvm/Transforms/Utils/SizeOpts.h"
  39 #include <algorithm>
  40 #include <memory>
  41 #include <queue>
  42 #include <stack>
  43
  44 using namespace llvm;
  45 using namespace llvm::PatternMatch;
  46
  47 #define DEBUG_TYPE "select-optimize"
  48
  49 STATISTIC(NumSelectOptAnalyzed,
  50           "Number of select groups considered for conversion to branch");
  51 STATISTIC(NumSelectConvertedExpColdOperand,
  52           "Number of select groups converted due to expensive cold operand");
  53 STATISTIC(NumSelectConvertedHighPred,
  54           "Number of select groups converted due to high-predictability");
  55 STATISTIC(NumSelectUnPred,
  56           "Number of select groups not converted due to unpredictability");
  57 STATISTIC(NumSelectColdBB,
  58           "Number of select groups not converted due to cold basic block");
  59 STATISTIC(NumSelectConvertedLoop,
  60           "Number of select groups converted due to loop-level analysis");
  61 STATISTIC(NumSelectsConverted, "Number of selects converted");
  62
  63 static cl::opt<unsigned> ColdOperandThreshold(
  64     "cold-operand-threshold",
  65     cl::desc("Maximum frequency of path for an operand to be considered cold."),
  66     cl::init(20), cl::Hidden);
  67
  68 static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
  69     "cold-operand-max-cost-multiplier",
  70     cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
  71              "slice of a cold operand to be considered inexpensive."),
  72     cl::init(1), cl::Hidden);
  73
  74 static cl::opt<unsigned>
  75     GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
  76                           cl::desc("Gradient gain threshold (%)."),
  77                           cl::init(25), cl::Hidden);
  78
  79 static cl::opt<unsigned>
  80     GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
  81                        cl::desc("Minimum gain per loop (in cycles) threshold."),
  82                        cl::init(4), cl::Hidden);
  83
  84 static cl::opt<unsigned> GainRelativeThreshold(
  85     "select-opti-loop-relative-gain-threshold",
  86     cl::desc(
  87         "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
  88     cl::init(8), cl::Hidden);
  89
  90 static cl::opt<unsigned> MispredictDefaultRate(
  91     "mispredict-default-rate", cl::Hidden, cl::init(25),
  92     cl::desc("Default mispredict rate (initialized to 25%)."));
  93
  94 static cl::opt<bool>
  95     DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
  96                                cl::init(false),
  97                                cl::desc("Disable loop-level heuristics."));
  98
  99 namespace {
 100
 101 class SelectOptimizeImpl {
 102   const TargetMachine *TM = nullptr;
 103   const TargetSubtargetInfo *TSI = nullptr;
 104   const TargetLowering *TLI = nullptr;
 105   const TargetTransformInfo *TTI = nullptr;
 106   const LoopInfo *LI = nullptr;
 107   BlockFrequencyInfo *BFI;
 108   ProfileSummaryInfo *PSI = nullptr;
 109   OptimizationRemarkEmitter *ORE = nullptr;
 110   TargetSchedModel TSchedModel;
 111
 112 public:
 113   SelectOptimizeImpl() = default;
 114   SelectOptimizeImpl(const TargetMachine *TM) : TM(TM){};
 115   PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
 116   bool runOnFunction(Function &F, Pass &P);
 117
 118   using Scaled64 = ScaledNumber<uint64_t>;
 119
 120   struct CostInfo {
 121     /// Predicated cost (with selects as conditional moves).
 122     Scaled64 PredCost;
 123     /// Non-predicated cost (with selects converted to branches).
 124     Scaled64 NonPredCost;
 125   };
 126
 127   /// SelectLike is an abstraction over SelectInst and other operations that can
 128   /// act like selects. For example Or(Zext(icmp), X) can be treated like
 129   /// select(icmp, X|1, X).
 130   class SelectLike {
 131     SelectLike(Instruction *I) : I(I) {}
 132
 133     Instruction *I;
 134
 135   public:
 136     /// Match a select or select-like instruction, returning a SelectLike.
 137     static SelectLike match(Instruction *I) {
 138       // Select instruction are what we are usually looking for.
 139       if (isa<SelectInst>(I))
 140         return SelectLike(I);
 141
 142       // An Or(zext(i1 X), Y) can also be treated like a select, with condition
 143       // C and values Y|1 and Y.
 144       Value *X;
 145       if (PatternMatch::match(
 146               I, m_c_Or(m_OneUse(m_ZExt(m_Value(X))), m_Value())) &&
 147           X->getType()->isIntegerTy(1))
 148         return SelectLike(I);
 149
 150       return SelectLike(nullptr);
 151     }
 152
 153     bool isValid() { return I; }
 154     operator bool() { return isValid(); }
 155
 156     Instruction *getI() { return I; }
 157     const Instruction *getI() const { return I; }
 158
 159     Type *getType() const { return I->getType(); }
 160
 161     /// Return the condition for the SelectLike instruction. For example the
 162     /// condition of a select or c in `or(zext(c), x)`
 163     Value *getCondition() const {
 164       if (auto *Sel = dyn_cast<SelectInst>(I))
 165         return Sel->getCondition();
 166       // Or(zext) case
 167       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
 168         Value *X;
 169         if (PatternMatch::match(BO->getOperand(0),
 170                                 m_OneUse(m_ZExt(m_Value(X)))))
 171           return X;
 172         if (PatternMatch::match(BO->getOperand(1),
 173                                 m_OneUse(m_ZExt(m_Value(X)))))
 174           return X;
 175       }
 176
 177       llvm_unreachable("Unhandled case in getCondition");
 178     }
 179
 180     /// Return the true value for the SelectLike instruction. Note this may not
 181     /// exist for all SelectLike instructions. For example, for `or(zext(c), x)`
 182     /// the true value would be `or(x,1)`. As this value does not exist, nullptr
 183     /// is returned.
 184     Value *getTrueValue() const {
 185       if (auto *Sel = dyn_cast<SelectInst>(I))
 186         return Sel->getTrueValue();
 187       // Or(zext) case - The true value is Or(X), so return nullptr as the value
 188       // does not yet exist.
 189       if (isa<BinaryOperator>(I))
 190         return nullptr;
 191
 192       llvm_unreachable("Unhandled case in getTrueValue");
 193     }
 194
 195     /// Return the false value for the SelectLike instruction. For example the
 196     /// getFalseValue of a select or `x` in `or(zext(c), x)` (which is
 197     /// `select(c, x|1, x)`)
 198     Value *getFalseValue() const {
 199       if (auto *Sel = dyn_cast<SelectInst>(I))
 200         return Sel->getFalseValue();
 201       // Or(zext) case - return the operand which is not the zext.
 202       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
 203         Value *X;
 204         if (PatternMatch::match(BO->getOperand(0),
 205                                 m_OneUse(m_ZExt(m_Value(X)))))
 206           return BO->getOperand(1);
 207         if (PatternMatch::match(BO->getOperand(1),
 208                                 m_OneUse(m_ZExt(m_Value(X)))))
 209           return BO->getOperand(0);
 210       }
 211
 212       llvm_unreachable("Unhandled case in getFalseValue");
 213     }
 214
 215     /// Return the NonPredCost cost of the true op, given the costs in
 216     /// InstCostMap. This may need to be generated for select-like instructions.
 217     Scaled64 getTrueOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
 218                            const TargetTransformInfo *TTI) {
 219       if (auto *Sel = dyn_cast<SelectInst>(I))
 220         if (auto *I = dyn_cast<Instruction>(Sel->getTrueValue()))
 221           return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
 222                                          : Scaled64::getZero();
 223
 224       // Or case - add the cost of an extra Or to the cost of the False case.
 225       if (isa<BinaryOperator>(I))
 226         if (auto I = dyn_cast<Instruction>(getFalseValue()))
 227           if (InstCostMap.contains(I)) {
 228             InstructionCost OrCost = TTI->getArithmeticInstrCost(
 229                 Instruction::Or, I->getType(), TargetTransformInfo::TCK_Latency,
 230                 {TargetTransformInfo::OK_AnyValue,
 231                  TargetTransformInfo::OP_None},
 232                 {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2});
 233             return InstCostMap[I].NonPredCost +
 234                    Scaled64::get(*OrCost.getValue());
 235           }
 236
 237       return Scaled64::getZero();
 238     }
 239
 240     /// Return the NonPredCost cost of the false op, given the costs in
 241     /// InstCostMap. This may need to be generated for select-like instructions.
 242     Scaled64
 243     getFalseOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
 244                    const TargetTransformInfo *TTI) {
 245       if (auto *Sel = dyn_cast<SelectInst>(I))
 246         if (auto *I = dyn_cast<Instruction>(Sel->getFalseValue()))
 247           return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
 248                                          : Scaled64::getZero();
 249
 250       // Or case - return the cost of the false case
 251       if (isa<BinaryOperator>(I))
 252         if (auto I = dyn_cast<Instruction>(getFalseValue()))
 253           if (InstCostMap.contains(I))
 254             return InstCostMap[I].NonPredCost;
 255
 256       return Scaled64::getZero();
 257     }
 258   };
 259
 260 private:
 261   // Select groups consist of consecutive select instructions with the same
 262   // condition.
 263   using SelectGroup = SmallVector<SelectLike, 2>;
 264   using SelectGroups = SmallVector<SelectGroup, 2>;
 265
 266   // Converts select instructions of a function to conditional jumps when deemed
 267   // profitable. Returns true if at least one select was converted.
 268   bool optimizeSelects(Function &F);
 269
 270   // Heuristics for determining which select instructions can be profitably
 271   // conveted to branches. Separate heuristics for selects in inner-most loops
 272   // and the rest of code regions (base heuristics for non-inner-most loop
 273   // regions).
 274   void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
 275   void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
 276
 277   // Converts to branches the select groups that were deemed
 278   // profitable-to-convert.
 279   void convertProfitableSIGroups(SelectGroups &ProfSIGroups);
 280
 281   // Splits selects of a given basic block into select groups.
 282   void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
 283
 284   // Determines for which select groups it is profitable converting to branches
 285   // (base and inner-most-loop heuristics).
 286   void findProfitableSIGroupsBase(SelectGroups &SIGroups,
 287                                   SelectGroups &ProfSIGroups);
 288   void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
 289                                         SelectGroups &ProfSIGroups);
 290
 291   // Determines if a select group should be converted to a branch (base
 292   // heuristics).
 293   bool isConvertToBranchProfitableBase(const SelectGroup &ASI);
 294
 295   // Returns true if there are expensive instructions in the cold value
 296   // operand's (if any) dependence slice of any of the selects of the given
 297   // group.
 298   bool hasExpensiveColdOperand(const SelectGroup &ASI);
 299
 300   // For a given source instruction, collect its backwards dependence slice
 301   // consisting of instructions exclusively computed for producing the operands
 302   // of the source instruction.
 303   void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
 304                              Instruction *SI, bool ForSinking = false);
 305
 306   // Returns true if the condition of the select is highly predictable.
 307   bool isSelectHighlyPredictable(const SelectLike SI);
 308
 309   // Loop-level checks to determine if a non-predicated version (with branches)
 310   // of the given loop is more profitable than its predicated version.
 311   bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);
 312
 313   // Computes instruction and loop-critical-path costs for both the predicated
 314   // and non-predicated version of the given loop.
 315   bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
 316                         DenseMap<const Instruction *, CostInfo> &InstCostMap,
 317                         CostInfo *LoopCost);
 318
 319   // Returns a set of all the select instructions in the given select groups.
 320   SmallDenseMap<const Instruction *, SelectLike, 2>
 321   getSImap(const SelectGroups &SIGroups);
 322
 323   // Returns the latency cost of a given instruction.
 324   std::optional<uint64_t> computeInstCost(const Instruction *I);
 325
 326   // Returns the misprediction cost of a given select when converted to branch.
 327   Scaled64 getMispredictionCost(const SelectLike SI, const Scaled64 CondCost);
 328
 329   // Returns the cost of a branch when the prediction is correct.
 330   Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
 331                                 const SelectLike SI);
 332
 333   // Returns true if the target architecture supports lowering a given select.
 334   bool isSelectKindSupported(const SelectLike SI);
 335 };
 336
 337 class SelectOptimize : public FunctionPass {
 338   SelectOptimizeImpl Impl;
 339
 340 public:
 341   static char ID;
 342
 343   SelectOptimize() : FunctionPass(ID) {
 344     initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
 345   }
 346
 347   bool runOnFunction(Function &F) override {
 348     return Impl.runOnFunction(F, *this);
 349   }
 350
 351   void getAnalysisUsage(AnalysisUsage &AU) const override {
 352     AU.addRequired<ProfileSummaryInfoWrapperPass>();
 353     AU.addRequired<TargetPassConfig>();
 354     AU.addRequired<TargetTransformInfoWrapperPass>();
 355     AU.addRequired<LoopInfoWrapperPass>();
 356     AU.addRequired<BlockFrequencyInfoWrapperPass>();
 357     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
 358   }
 359 };
 360
 361 } // namespace
 362
 363 PreservedAnalyses SelectOptimizePass::run(Function &F,
 364                                           FunctionAnalysisManager &FAM) {
 365   SelectOptimizeImpl Impl(TM);
 366   return Impl.run(F, FAM);
 367 }
 368
 369 char SelectOptimize::ID = 0;
 370
 371 INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
 372                       false)
 373 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 374 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
 375 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
 376 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 377 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
 378 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
 379 INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
 380                     false)
 381
 382 FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }
 383
 384 PreservedAnalyses SelectOptimizeImpl::run(Function &F,
 385                                           FunctionAnalysisManager &FAM) {
 386   TSI = TM->getSubtargetImpl(F);
 387   TLI = TSI->getTargetLowering();
 388
 389   // If none of the select types are supported then skip this pass.
 390   // This is an optimization pass. Legality issues will be handled by
 391   // instruction selection.
 392   if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
 393       !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
 394       !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
 395     return PreservedAnalyses::all();
 396
 397   TTI = &FAM.getResult<TargetIRAnalysis>(F);
 398   if (!TTI->enableSelectOptimize())
 399     return PreservedAnalyses::all();
 400
 401   PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F)
 402             .getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
 403   assert(PSI && "This pass requires module analysis pass `profile-summary`!");
 404   BFI = &FAM.getResult<BlockFrequencyAnalysis>(F);
 405
 406   // When optimizing for size, selects are preferable over branches.
 407   if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI))
 408     return PreservedAnalyses::all();
 409
 410   LI = &FAM.getResult<LoopAnalysis>(F);
 411   ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
 412   TSchedModel.init(TSI);
 413
 414   bool Changed = optimizeSelects(F);
 415   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
 416 }
 417
 418 bool SelectOptimizeImpl::runOnFunction(Function &F, Pass &P) {
 419   TM = &P.getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
 420   TSI = TM->getSubtargetImpl(F);
 421   TLI = TSI->getTargetLowering();
 422
 423   // If none of the select types are supported then skip this pass.
 424   // This is an optimization pass. Legality issues will be handled by
 425   // instruction selection.
 426   if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
 427       !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
 428       !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
 429     return false;
 430
 431   TTI = &P.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 432
 433   if (!TTI->enableSelectOptimize())
 434     return false;
 435
 436   LI = &P.getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 437   BFI = &P.getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
 438   PSI = &P.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
 439   ORE = &P.getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
 440   TSchedModel.init(TSI);
 441
 442   // When optimizing for size, selects are preferable over branches.
 443   if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI))
 444     return false;
 445
 446   return optimizeSelects(F);
 447 }
 448
 449 bool SelectOptimizeImpl::optimizeSelects(Function &F) {
 450   // Determine for which select groups it is profitable converting to branches.
 451   SelectGroups ProfSIGroups;
 452   // Base heuristics apply only to non-loops and outer loops.
 453   optimizeSelectsBase(F, ProfSIGroups);
 454   // Separate heuristics for inner-most loops.
 455   optimizeSelectsInnerLoops(F, ProfSIGroups);
 456
 457   // Convert to branches the select groups that were deemed
 458   // profitable-to-convert.
 459   convertProfitableSIGroups(ProfSIGroups);
 460
 461   // Code modified if at least one select group was converted.
 462   return !ProfSIGroups.empty();
 463 }
 464
 465 void SelectOptimizeImpl::optimizeSelectsBase(Function &F,
 466                                              SelectGroups &ProfSIGroups) {
 467   // Collect all the select groups.
 468   SelectGroups SIGroups;
 469   for (BasicBlock &BB : F) {
 470     // Base heuristics apply only to non-loops and outer loops.
 471     Loop *L = LI->getLoopFor(&BB);
 472     if (L && L->isInnermost())
 473       continue;
 474     collectSelectGroups(BB, SIGroups);
 475   }
 476
 477   // Determine for which select groups it is profitable converting to branches.
 478   findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
 479 }
 480
 481 void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F,
 482                                                    SelectGroups &ProfSIGroups) {
 483   SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
 484   // Need to check size on each iteration as we accumulate child loops.
 485   for (unsigned long i = 0; i < Loops.size(); ++i)
 486     for (Loop *ChildL : Loops[i]->getSubLoops())
 487       Loops.push_back(ChildL);
 488
 489   for (Loop *L : Loops) {
 490     if (!L->isInnermost())
 491       continue;
 492
 493     SelectGroups SIGroups;
 494     for (BasicBlock *BB : L->getBlocks())
 495       collectSelectGroups(*BB, SIGroups);
 496
 497     findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
 498   }
 499 }
 500
 501 /// If \p isTrue is true, return the true value of \p SI, otherwise return
 502 /// false value of \p SI. If the true/false value of \p SI is defined by any
 503 /// select instructions in \p Selects, look through the defining select
 504 /// instruction until the true/false value is not defined in \p Selects.
 505 static Value *
 506 getTrueOrFalseValue(SelectOptimizeImpl::SelectLike SI, bool isTrue,
 507                     const SmallPtrSet<const Instruction *, 2> &Selects,
 508                     IRBuilder<> &IB) {
 509   Value *V = nullptr;
 510   for (SelectInst *DefSI = dyn_cast<SelectInst>(SI.getI());
 511        DefSI != nullptr && Selects.count(DefSI);
 512        DefSI = dyn_cast<SelectInst>(V)) {
 513     assert(DefSI->getCondition() == SI.getCondition() &&
 514            "The condition of DefSI does not match with SI");
 515     V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
 516   }
 517
 518   if (isa<BinaryOperator>(SI.getI())) {
 519     assert(SI.getI()->getOpcode() == Instruction::Or &&
 520            "Only currently handling Or instructions.");
 521     V = SI.getFalseValue();
 522     if (isTrue)
 523       V = IB.CreateOr(V, ConstantInt::get(V->getType(), 1));
 524   }
 525
 526   assert(V && "Failed to get select true/false value");
 527   return V;
 528 }
 529
 530 void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
 531   for (SelectGroup &ASI : ProfSIGroups) {
 532     // The code transformation here is a modified version of the sinking
 533     // transformation in CodeGenPrepare::optimizeSelectInst with a more
 534     // aggressive strategy of which instructions to sink.
 535     //
 536     // TODO: eliminate the redundancy of logic transforming selects to branches
 537     // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
 538     // selects for all cases (with and without profile information).
 539
 540     // Transform a sequence like this:
 541     //    start:
 542     //       %cmp = cmp uge i32 %a, %b
 543     //       %sel = select i1 %cmp, i32 %c, i32 %d
 544     //
 545     // Into:
 546     //    start:
 547     //       %cmp = cmp uge i32 %a, %b
 548     //       %cmp.frozen = freeze %cmp
 549     //       br i1 %cmp.frozen, label %select.true, label %select.false
 550     //    select.true:
 551     //       br label %select.end
 552     //    select.false:
 553     //       br label %select.end
 554     //    select.end:
 555     //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
 556     //
 557     // %cmp should be frozen, otherwise it may introduce undefined behavior.
 558     // In addition, we may sink instructions that produce %c or %d into the
 559     // destination(s) of the new branch.
 560     // If the true or false blocks do not contain a sunken instruction, that
 561     // block and its branch may be optimized away. In that case, one side of the
 562     // first branch will point directly to select.end, and the corresponding PHI
 563     // predecessor block will be the start block.
 564
 565     // Find all the instructions that can be soundly sunk to the true/false
 566     // blocks. These are instructions that are computed solely for producing the
 567     // operands of the select instructions in the group and can be sunk without
 568     // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
 569     // with side effects).
 570     SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
 571     typedef std::stack<Instruction *>::size_type StackSizeType;
 572     StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
 573     for (SelectLike SI : ASI) {
 574       // For each select, compute the sinkable dependence chains of the true and
 575       // false operands.
 576       if (auto *TI = dyn_cast_or_null<Instruction>(SI.getTrueValue())) {
 577         std::stack<Instruction *> TrueSlice;
 578         getExclBackwardsSlice(TI, TrueSlice, SI.getI(), true);
 579         maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
 580         TrueSlices.push_back(TrueSlice);
 581       }
 582       if (auto *FI = dyn_cast_or_null<Instruction>(SI.getFalseValue())) {
 583         if (isa<SelectInst>(SI.getI()) || !FI->hasOneUse()) {
 584           std::stack<Instruction *> FalseSlice;
 585           getExclBackwardsSlice(FI, FalseSlice, SI.getI(), true);
 586           maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
 587           FalseSlices.push_back(FalseSlice);
 588         }
 589       }
 590     }
 591     // In the case of multiple select instructions in the same group, the order
 592     // of non-dependent instructions (instructions of different dependence
 593     // slices) in the true/false blocks appears to affect performance.
 594     // Interleaving the slices seems to experimentally be the optimal approach.
 595     // This interleaving scheduling allows for more ILP (with a natural downside
 596     // of increasing a bit register pressure) compared to a simple ordering of
 597     // one whole chain after another. One would expect that this ordering would
 598     // not matter since the scheduling in the backend of the compiler  would
 599     // take care of it, but apparently the scheduler fails to deliver optimal
 600     // ILP with a naive ordering here.
 601     SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
 602     for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
 603       for (auto &S : TrueSlices) {
 604         if (!S.empty()) {
 605           TrueSlicesInterleaved.push_back(S.top());
 606           S.pop();
 607         }
 608       }
 609     }
 610     for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
 611       for (auto &S : FalseSlices) {
 612         if (!S.empty()) {
 613           FalseSlicesInterleaved.push_back(S.top());
 614           S.pop();
 615         }
 616       }
 617     }
 618
 619     // We split the block containing the select(s) into two blocks.
 620     SelectLike SI = ASI.front();
 621     SelectLike LastSI = ASI.back();
 622     BasicBlock *StartBlock = SI.getI()->getParent();
 623     BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI.getI()));
 624     BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
 625     BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock));
 626     // Delete the unconditional branch that was just created by the split.
 627     StartBlock->getTerminator()->eraseFromParent();
 628
 629     // Move any debug/pseudo instructions that were in-between the select
 630     // group to the newly-created end block.
 631     SmallVector<Instruction *, 2> DebugPseudoINS;
 632     auto DIt = SI.getI()->getIterator();
 633     while (&*DIt != LastSI.getI()) {
 634       if (DIt->isDebugOrPseudoInst())
 635         DebugPseudoINS.push_back(&*DIt);
 636       DIt++;
 637     }
 638     for (auto *DI : DebugPseudoINS) {
 639       DI->moveBeforePreserving(&*EndBlock->getFirstInsertionPt());
 640     }
 641
 642     // Duplicate implementation for DPValues, the non-instruction debug-info
 643     // record. Helper lambda for moving DPValues to the end block.
 644     auto TransferDPValues = [&](Instruction &I) {
 645       for (auto &DPValue : llvm::make_early_inc_range(I.getDbgValueRange())) {
 646         DPValue.removeFromParent();
 647         EndBlock->insertDPValueBefore(&DPValue,
 648                                       EndBlock->getFirstInsertionPt());
 649       }
 650     };
 651
 652     // Iterate over all instructions in between SI and LastSI, not including
 653     // SI itself. These are all the variable assignments that happen "in the
 654     // middle" of the select group.
 655     auto R = make_range(std::next(SI.getI()->getIterator()),
 656                         std::next(LastSI.getI()->getIterator()));
 657     llvm::for_each(R, TransferDPValues);
 658
 659     // These are the new basic blocks for the conditional branch.
 660     // At least one will become an actual new basic block.
 661     BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
 662     BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
 663     if (!TrueSlicesInterleaved.empty()) {
 664       TrueBlock = BasicBlock::Create(EndBlock->getContext(), "select.true.sink",
 665                                      EndBlock->getParent(), EndBlock);
 666       TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
 667       TrueBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
 668       for (Instruction *TrueInst : TrueSlicesInterleaved)
 669         TrueInst->moveBefore(TrueBranch);
 670     }
 671     if (!FalseSlicesInterleaved.empty()) {
 672       FalseBlock =
 673           BasicBlock::Create(EndBlock->getContext(), "select.false.sink",
 674                              EndBlock->getParent(), EndBlock);
 675       FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
 676       FalseBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
 677       for (Instruction *FalseInst : FalseSlicesInterleaved)
 678         FalseInst->moveBefore(FalseBranch);
 679     }
 680     // If there was nothing to sink, then arbitrarily choose the 'false' side
 681     // for a new input value to the PHI.
 682     if (TrueBlock == FalseBlock) {
 683       assert(TrueBlock == nullptr &&
 684              "Unexpected basic block transform while optimizing select");
 685
 686       FalseBlock = BasicBlock::Create(StartBlock->getContext(), "select.false",
 687                                       EndBlock->getParent(), EndBlock);
 688       auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
 689       FalseBranch->setDebugLoc(SI.getI()->getDebugLoc());
 690     }
 691
 692     // Insert the real conditional branch based on the original condition.
 693     // If we did not create a new block for one of the 'true' or 'false' paths
 694     // of the condition, it means that side of the branch goes to the end block
 695     // directly and the path originates from the start block from the point of
 696     // view of the new PHI.
 697     BasicBlock *TT, *FT;
 698     if (TrueBlock == nullptr) {
 699       TT = EndBlock;
 700       FT = FalseBlock;
 701       TrueBlock = StartBlock;
 702     } else if (FalseBlock == nullptr) {
 703       TT = TrueBlock;
 704       FT = EndBlock;
 705       FalseBlock = StartBlock;
 706     } else {
 707       TT = TrueBlock;
 708       FT = FalseBlock;
 709     }
 710     IRBuilder<> IB(SI.getI());
 711     auto *CondFr = IB.CreateFreeze(SI.getCondition(),
 712                                    SI.getCondition()->getName() + ".frozen");
 713
 714     SmallPtrSet<const Instruction *, 2> INS;
 715     for (auto SI : ASI)
 716       INS.insert(SI.getI());
 717
 718     // Use reverse iterator because later select may use the value of the
 719     // earlier select, and we need to propagate value through earlier select
 720     // to get the PHI operand.
 721     for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
 722       SelectLike SI = *It;
 723       // The select itself is replaced with a PHI Node.
 724       PHINode *PN = PHINode::Create(SI.getType(), 2, "");
 725       PN->insertBefore(EndBlock->begin());
 726       PN->takeName(SI.getI());
 727       PN->addIncoming(getTrueOrFalseValue(SI, true, INS, IB), TrueBlock);
 728       PN->addIncoming(getTrueOrFalseValue(SI, false, INS, IB), FalseBlock);
 729       PN->setDebugLoc(SI.getI()->getDebugLoc());
 730       SI.getI()->replaceAllUsesWith(PN);
 731       INS.erase(SI.getI());
 732       ++NumSelectsConverted;
 733     }
 734     IB.CreateCondBr(CondFr, TT, FT, SI.getI());
 735
 736     // Remove the old select instructions, now that they are not longer used.
 737     for (auto SI : ASI)
 738       SI.getI()->eraseFromParent();
 739   }
 740 }
 741
 742 void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
 743                                              SelectGroups &SIGroups) {
 744   BasicBlock::iterator BBIt = BB.begin();
 745   while (BBIt != BB.end()) {
 746     Instruction *I = &*BBIt++;
 747     if (SelectLike SI = SelectLike::match(I)) {
 748       if (!TTI->shouldTreatInstructionLikeSelect(I))
 749         continue;
 750
 751       SelectGroup SIGroup;
 752       SIGroup.push_back(SI);
 753       while (BBIt != BB.end()) {
 754         Instruction *NI = &*BBIt;
 755         // Debug/pseudo instructions should be skipped and not prevent the
 756         // formation of a select group.
 757         if (NI->isDebugOrPseudoInst()) {
 758           ++BBIt;
 759           continue;
 760         }
 761         // We only allow selects in the same group, not other select-like
 762         // instructions.
 763         if (!isa<SelectInst>(NI))
 764           break;
 765
 766         SelectLike NSI = SelectLike::match(NI);
 767         if (NSI && SI.getCondition() == NSI.getCondition()) {
 768           SIGroup.push_back(NSI);
 769         } else
 770           break;
 771         ++BBIt;
 772       }
 773
 774       // If the select type is not supported, no point optimizing it.
 775       // Instruction selection will take care of it.
 776       if (!isSelectKindSupported(SI))
 777         continue;
 778
 779       SIGroups.push_back(SIGroup);
 780     }
 781   }
 782 }
 783
 784 void SelectOptimizeImpl::findProfitableSIGroupsBase(
 785     SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
 786   for (SelectGroup &ASI : SIGroups) {
 787     ++NumSelectOptAnalyzed;
 788     if (isConvertToBranchProfitableBase(ASI))
 789       ProfSIGroups.push_back(ASI);
 790   }
 791 }
 792
 793 static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE,
 794                                DiagnosticInfoOptimizationBase &Rem) {
 795   LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n");
 796   ORE->emit(Rem);
 797 }
 798
 799 void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops(
 800     const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
 801   NumSelectOptAnalyzed += SIGroups.size();
 802   // For each select group in an inner-most loop,
 803   // a branch is more preferable than a select/conditional-move if:
 804   // i) conversion to branches for all the select groups of the loop satisfies
 805   //    loop-level heuristics including reducing the loop's critical path by
 806   //    some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and
 807   // ii) the total cost of the select group is cheaper with a branch compared
 808   //     to its predicated version. The cost is in terms of latency and the cost
 809   //     of a select group is the cost of its most expensive select instruction
 810   //     (assuming infinite resources and thus fully leveraging available ILP).
 811
 812   DenseMap<const Instruction *, CostInfo> InstCostMap;
 813   CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
 814                           {Scaled64::getZero(), Scaled64::getZero()}};
 815   if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
 816       !checkLoopHeuristics(L, LoopCost)) {
 817     return;
 818   }
 819
 820   for (SelectGroup &ASI : SIGroups) {
 821     // Assuming infinite resources, the cost of a group of instructions is the
 822     // cost of the most expensive instruction of the group.
 823     Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
 824     for (SelectLike SI : ASI) {
 825       SelectCost = std::max(SelectCost, InstCostMap[SI.getI()].PredCost);
 826       BranchCost = std::max(BranchCost, InstCostMap[SI.getI()].NonPredCost);
 827     }
 828     if (BranchCost < SelectCost) {
 829       OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front().getI());
 830       OR << "Profitable to convert to branch (loop analysis). BranchCost="
 831          << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
 832          << ". ";
 833       EmitAndPrintRemark(ORE, OR);
 834       ++NumSelectConvertedLoop;
 835       ProfSIGroups.push_back(ASI);
 836     } else {
 837       OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
 838                                       ASI.front().getI());
 839       ORmiss << "Select is more profitable (loop analysis). BranchCost="
 840              << BranchCost.toString()
 841              << ", SelectCost=" << SelectCost.toString() << ". ";
 842       EmitAndPrintRemark(ORE, ORmiss);
 843     }
 844   }
 845 }
 846
 847 bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
 848     const SelectGroup &ASI) {
 849   SelectLike SI = ASI.front();
 850   LLVM_DEBUG(dbgs() << "Analyzing select group containing " << SI.getI()
 851                     << "\n");
 852   OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI.getI());
 853   OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI.getI());
 854
 855   // Skip cold basic blocks. Better to optimize for size for cold blocks.
 856   if (PSI->isColdBlock(SI.getI()->getParent(), BFI)) {
 857     ++NumSelectColdBB;
 858     ORmiss << "Not converted to branch because of cold basic block. ";
 859     EmitAndPrintRemark(ORE, ORmiss);
 860     return false;
 861   }
 862
 863   // If unpredictable, branch form is less profitable.
 864   if (SI.getI()->getMetadata(LLVMContext::MD_unpredictable)) {
 865     ++NumSelectUnPred;
 866     ORmiss << "Not converted to branch because of unpredictable branch. ";
 867     EmitAndPrintRemark(ORE, ORmiss);
 868     return false;
 869   }
 870
 871   // If highly predictable, branch form is more profitable, unless a
 872   // predictable select is inexpensive in the target architecture.
 873   if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
 874     ++NumSelectConvertedHighPred;
 875     OR << "Converted to branch because of highly predictable branch. ";
 876     EmitAndPrintRemark(ORE, OR);
 877     return true;
 878   }
 879
 880   // Look for expensive instructions in the cold operand's (if any) dependence
 881   // slice of any of the selects in the group.
 882   if (hasExpensiveColdOperand(ASI)) {
 883     ++NumSelectConvertedExpColdOperand;
 884     OR << "Converted to branch because of expensive cold operand.";
 885     EmitAndPrintRemark(ORE, OR);
 886     return true;
 887   }
 888
 889   ORmiss << "Not profitable to convert to branch (base heuristic).";
 890   EmitAndPrintRemark(ORE, ORmiss);
 891   return false;
 892 }
 893
 894 static InstructionCost divideNearest(InstructionCost Numerator,
 895                                      uint64_t Denominator) {
 896   return (Numerator + (Denominator / 2)) / Denominator;
 897 }
 898
 899 static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI,
 900                                  uint64_t &TrueVal, uint64_t &FalseVal) {
 901   if (isa<SelectInst>(SI.getI()))
 902     return extractBranchWeights(*SI.getI(), TrueVal, FalseVal);
 903   return false;
 904 }
 905
 906 bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup &ASI) {
 907   bool ColdOperand = false;
 908   uint64_t TrueWeight, FalseWeight, TotalWeight;
 909   if (extractBranchWeights(ASI.front(), TrueWeight, FalseWeight)) {
 910     uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
 911     TotalWeight = TrueWeight + FalseWeight;
 912     // Is there a path with frequency <ColdOperandThreshold% (default:20%) ?
 913     ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
 914   } else if (PSI->hasProfileSummary()) {
 915     OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
 916                                     ASI.front().getI());
 917     ORmiss << "Profile data available but missing branch-weights metadata for "
 918               "select instruction. ";
 919     EmitAndPrintRemark(ORE, ORmiss);
 920   }
 921   if (!ColdOperand)
 922     return false;
 923   // Check if the cold path's dependence slice is expensive for any of the
 924   // selects of the group.
 925   for (SelectLike SI : ASI) {
 926     Instruction *ColdI = nullptr;
 927     uint64_t HotWeight;
 928     if (TrueWeight < FalseWeight) {
 929       ColdI = dyn_cast_or_null<Instruction>(SI.getTrueValue());
 930       HotWeight = FalseWeight;
 931     } else {
 932       ColdI = dyn_cast_or_null<Instruction>(SI.getFalseValue());
 933       HotWeight = TrueWeight;
 934     }
 935     if (ColdI) {
 936       std::stack<Instruction *> ColdSlice;
 937       getExclBackwardsSlice(ColdI, ColdSlice, SI.getI());
 938       InstructionCost SliceCost = 0;
 939       while (!ColdSlice.empty()) {
 940         SliceCost += TTI->getInstructionCost(ColdSlice.top(),
 941                                              TargetTransformInfo::TCK_Latency);
 942         ColdSlice.pop();
 943       }
 944       // The colder the cold value operand of the select is the more expensive
 945       // the cmov becomes for computing the cold value operand every time. Thus,
 946       // the colder the cold operand is the more its cost counts.
 947       // Get nearest integer cost adjusted for coldness.
 948       InstructionCost AdjSliceCost =
 949           divideNearest(SliceCost * HotWeight, TotalWeight);
 950       if (AdjSliceCost >=
 951           ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
 952         return true;
 953     }
 954   }
 955   return false;
 956 }
 957
 958 // Check if it is safe to move LoadI next to the SI.
 959 // Conservatively assume it is safe only if there is no instruction
 960 // modifying memory in-between the load and the select instruction.
 961 static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) {
 962   // Assume loads from different basic blocks are unsafe to move.
 963   if (LoadI->getParent() != SI->getParent())
 964     return false;
 965   auto It = LoadI->getIterator();
 966   while (&*It != SI) {
 967     if (It->mayWriteToMemory())
 968       return false;
 969     It++;
 970   }
 971   return true;
 972 }
 973
 974 // For a given source instruction, collect its backwards dependence slice
 975 // consisting of instructions exclusively computed for the purpose of producing
 976 // the operands of the source instruction. As an approximation
 977 // (sufficiently-accurate in practice), we populate this set with the
 978 // instructions of the backwards dependence slice that only have one-use and
 979 // form an one-use chain that leads to the source instruction.
 980 void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I,
 981                                                std::stack<Instruction *> &Slice,
 982                                                Instruction *SI,
 983                                                bool ForSinking) {
 984   SmallPtrSet<Instruction *, 2> Visited;
 985   std::queue<Instruction *> Worklist;
 986   Worklist.push(I);
 987   while (!Worklist.empty()) {
 988     Instruction *II = Worklist.front();
 989     Worklist.pop();
 990
 991     // Avoid cycles.
 992     if (!Visited.insert(II).second)
 993       continue;
 994
 995     if (!II->hasOneUse())
 996       continue;
 997
 998     // Cannot soundly sink instructions with side-effects.
 999     // Terminator or phi instructions cannot be sunk.
1000     // Avoid sinking other select instructions (should be handled separetely).
1001     if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
1002                        isa<SelectInst>(II) || isa<PHINode>(II)))
1003       continue;
1004
1005     // Avoid sinking loads in order not to skip state-modifying instructions,
1006     // that may alias with the loaded address.
1007     // Only allow sinking of loads within the same basic block that are
1008     // conservatively proven to be safe.
1009     if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI))
1010       continue;
1011
1012     // Avoid considering instructions with less frequency than the source
1013     // instruction (i.e., avoid colder code regions of the dependence slice).
1014     if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
1015       continue;
1016
1017     // Eligible one-use instruction added to the dependence slice.
1018     Slice.push(II);
1019
1020     // Explore all the operands of the current instruction to expand the slice.
1021     for (unsigned k = 0; k < II->getNumOperands(); ++k)
1022       if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
1023         Worklist.push(OpI);
1024   }
1025 }
1026
1027 bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI) {
1028   uint64_t TrueWeight, FalseWeight;
1029   if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
1030     uint64_t Max = std::max(TrueWeight, FalseWeight);
1031     uint64_t Sum = TrueWeight + FalseWeight;
1032     if (Sum != 0) {
1033       auto Probability = BranchProbability::getBranchProbability(Max, Sum);
1034       if (Probability > TTI->getPredictableBranchThreshold())
1035         return true;
1036     }
1037   }
1038   return false;
1039 }
1040
1041 bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L,
1042                                              const CostInfo LoopCost[2]) {
1043   // Loop-level checks to determine if a non-predicated version (with branches)
1044   // of the loop is more profitable than its predicated version.
1045
1046   if (DisableLoopLevelHeuristics)
1047     return true;
1048
1049   OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
1050                                    L->getHeader()->getFirstNonPHI());
1051
1052   if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
1053       LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
1054     ORmissL << "No select conversion in the loop due to no reduction of loop's "
1055                "critical path. ";
1056     EmitAndPrintRemark(ORE, ORmissL);
1057     return false;
1058   }
1059
1060   Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
1061                       LoopCost[1].PredCost - LoopCost[1].NonPredCost};
1062
1063   // Profitably converting to branches need to reduce the loop's critical path
1064   // by at least some threshold (absolute gain of GainCycleThreshold cycles and
1065   // relative gain of 12.5%).
1066   if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
1067       Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
1068     Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
1069     ORmissL << "No select conversion in the loop due to small reduction of "
1070                "loop's critical path. Gain="
1071             << Gain[1].toString()
1072             << ", RelativeGain=" << RelativeGain.toString() << "%. ";
1073     EmitAndPrintRemark(ORE, ORmissL);
1074     return false;
1075   }
1076
1077   // If the loop's critical path involves loop-carried dependences, the gradient
1078   // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
1079   // This check ensures that the latency reduction for the loop's critical path
1080   // keeps decreasing with sufficient rate beyond the two analyzed loop
1081   // iterations.
1082   if (Gain[1] > Gain[0]) {
1083     Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
1084                             (LoopCost[1].PredCost - LoopCost[0].PredCost);
1085     if (GradientGain < Scaled64::get(GainGradientThreshold)) {
1086       ORmissL << "No select conversion in the loop due to small gradient gain. "
1087                  "GradientGain="
1088               << GradientGain.toString() << "%. ";
1089       EmitAndPrintRemark(ORE, ORmissL);
1090       return false;
1091     }
1092   }
1093   // If the gain decreases it is not profitable to convert.
1094   else if (Gain[1] < Gain[0]) {
1095     ORmissL
1096         << "No select conversion in the loop due to negative gradient gain. ";
1097     EmitAndPrintRemark(ORE, ORmissL);
1098     return false;
1099   }
1100
1101   // Non-predicated version of the loop is more profitable than its
1102   // predicated version.
1103   return true;
1104 }
1105
1106 // Computes instruction and loop-critical-path costs for both the predicated
1107 // and non-predicated version of the given loop.
1108 // Returns false if unable to compute these costs due to invalid cost of loop
1109 // instruction(s).
1110 bool SelectOptimizeImpl::computeLoopCosts(
1111     const Loop *L, const SelectGroups &SIGroups,
1112     DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
1113   LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop "
1114                     << L->getHeader()->getName() << "\n");
1115   const auto &SImap = getSImap(SIGroups);
1116   // Compute instruction and loop-critical-path costs across two iterations for
1117   // both predicated and non-predicated version.
1118   const unsigned Iterations = 2;
1119   for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
1120     // Cost of the loop's critical path.
1121     CostInfo &MaxCost = LoopCost[Iter];
1122     for (BasicBlock *BB : L->getBlocks()) {
1123       for (const Instruction &I : *BB) {
1124         if (I.isDebugOrPseudoInst())
1125           continue;
1126         // Compute the predicated and non-predicated cost of the instruction.
1127         Scaled64 IPredCost = Scaled64::getZero(),
1128                  INonPredCost = Scaled64::getZero();
1129
1130         // Assume infinite resources that allow to fully exploit the available
1131         // instruction-level parallelism.
1132         // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
1133         for (const Use &U : I.operands()) {
1134           auto UI = dyn_cast<Instruction>(U.get());
1135           if (!UI)
1136             continue;
1137           if (InstCostMap.count(UI)) {
1138             IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
1139             INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
1140           }
1141         }
1142         auto ILatency = computeInstCost(&I);
1143         if (!ILatency) {
1144           OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
1145           ORmissL << "Invalid instruction cost preventing analysis and "
1146                      "optimization of the inner-most loop containing this "
1147                      "instruction. ";
1148           EmitAndPrintRemark(ORE, ORmissL);
1149           return false;
1150         }
1151         IPredCost += Scaled64::get(*ILatency);
1152         INonPredCost += Scaled64::get(*ILatency);
1153
1154         // For a select that can be converted to branch,
1155         // compute its cost as a branch (non-predicated cost).
1156         //
1157         // BranchCost = PredictedPathCost + MispredictCost
1158         // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
1159         // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
1160         if (SImap.contains(&I)) {
1161           auto SI = SImap.at(&I);
1162           Scaled64 TrueOpCost = SI.getTrueOpCost(InstCostMap, TTI);
1163           Scaled64 FalseOpCost = SI.getFalseOpCost(InstCostMap, TTI);
1164           Scaled64 PredictedPathCost =
1165               getPredictedPathCost(TrueOpCost, FalseOpCost, SI);
1166
1167           Scaled64 CondCost = Scaled64::getZero();
1168           if (auto *CI = dyn_cast<Instruction>(SI.getCondition()))
1169             if (InstCostMap.count(CI))
1170               CondCost = InstCostMap[CI].NonPredCost;
1171           Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);
1172
1173           INonPredCost = PredictedPathCost + MispredictCost;
1174         }
1175         LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/"
1176                           << INonPredCost << " for " << I << "\n");
1177
1178         InstCostMap[&I] = {IPredCost, INonPredCost};
1179         MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
1180         MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
1181       }
1182     }
1183     LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1
1184                       << " MaxCost = " << MaxCost.PredCost << " "
1185                       << MaxCost.NonPredCost << "\n");
1186   }
1187   return true;
1188 }
1189
1190 SmallDenseMap<const Instruction *, SelectOptimizeImpl::SelectLike, 2>
1191 SelectOptimizeImpl::getSImap(const SelectGroups &SIGroups) {
1192   SmallDenseMap<const Instruction *, SelectLike, 2> SImap;
1193   for (const SelectGroup &ASI : SIGroups)
1194     for (SelectLike SI : ASI)
1195       SImap.try_emplace(SI.getI(), SI);
1196   return SImap;
1197 }
1198
1199 std::optional<uint64_t>
1200 SelectOptimizeImpl::computeInstCost(const Instruction *I) {
1201   InstructionCost ICost =
1202       TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
1203   if (auto OC = ICost.getValue())
1204     return std::optional<uint64_t>(*OC);
1205   return std::nullopt;
1206 }
1207
1208 ScaledNumber<uint64_t>
1209 SelectOptimizeImpl::getMispredictionCost(const SelectLike SI,
1210                                          const Scaled64 CondCost) {
1211   uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
1212
1213   // Account for the default misprediction rate when using a branch
1214   // (conservatively set to 25% by default).
1215   uint64_t MispredictRate = MispredictDefaultRate;
1216   // If the select condition is obviously predictable, then the misprediction
1217   // rate is zero.
1218   if (isSelectHighlyPredictable(SI))
1219     MispredictRate = 0;
1220
1221   // CondCost is included to account for cases where the computation of the
1222   // condition is part of a long dependence chain (potentially loop-carried)
1223   // that would delay detection of a misprediction and increase its cost.
1224   Scaled64 MispredictCost =
1225       std::max(Scaled64::get(MispredictPenalty), CondCost) *
1226       Scaled64::get(MispredictRate);
1227   MispredictCost /= Scaled64::get(100);
1228
1229   return MispredictCost;
1230 }
1231
1232 // Returns the cost of a branch when the prediction is correct.
1233 // TrueCost * TrueProbability + FalseCost * FalseProbability.
1234 ScaledNumber<uint64_t>
1235 SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
1236                                          const SelectLike SI) {
1237   Scaled64 PredPathCost;
1238   uint64_t TrueWeight, FalseWeight;
1239   if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
1240     uint64_t SumWeight = TrueWeight + FalseWeight;
1241     if (SumWeight != 0) {
1242       PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
1243                      FalseCost * Scaled64::get(FalseWeight);
1244       PredPathCost /= Scaled64::get(SumWeight);
1245       return PredPathCost;
1246     }
1247   }
1248   // Without branch weight metadata, we assume 75% for the one path and 25% for
1249   // the other, and pick the result with the biggest cost.
1250   PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost,
1251                           FalseCost * Scaled64::get(3) + TrueCost);
1252   PredPathCost /= Scaled64::get(4);
1253   return PredPathCost;
1254 }
1255
1256 bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI) {
1257   bool VectorCond = !SI.getCondition()->getType()->isIntegerTy(1);
1258   if (VectorCond)
1259     return false;
1260   TargetLowering::SelectSupportKind SelectKind;
1261   if (SI.getType()->isVectorTy())
1262     SelectKind = TargetLowering::ScalarCondVectorVal;
1263   else
1264     SelectKind = TargetLowering::ScalarValSelect;
1265   return TLI->isSelectSupported(SelectKind);
1266 }