//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;
extern cl::opt<bool> EnableVPlanNativePath;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenMemoryInstructionSC: {
    return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}
bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}
bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenMemoryInstructionSC:
    assert(cast<VPWidenMemoryInstructionRecipe>(this)
               ->getIngredient()
               .mayHaveSideEffects() == mayWriteToMemory() &&
           "mayHaveSideEffects result for ingredient differs from this "
           "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}
void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (vputils::isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  assert(MiddleVPBB->getNumSuccessors() == 0 &&
         "the middle block must not have any successors");
  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   MiddleBB);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif
void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insert(InsertPos->getIterator(), this);
}
void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  Parent = &BB;
  BB.getRecipeList().insert(I, this);
}
void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
}
void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}
iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}
void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}
void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}
FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}
VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}
VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}
Value *VPInstruction::generateInstruction(VPTransformState &State,
                                          unsigned Part) {
  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(getDebugLoc());

  if (Instruction::isBinaryOp(getOpcode())) {
    if (Part != 0 && vputils::onlyFirstPartUsed(this))
      return State.get(this, 0);

    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }
  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    State.setDebugLocFrom(getDebugLoc());

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part);

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
          TrackingVH<Value> ReductionStartValue =
              RdxDesc.getRecurrenceStartValue();
          ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
                                         ReducedPartRdx, RdxPart);
        } else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if (State.VF.isVector() && !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}
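// Illustrative note (not part of the original source): for an unordered
// integer add reduction with UF = 2, the code above emits
//   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
// followed by a single vector reduction of %bin.rdx after the loop; ordered FP
// reductions instead take the last part directly, since each part was already
// folded in-loop, in order.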
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *GeneratedValue = generateInstruction(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generateInstruction must produce a value");
    State.set(this, GeneratedValue, Part);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}
void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif
void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto &CI = *cast<CallInst>(getUnderlyingInstr());
  assert(!isa<DbgInfoIntrinsic>(CI) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(
          VectorType::get(CI.getType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar value
      // from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
      assert(Variant != nullptr && "Can't create vector function.");
      VectorF = Variant;
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI.getOperandBundlesAsDefs(OpBundles);
    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(&CI);

    State.set(this, V, Part);
    State.addMetadata(V, &CI);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  auto *CI = cast<CallInst>(getUnderlyingInstr());
  if (CI->getType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CI->getCalledFunction()->getName() << "(";
  printOperands(O, SlotTracker);
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif
void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}
VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif
void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif
void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif
/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p Opcode is relevant for FP induction variable.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}
/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}
static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\" ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif
bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  return StartC && StartC->isZero() && StepC && StepC->isOne();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getCanonicalIV()->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);

  if (TruncResultTy)
    O << " (truncated to " << *TruncResultTy << ")";
}
#endif
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  if (BaseIVTy != Step->getType()) {
    // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
    // avoid separate truncate here.
    assert(Step->getType()->isIntegerTy() &&
           "Truncation requires an integer step");
    Step = State.Builder.CreateTrunc(Step, BaseIVTy);
  }

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
      State.set(this, Add, VPIteration(Part, Lane));
    }
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP ";
  O << (isPointerLoopInvariant() ? "Inv" : "Var");
  for (size_t I = 0; I < getNumOperands() - 1; ++I)
    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";

  O << " ";
  printAsOperand(O, SlotTracker);
  O << " = getelementptr";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif
void VPVectorPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
    Value *PartPtr = nullptr;
    // Use i32 for the gep index type when the value is constant,
    // or query DataLayout for a more suitable index type otherwise.
    const DataLayout &DL =
        Builder.GetInsertBlock()->getModule()->getDataLayout();
    Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
                        ? DL.getIndexType(IndexedTy->getPointerTo())
                        : Builder.getInt32Ty();
    Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
    bool InBounds = isInBounds();
    if (IsReverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(
          ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane =
          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
      PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
    } else {
      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
    }

    State.set(this, PartPtr, Part);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer ";
  if (IsReverse)
    O << "(reverse) ";

  printOperands(O, SlotTracker);
}
#endif
void VPBlendRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi and
  // are essentially undef are taken from In0.
  VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}
#endif
bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call";
    printFlags(O);
    O << "@" << CB->getCalledFunction()->getName() << "(";
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
    printFlags(O);
    printOperands(O, SlotTracker);
  }

  if (shouldPack())
    O << " (S->V)";
}
#endif
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}
void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";

  if (!isStore()) {
    getVPSingleValue()->printAsOperand(O, SlotTracker);
    O << " = ";
  }
  O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";

  printOperands(O, SlotTracker);
}
#endif
void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
  Value *Start = getStartValue()->getLiveInIRValue();
  PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, EntryPart, Part);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif
bool VPCanonicalIVPHIRecipe::isCanonical(
    InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step,
    Type *Ty) const {
  // The types must match and it must be an integer induction.
  if (Ty != getScalarType() || Kind != InductionDescriptor::IK_IntInduction)
    return false;
  // Start must match the start value of this canonical induction.
  if (Start != getStartValue())
    return false;

  // If the step is defined by a recipe, it is not a ConstantInt.
  if (Step->getDefiningRecipe())
    return false;

  ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
  return StepC && StepC->isOne();
}
bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
  return IsScalarAfterVectorization &&
         (!VF.isScalable() || vputils::onlyFirstLaneUsed(this));
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
#endif
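
// Expand the wrapped SCEV expression to concrete IR at the current insert
// point. The expansion is loop-invariant, so a single result is registered for
// every unroll part, and each SCEV may be expanded at most once per plan.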
void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "cannot be used in per-lane");
  const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  assert(!State.ExpandedSCEVs.contains(Expr) &&
         "Same SCEV expanded multiple times");
  State.ExpandedSCEVs[Expr] = Res;
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, Res, {Part, 0});
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  getVPSingleValue()->printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif
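
// Widen the canonical induction by broadcasting the scalar IV and adding a
// per-part step vector, yielding lanes IV + Part * VF + <0, 1, ..., VF - 1>.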
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), 0);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif
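
// Create the header phi for a first-order recurrence. The scalar start value
// is inserted into the last lane of a poison vector, so lane VF - 1 of
// "vector.recur" carries the value flowing in from the previous iteration.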
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
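
// Create the header phi(s) for a reduction. Ordered reductions use a single
// phi regardless of the unroll factor; otherwise one phi is created per part,
// seeded with the start value in part 0 and the identity value elsewhere.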
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
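
// Note: the phi is created with no incoming values here; they are expected to
// be filled in later, once all predecessor blocks of the VPlan-native loop
// have been generated.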
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
  // values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
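
// Create one active-lane-mask phi per unroll part, seeded from the mask
// computed in the preheader; the loop-carried mask update is added separately.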
// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif