//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
} // namespace llvm

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

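// Note: the mayWriteToMemory/mayReadFromMemory/mayHaveSideEffects queries
// below are conservative. Recipe kinds that are not explicitly listed fall
// through to the default and report true; kinds known to be memory- and
// side-effect-free return false, with asserts (in builds with assertions
// enabled) checking that the underlying ingredient, if any, agrees.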
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenMemoryInstructionSC: {
    return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenMemoryInstructionSC:
    assert(cast<VPWidenMemoryInstructionRecipe>(this)
                   ->getIngredient()
                   .mayHaveSideEffects() == mayWriteToMemory() &&
           "mayHaveSideEffects result for ingredient differs from this "
           "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

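// Add an incoming value to the phi in the scalar exit block: the value of the
// exiting VPValue from the last unrolled part (or from the first lane only,
// if it is uniform-after-vectorization), coming in via the middle block.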
void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (vputils::isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  assert(MiddleVPBB->getNumSuccessors() == 0 &&
         "the middle block must not have any successors");
  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   MiddleBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insert(InsertPos->getIterator(), this);
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  Parent = &BB;
  BB.getRecipeList().insert(I, this);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

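// Emit the IR for one unroll part of this VPInstruction. Opcodes that model
// control flow (BranchOnCond, BranchOnCount) only emit IR for part 0 and
// return nullptr for the remaining parts; execute() below skips recording a
// result for recipes without one.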
Value *VPInstruction::generateInstruction(VPTransformState &State,
                                          unsigned Part) {
  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(getDebugLoc());

  if (Instruction::isBinaryOp(getOpcode())) {
    if (Part != 0 && vputils::onlyFirstPartUsed(this))
      return State.get(this, 0);

    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
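  // Lane L of the generated mask is true iff (unsigned)(VIVElem0 + L) is less
  // than ScalarTC, per the semantics of the llvm.get.active.lane.mask
  // intrinsic created below.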
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))
    //
    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
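    // Computes max(TC - VF * UF, 0): Step below expands to the number of
    // elements processed per vector iteration, i.e. VF * UF.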
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    State.setDebugLocFrom(getDebugLoc());

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part);

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
          TrackingVH<Value> ReductionStartValue =
              RdxDesc.getRecurrenceStartValue();
          ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
                                         ReducedPartRdx, RdxPart);
        } else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if (State.VF.isVector() && !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *GeneratedValue = generateInstruction(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generateInstruction must produce a value");
    State.set(this, GeneratedValue, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

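// Widen a call by emitting, once per unroll part, either a call to an
// overloaded vector intrinsic or to an explicit vector function variant.
// Parameters that must stay scalar (intrinsic scalar operands, or scalar
// parameters in the variant's signature) are passed a single lane's value
// instead of a vector.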
void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto &CI = *cast<CallInst>(getUnderlyingInstr());
  assert(!isa<DbgInfoIntrinsic>(CI) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(
          VectorType::get(CI.getType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar value
      // from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
#ifndef NDEBUG
      assert(Variant != nullptr && "Can't create vector function.");
#endif
      VectorF = Variant;
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI.getOperandBundlesAsDefs(OpBundles);
    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(&CI);

    State.set(this, V, Part);
    State.addMetadata(V, &CI);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  auto *CI = cast<CallInst>(getUnderlyingInstr());
  if (CI->getType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CI->getCalledFunction()->getName() << "(";
  printOperands(O, SlotTracker);
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p Opcode is relevant for FP induction variable.
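/// For example, for an integer Val with VF = 4 lanes, StartIdx = 0 and step S,
/// the result is Val + <0 * S, 1 * S, 2 * S, 3 * S>.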
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\" ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  return StartC && StartC->isZero() && StepC && StepC->isOne();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << Indent << "= DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getCanonicalIV()->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);

  if (TruncResultTy)
    O << " (truncated to " << *TruncResultTy << ")";
}
#endif

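// Generate the scalar values of an induction for each part and lane: lane L
// of part P receives BaseIV + (P * VF + L) * Step. For scalable vectors the
// per-part values are additionally computed as whole vectors, since the
// number of lanes is not a compile-time constant.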
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  if (BaseIVTy != Step->getType()) {
    // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
    // avoid separate truncate here.
    assert(Step->getType()->isIntegerTy() &&
           "Truncation requires an integer step");
    Step = State.Builder.CreateTrunc(Step, BaseIVTy);
  }

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
      State.set(this, Add, VPIteration(Part, Lane));
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP ";
  O << (isPointerLoopInvariant() ? "Inv" : "Var");
  for (size_t I = 0; I < getNumOperands() - 1; ++I)
    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";

  O << " ";
  printAsOperand(O, SlotTracker);
  O << " = getelementptr";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

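// Compute the per-part pointer for a consecutive wide memory access. In the
// forward case the pointer for part P is advanced by P * RunTimeVF elements;
// in the reverse case it is moved by -P * RunTimeVF + (1 - RunTimeVF), so the
// wide access covers part P's element range in reverse. E.g. with a fixed
// VF = 4, part 1 of a reverse access starts at Ptr - 7.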
void VPVectorPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
    Value *PartPtr = nullptr;
    // Use i32 for the gep index type when the value is constant,
    // or query DataLayout for a more suitable index type otherwise.
    const DataLayout &DL =
        Builder.GetInsertBlock()->getModule()->getDataLayout();
    Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
                        ? DL.getIndexType(IndexedTy->getPointerTo())
                        : Builder.getInt32Ty();
    Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
    bool InBounds = isInBounds();
    if (IsReverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(
          ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane =
          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
      PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
    } else {
      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
    }

    State.set(this, PartPtr, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer ";
  if (IsReverse)
    O << "(reverse) ";

  printOperands(O, SlotTracker);
}
#endif

void VPBlendRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi and
  // are essentially undef are taken from In0.
  VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}

void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}
#endif

bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call";
    printFlags(O);
    O << "@" << CB->getCalledFunction()->getName() << "(";
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
    printFlags(O);
    printOperands(O, SlotTracker);
  }

  if (shouldPack())
    O << " (S->V)";
}
#endif

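// Emit a conditional branch for the current, single predicated instance. The
// condition is the instance's lane of the block-in mask (constant true when
// there is no mask), and it replaces the temporary unreachable terminator;
// the successors are hooked up later, once they are created.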
1474 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
1475 assert(State.Instance && "Branch on Mask works only on single instance.");
1477 unsigned Part = State.Instance->Part;
1478 unsigned Lane = State.Instance->Lane.getKnownLane();
1480 Value *ConditionBit = nullptr;
1481 VPValue *BlockInMask = getMask();
1482 if (BlockInMask) {
1483 ConditionBit = State.get(BlockInMask, Part);
1484 if (ConditionBit->getType()->isVectorTy())
1485 ConditionBit = State.Builder.CreateExtractElement(
1486 ConditionBit, State.Builder.getInt32(Lane));
1487 } else // Block in mask is all-one.
1488 ConditionBit = State.Builder.getTrue();
1490 // Replace the temporary unreachable terminator with a new conditional branch,
1491 // whose two destinations will be set later when they are created.
1492 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
1493 assert(isa<UnreachableInst>(CurrentTerminator) &&
1494 "Expected to replace unreachable terminator with conditional branch.");
1495 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
1496 CondBr->setSuccessor(0, nullptr);
1497 ReplaceInstWithInst(CurrentTerminator, CondBr);
1500 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
1501 assert(State.Instance && "Predicated instruction PHI works per instance.");
1502 Instruction *ScalarPredInst =
1503 cast<Instruction>(State.get(getOperand(0), *State.Instance));
1504 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
1505 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
1506 assert(PredicatingBB && "Predicated block has no single predecessor.");
1507 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
1508 "operand must be VPReplicateRecipe");
1510 // By current pack/unpack logic we need to generate only a single phi node: if
1511 // a vector value for the predicated instruction exists at this point it means
1512 // the instruction has vector users only, and a phi for the vector value is
1513 // needed. In this case the recipe of the predicated instruction is marked to
1514 // also do that packing, thereby "hoisting" the insert-element sequence.
1515 // Otherwise, a phi node for the scalar value is needed.
1516 unsigned Part = State.Instance->Part;
1517 if (State.hasVectorValue(getOperand(0), Part)) {
1518 Value *VectorValue = State.get(getOperand(0), Part);
1519 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
1520 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
1521 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
1522 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
1523 if (State.hasVectorValue(this, Part))
1524 State.reset(this, VPhi, Part);
1525 else
1526 State.set(this, VPhi, Part);
1527 // NOTE: Currently we need to update the value of the operand, so the next
1528 // predicated iteration inserts its generated value in the correct vector.
1529 State.reset(getOperand(0), VPhi, Part);
1530 } else {
1531 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
1532 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
1533 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
1534 PredicatingBB);
1535 Phi->addIncoming(ScalarPredInst, PredicatedBB);
1536 if (State.hasScalarValue(this, *State.Instance))
1537 State.reset(this, Phi, *State.Instance);
1538 else
1539 State.set(this, Phi, *State.Instance);
1540 // NOTE: Currently we need to update the value of the operand, so the next
1541 // predicated iteration inserts its generated value in the correct vector.
1542 State.reset(getOperand(0), Phi, *State.Instance);
1543 }
1544 }
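// Illustrative IR for the packed (vector) case above, with hypothetical
// names; %insert is the insertelement emitted in the predicated block by
// the replicate recipe that was marked to pack its results:
//
//   pred.continue:
//     %vphi = phi <4 x i32> [ %vec, %pred.check ], [ %insert, %pred.if ]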
1546 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1547 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1548 VPSlotTracker &SlotTracker) const {
1549 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
1550 printAsOperand(O, SlotTracker);
1551 O << " = ";
1552 printOperands(O, SlotTracker);
1553 }
1555 void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
1556 VPSlotTracker &SlotTracker) const {
1557 O << Indent << "WIDEN ";
1559 if (!isStore()) {
1560 getVPSingleValue()->printAsOperand(O, SlotTracker);
1561 O << " = ";
1562 }
1563 O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
1565 printOperands(O, SlotTracker);
1566 }
1567 #endif
1569 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
1570 Value *Start = getStartValue()->getLiveInIRValue();
1571 PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
1572 EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1574 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1575 EntryPart->addIncoming(Start, VectorPH);
1576 EntryPart->setDebugLoc(getDebugLoc());
1577 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1578 State.set(this, EntryPart, Part);
1579 }
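// Sketch of the phi produced above, assuming an i64 IV starting at 0; the
// backedge value %index.next is added later by the recipe that produces
// the IV increment:
//
//   vector.body:
//     %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]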
1581 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1582 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1583 VPSlotTracker &SlotTracker) const {
1584 O << Indent << "EMIT ";
1585 printAsOperand(O, SlotTracker);
1586 O << " = CANONICAL-INDUCTION ";
1587 printOperands(O, SlotTracker);
1588 }
1589 #endif
1591 bool VPCanonicalIVPHIRecipe::isCanonical(
1592 InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step,
1593 Type *Ty) const {
1594 // The types must match and it must be an integer induction.
1595 if (Ty != getScalarType() || Kind != InductionDescriptor::IK_IntInduction)
1596 return false;
1597 // Start must match the start value of this canonical induction.
1598 if (Start != getStartValue())
1599 return false;
1601 // If the step is defined by a recipe, it is not a ConstantInt.
1602 if (Step->getDefiningRecipe())
1603 return false;
1605 ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
1606 return StepC && StepC->isOne();
1607 }
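// Thus, e.g., an IK_IntInduction described by {0,+,1} whose start value is
// this phi's start VPValue passes the check above; an induction with step 2,
// or with a step defined by another recipe, does not.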
1609 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
1610 return IsScalarAfterVectorization &&
1611 (!VF.isScalable() || vputils::onlyFirstLaneUsed(this));
1612 }
1614 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1615 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1616 VPSlotTracker &SlotTracker) const {
1617 O << Indent << "EMIT ";
1618 printAsOperand(O, SlotTracker);
1619 O << " = WIDEN-POINTER-INDUCTION ";
1620 getStartValue()->printAsOperand(O, SlotTracker);
1621 O << ", " << *IndDesc.getStep();
1623 #endif
1625 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
1626 assert(!State.Instance && "cannot be used in per-lane");
1627 const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
1628 SCEVExpander Exp(SE, DL, "induction");
1630 Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
1631 &*State.Builder.GetInsertPoint());
1632 assert(!State.ExpandedSCEVs.contains(Expr) &&
1633 "Same SCEV expanded multiple times");
1634 State.ExpandedSCEVs[Expr] = Res;
1635 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1636 State.set(this, Res, {Part, 0});
1637 }
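// E.g. an expression such as (4 * %n) is materialized here as scalar IR at
// the current insert point (roughly a shl or mul of %n; the exact sequence
// is chosen by SCEVExpander), and that single result is then reused for
// every unroll part.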
1639 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1640 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
1641 VPSlotTracker &SlotTracker) const {
1642 O << Indent << "EMIT ";
1643 getVPSingleValue()->printAsOperand(O, SlotTracker);
1644 O << " = EXPAND SCEV " << *Expr;
1646 #endif
1648 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
1649 Value *CanonicalIV = State.get(getOperand(0), 0);
1650 Type *STy = CanonicalIV->getType();
1651 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
1652 ElementCount VF = State.VF;
1653 Value *VStart = VF.isScalar()
1654 ? CanonicalIV
1655 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
1656 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1657 Value *VStep = createStepForVF(Builder, STy, VF, Part);
1658 if (VF.isVector()) {
1659 VStep = Builder.CreateVectorSplat(VF, VStep);
1660 VStep =
1661 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
1662 }
1663 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
1664 State.set(this, CanonicalVectorIV, Part);
1665 }
1666 }
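// Illustrative result of the loop above for a fixed VF=4 with UF=2 and an
// i64 IV, after the splat-plus-stepvector folds to constants:
//
//   %broadcast = ...splat of the canonical IV...
//   %vec.iv    = add <4 x i64> %broadcast, <i64 0, i64 1, i64 2, i64 3>
//   %vec.iv1   = add <4 x i64> %broadcast, <i64 4, i64 5, i64 6, i64 7>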
1668 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1669 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
1670 VPSlotTracker &SlotTracker) const {
1671 O << Indent << "EMIT ";
1672 printAsOperand(O, SlotTracker);
1673 O << " = WIDEN-CANONICAL-INDUCTION ";
1674 printOperands(O, SlotTracker);
1675 }
1676 #endif
1678 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
1679 auto &Builder = State.Builder;
1680 // Create a vector from the initial value.
1681 auto *VectorInit = getStartValue()->getLiveInIRValue();
1683 Type *VecTy = State.VF.isScalar()
1684 ? VectorInit->getType()
1685 : VectorType::get(VectorInit->getType(), State.VF);
1687 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1688 if (State.VF.isVector()) {
1689 auto *IdxTy = Builder.getInt32Ty();
1690 auto *One = ConstantInt::get(IdxTy, 1);
1691 IRBuilder<>::InsertPointGuard Guard(Builder);
1692 Builder.SetInsertPoint(VectorPH->getTerminator());
1693 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
1694 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
1695 VectorInit = Builder.CreateInsertElement(
1696 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
1697 }
1699 // Create a phi node for the new recurrence.
1700 PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
1701 EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1702 EntryPart->addIncoming(VectorInit, VectorPH);
1703 State.set(this, EntryPart, 0);
1704 }
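// Illustrative IR for the code above at VF=4 with an i32 recurrence (%pre
// is a hypothetical preheader value; the backedge value of the phi is
// added later, when the recurrence itself is fixed up):
//
//   vector.ph:
//     %vector.recur.init = insertelement <4 x i32> poison, i32 %pre, i32 3
//   vector.body:
//     %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], ...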
1706 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1707 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
1708 VPSlotTracker &SlotTracker) const {
1709 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
1710 printAsOperand(O, SlotTracker);
1711 O << " = phi ";
1712 printOperands(O, SlotTracker);
1713 }
1714 #endif
1716 void VPReductionPHIRecipe::execute(VPTransformState &State) {
1717 auto &Builder = State.Builder;
1719 // Reductions do not have to start at zero. They can start with
1720 // any loop-invariant value.
1721 VPValue *StartVPV = getStartValue();
1722 Value *StartV = StartVPV->getLiveInIRValue();
1724 // In order to support recurrences we need to be able to vectorize Phi nodes.
1725 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
1726 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
1727 // this value when we vectorize all of the instructions that use the PHI.
1728 bool ScalarPHI = State.VF.isScalar() || IsInLoop;
1729 Type *VecTy = ScalarPHI ? StartV->getType()
1730 : VectorType::get(StartV->getType(), State.VF);
1732 BasicBlock *HeaderBB = State.CFG.PrevBB;
1733 assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
1734 "recipe must be in the vector loop header");
1735 unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
1736 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
1737 Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
1738 EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
1739 State.set(this, EntryPart, Part);
1742 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1744 Value *Iden = nullptr;
1745 RecurKind RK = RdxDesc.getRecurrenceKind();
1746 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
1747 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
1748 // MinMax and AnyOf reductions have the start value as their identity.
1749 if (ScalarPHI) {
1750 Iden = StartV;
1751 } else {
1752 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
1753 Builder.SetInsertPoint(VectorPH->getTerminator());
1754 StartV = Iden =
1755 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
1756 }
1757 } else {
1758 Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
1759 RdxDesc.getFastMathFlags());
1761 if (!ScalarPHI) {
1762 Iden = Builder.CreateVectorSplat(State.VF, Iden);
1763 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
1764 Builder.SetInsertPoint(VectorPH->getTerminator());
1765 Constant *Zero = Builder.getInt32(0);
1766 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
1767 }
1768 }
1770 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
1771 Value *EntryPart = State.get(this, Part);
1772 // Make sure to add the reduction start value only to the
1773 // first unroll part.
1774 Value *StartVal = (Part == 0) ? StartV : Iden;
1775 cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
1776 }
1777 }
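// E.g. for an unordered integer add reduction with VF=4 and UF=2, the start
// value ends up in lane 0 of part 0 and every other lane/part gets the
// identity (hypothetical names, illustrative only):
//
//   %start.ins = insertelement <4 x i32> zeroinitializer, i32 %start, i32 0
//   %vec.phi   = phi <4 x i32> [ %start.ins, %vector.ph ], ...
//   %vec.phi1  = phi <4 x i32> [ zeroinitializer, %vector.ph ], ...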
1779 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1780 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1781 VPSlotTracker &SlotTracker) const {
1782 O << Indent << "WIDEN-REDUCTION-PHI ";
1784 printAsOperand(O, SlotTracker);
1785 O << " = phi ";
1786 printOperands(O, SlotTracker);
1787 }
1788 #endif
1790 void VPWidenPHIRecipe::execute(VPTransformState &State) {
1791 assert(EnableVPlanNativePath &&
1792 "Non-native vplans are not expected to have VPWidenPHIRecipes.");
1794 Value *Op0 = State.get(getOperand(0), 0);
1795 Type *VecTy = Op0->getType();
1796 Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
1797 State.set(this, VecPhi, 0);
1798 }
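// Note that only an empty phi of the widened type is created above; its
// incoming blocks and values are hooked up later, once all predecessor
// blocks have been generated by the VPlan-native path.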
1800 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1801 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1802 VPSlotTracker &SlotTracker) const {
1803 O << Indent << "WIDEN-PHI ";
1805 auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
1806 // Unless all incoming values are modeled in VPlan print the original PHI
1807 // directly.
1808 // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
1809 // values as VPValues.
1810 if (getNumOperands() != OriginalPhi->getNumOperands()) {
1811 O << VPlanIngredient(OriginalPhi);
1812 return;
1813 }
1815 printAsOperand(O, SlotTracker);
1816 O << " = phi ";
1817 printOperands(O, SlotTracker);
1818 }
1819 #endif
1821 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
1822 // remove VPActiveLaneMaskPHIRecipe.
1823 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
1824 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1825 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1826 Value *StartMask = State.get(getOperand(0), Part);
1827 PHINode *EntryPart =
1828 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
1829 EntryPart->addIncoming(StartMask, VectorPH);
1830 EntryPart->setDebugLoc(getDebugLoc());
1831 State.set(this, EntryPart, Part);
1832 }
1833 }
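// Illustrative phi for one unroll part at VF=4 (hypothetical names; the
// backedge value is added later, when the next iteration's mask is
// computed):
//
//   %active.lane.mask = phi <4 x i1> [ %entry.mask, %vector.ph ],
//                                    [ %mask.next, %vector.body ]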
1835 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1836 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1837 VPSlotTracker &SlotTracker) const {
1838 O << Indent << "ACTIVE-LANE-MASK-PHI ";
1840 printAsOperand(O, SlotTracker);
1841 O << " = phi ";
1842 printOperands(O, SlotTracker);
1843 }
1844 #endif