//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;
extern cl::opt<bool> EnableVPlanNativePath;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenMemoryInstructionSC: {
    return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}
bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}
bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenMemoryInstructionSC:
    assert(cast<VPWidenMemoryInstructionRecipe>(this)
               ->getIngredient()
               .mayHaveSideEffects() == mayWriteToMemory() &&
           "mayHaveSideEffects result for ingredient differs from this "
           "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}
void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (vputils::isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  assert(MiddleVPBB->getNumSuccessors() == 0 &&
         "the middle block must not have any successors");
  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   MiddleBB);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif
void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insert(InsertPos->getIterator(), this);
}
void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  Parent = &BB;
  BB.getRecipeList().insert(I, this);
}
void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
}
void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}
iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}
void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}
void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}
FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}
VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}
VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}
Value *VPInstruction::generateInstruction(VPTransformState &State,
                                          unsigned Part) {
  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(getDebugLoc());

  if (Instruction::isBinaryOp(getOpcode())) {
    if (Part != 0 && vputils::onlyFirstPartUsed(this))
      return State.get(this, 0);

    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }
  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    State.setDebugLocFrom(getDebugLoc());

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part);

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
          TrackingVH<Value> ReductionStartValue =
              RdxDesc.getRecurrenceStartValue();
          ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
                                         ReducedPartRdx, RdxPart);
        } else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if (State.VF.isVector() && !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}
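// Illustrative note (not part of the original source): for an unordered
// integer add reduction with UF = 2, the code above emits
//   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
// followed by a single vector reduction of %bin.rdx after the loop; ordered FP
// reductions instead take the last part directly, since each part was already
// folded in-loop, in order.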
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *GeneratedValue = generateInstruction(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generateInstruction must produce a value");
    State.set(this, GeneratedValue, Part);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}
void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif
void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto &CI = *cast<CallInst>(getUnderlyingInstr());
  assert(!isa<DbgInfoIntrinsic>(CI) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(
          VectorType::get(CI.getType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar value
      // from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
      assert(Variant != nullptr && "Can't create vector function.");
      VectorF = Variant;
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI.getOperandBundlesAsDefs(OpBundles);
    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(&CI);

    State.set(this, V, Part);
    State.addMetadata(V, &CI);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  auto *CI = cast<CallInst>(getUnderlyingInstr());
  if (CI->getType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CI->getCalledFunction()->getName() << "(";
  printOperands(O, SlotTracker);
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif
void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}
VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif
void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif
void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif
/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p Opcode is relevant for FP induction variable.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}
/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}
static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\" ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif
bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  return StartC && StartC->isZero() && StepC && StepC->isOne();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getCanonicalIV()->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);

  if (TruncResultTy)
    O << " (truncated to " << *TruncResultTy << ")";
}
#endif
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  if (BaseIVTy != Step->getType()) {
    // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
    // avoid separate truncate here.
    assert(Step->getType()->isIntegerTy() &&
           "Truncation requires an integer step");
    Step = State.Builder.CreateTrunc(Step, BaseIVTy);
  }

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
      State.set(this, Add, VPIteration(Part, Lane));
    }
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP ";
  O << (isPointerLoopInvariant() ? "Inv" : "Var");
  for (size_t I = 0; I < getNumOperands() - 1; ++I)
    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";

  O << " ";
  printAsOperand(O, SlotTracker);
  O << " = getelementptr";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif
void VPVectorPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
    Value *PartPtr = nullptr;
    // Use i32 for the gep index type when the value is constant,
    // or query DataLayout for a more suitable index type otherwise.
    const DataLayout &DL =
        Builder.GetInsertBlock()->getModule()->getDataLayout();
    Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
                        ? DL.getIndexType(IndexedTy->getPointerTo())
                        : Builder.getInt32Ty();
    Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
    bool InBounds = isInBounds();
    if (IsReverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(
          ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane =
          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
      PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
    } else {
      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
    }

    State.set(this, PartPtr, Part);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer ";
  if (IsReverse)
    O << "(reverse) ";

  printOperands(O, SlotTracker);
}
#endif
void VPBlendRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi and
  // are essentially undef are taken from In0.
  VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}
#endif
bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call";
    printFlags(O);
    O << "@" << CB->getCalledFunction()->getName() << "(";
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
    printFlags(O);
    printOperands(O, SlotTracker);
  }

  if (shouldPack())
    O << " (S->V)";
}
#endif
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}
void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";

  if (!isStore()) {
    getVPSingleValue()->printAsOperand(O, SlotTracker);
    O << " = ";
  }
  O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";

  printOperands(O, SlotTracker);
}
#endif
void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
  Value *Start = getStartValue()->getLiveInIRValue();
  PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, EntryPart, Part);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif
bool VPCanonicalIVPHIRecipe::isCanonical(
    InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step,
    Type *Ty) const {
  // The types must match and it must be an integer induction.
  if (Ty != getScalarType() || Kind != InductionDescriptor::IK_IntInduction)
    return false;
  // Start must match the start value of this canonical induction.
  if (Start != getStartValue())
    return false;

  // If the step is defined by a recipe, it is not a ConstantInt.
  if (Step->getDefiningRecipe())
    return false;

  ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
  return StepC && StepC->isOne();
}
bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
  return IsScalarAfterVectorization &&
         (!VF.isScalable() || vputils::onlyFirstLaneUsed(this));
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
#endif
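
// Expand the wrapped SCEV expression to concrete IR at the current insert
// point. The expansion is loop-invariant, so a single result is registered for
// every unroll part, and each SCEV may be expanded at most once per plan.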
void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "cannot be used in per-lane");
  const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  assert(!State.ExpandedSCEVs.contains(Expr) &&
         "Same SCEV expanded multiple times");
  State.ExpandedSCEVs[Expr] = Res;
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, Res, {Part, 0});
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  getVPSingleValue()->printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif
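
// Widen the canonical induction by broadcasting the scalar IV and adding a
// per-part step vector, yielding lanes IV + Part * VF + <0, 1, ..., VF - 1>.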
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), 0);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif
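
// Create the header phi for a first-order recurrence. The scalar start value
// is inserted into the last lane of a poison vector, so lane VF - 1 of
// "vector.recur" carries the value flowing in from the previous iteration.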
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
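
// Create the header phi(s) for a reduction. Ordered reductions use a single
// phi regardless of the unroll factor; otherwise one phi is created per part,
// seeded with the start value in part 0 and the identity value elsewhere.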
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
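
// Note: the phi is created with no incoming values here; they are expected to
// be filled in later, once all predecessor blocks of the VPlan-native loop
// have been generated.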
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
  // values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
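
// Create one active-lane-mask phi per unroll part, seeded from the mask
// computed in the preheader; the loop-carried mask update is added separately.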
// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif