1 //===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This is the LLVM vectorization plan. It represents a candidate for
11 /// vectorization, allowing to plan and optimize how to vectorize a given loop
12 /// before generating LLVM-IR.
13 /// The vectorizer uses vectorization plans to estimate the costs of potential
14 /// candidates and if profitable to execute the desired plan, generating vector
17 //===----------------------------------------------------------------------===//
20 #include "LoopVectorizationPlanner.h"
22 #include "VPlanPatternMatch.h"
23 #include "VPlanTransforms.h"
24 #include "VPlanUtils.h"
25 #include "llvm/ADT/PostOrderIterator.h"
26 #include "llvm/ADT/STLExtras.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/StringExtras.h"
29 #include "llvm/ADT/Twine.h"
30 #include "llvm/Analysis/DomTreeUpdater.h"
31 #include "llvm/Analysis/LoopInfo.h"
32 #include "llvm/IR/BasicBlock.h"
33 #include "llvm/IR/CFG.h"
34 #include "llvm/IR/IRBuilder.h"
35 #include "llvm/IR/Instruction.h"
36 #include "llvm/IR/Instructions.h"
37 #include "llvm/IR/Type.h"
38 #include "llvm/IR/Value.h"
39 #include "llvm/Support/Casting.h"
40 #include "llvm/Support/CommandLine.h"
41 #include "llvm/Support/Debug.h"
42 #include "llvm/Support/GraphWriter.h"
43 #include "llvm/Support/raw_ostream.h"
44 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
45 #include "llvm/Transforms/Utils/LoopVersioning.h"
50 using namespace llvm::VPlanPatternMatch
;
53 extern cl::opt
<bool> EnableVPlanNativePath
;
55 extern cl::opt
<unsigned> ForceTargetInstructionCost
;
57 static cl::opt
<bool> PrintVPlansInDotFormat(
58 "vplan-print-in-dot-format", cl::Hidden
,
59 cl::desc("Use dot format instead of plain text when dumping VPlans"));
61 #define DEBUG_TYPE "loop-vectorize"
63 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
64 raw_ostream
&llvm::operator<<(raw_ostream
&OS
, const VPValue
&V
) {
65 const VPInstruction
*Instr
= dyn_cast
<VPInstruction
>(&V
);
66 VPSlotTracker
SlotTracker(
67 (Instr
&& Instr
->getParent()) ? Instr
->getParent()->getPlan() : nullptr);
68 V
.print(OS
, SlotTracker
);
73 Value
*VPLane::getAsRuntimeExpr(IRBuilderBase
&Builder
,
74 const ElementCount
&VF
) const {
76 case VPLane::Kind::ScalableLast
:
77 // Lane = RuntimeVF - VF.getKnownMinValue() + Lane
78 return Builder
.CreateSub(getRuntimeVF(Builder
, Builder
.getInt32Ty(), VF
),
79 Builder
.getInt32(VF
.getKnownMinValue() - Lane
));
80 case VPLane::Kind::First
:
81 return Builder
.getInt32(Lane
);
83 llvm_unreachable("Unknown lane kind");
86 VPValue::VPValue(const unsigned char SC
, Value
*UV
, VPDef
*Def
)
87 : SubclassID(SC
), UnderlyingVal(UV
), Def(Def
) {
89 Def
->addDefinedValue(this);
93 assert(Users
.empty() && "trying to delete a VPValue with remaining users");
95 Def
->removeDefinedValue(this);
98 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
99 void VPValue::print(raw_ostream
&OS
, VPSlotTracker
&SlotTracker
) const {
100 if (const VPRecipeBase
*R
= dyn_cast_or_null
<VPRecipeBase
>(Def
))
101 R
->print(OS
, "", SlotTracker
);
103 printAsOperand(OS
, SlotTracker
);
106 void VPValue::dump() const {
107 const VPRecipeBase
*Instr
= dyn_cast_or_null
<VPRecipeBase
>(this->Def
);
108 VPSlotTracker
SlotTracker(
109 (Instr
&& Instr
->getParent()) ? Instr
->getParent()->getPlan() : nullptr);
110 print(dbgs(), SlotTracker
);
114 void VPDef::dump() const {
115 const VPRecipeBase
*Instr
= dyn_cast_or_null
<VPRecipeBase
>(this);
116 VPSlotTracker
SlotTracker(
117 (Instr
&& Instr
->getParent()) ? Instr
->getParent()->getPlan() : nullptr);
118 print(dbgs(), "", SlotTracker
);
123 VPRecipeBase
*VPValue::getDefiningRecipe() {
124 return cast_or_null
<VPRecipeBase
>(Def
);
127 const VPRecipeBase
*VPValue::getDefiningRecipe() const {
128 return cast_or_null
<VPRecipeBase
>(Def
);
131 // Get the top-most entry block of \p Start. This is the entry block of the
132 // containing VPlan. This function is templated to support both const and non-const blocks
133 template <typename T
> static T
*getPlanEntry(T
*Start
) {
136 while ((Next
= Next
->getParent()))
139 SmallSetVector
<T
*, 8> WorkList
;
140 WorkList
.insert(Current
);
142 for (unsigned i
= 0; i
< WorkList
.size(); i
++) {
143 T
*Current
= WorkList
[i
];
144 if (Current
->getNumPredecessors() == 0)
146 auto &Predecessors
= Current
->getPredecessors();
147 WorkList
.insert(Predecessors
.begin(), Predecessors
.end());
150 llvm_unreachable("VPlan without any entry node without predecessors");
153 VPlan
*VPBlockBase::getPlan() { return getPlanEntry(this)->Plan
; }
155 const VPlan
*VPBlockBase::getPlan() const { return getPlanEntry(this)->Plan
; }
157 /// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
158 const VPBasicBlock
*VPBlockBase::getEntryBasicBlock() const {
159 const VPBlockBase
*Block
= this;
160 while (const VPRegionBlock
*Region
= dyn_cast
<VPRegionBlock
>(Block
))
161 Block
= Region
->getEntry();
162 return cast
<VPBasicBlock
>(Block
);
165 VPBasicBlock
*VPBlockBase::getEntryBasicBlock() {
166 VPBlockBase
*Block
= this;
167 while (VPRegionBlock
*Region
= dyn_cast
<VPRegionBlock
>(Block
))
168 Block
= Region
->getEntry();
169 return cast
<VPBasicBlock
>(Block
);
172 void VPBlockBase::setPlan(VPlan
*ParentPlan
) {
173 assert(ParentPlan
->getEntry() == this && "Can only set plan on its entry.");
177 /// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
178 const VPBasicBlock
*VPBlockBase::getExitingBasicBlock() const {
179 const VPBlockBase
*Block
= this;
180 while (const VPRegionBlock
*Region
= dyn_cast
<VPRegionBlock
>(Block
))
181 Block
= Region
->getExiting();
182 return cast
<VPBasicBlock
>(Block
);
185 VPBasicBlock
*VPBlockBase::getExitingBasicBlock() {
186 VPBlockBase
*Block
= this;
187 while (VPRegionBlock
*Region
= dyn_cast
<VPRegionBlock
>(Block
))
188 Block
= Region
->getExiting();
189 return cast
<VPBasicBlock
>(Block
);
192 VPBlockBase
*VPBlockBase::getEnclosingBlockWithSuccessors() {
193 if (!Successors
.empty() || !Parent
)
195 assert(Parent
->getExiting() == this &&
196 "Block w/o successors not the exiting block of its parent.");
197 return Parent
->getEnclosingBlockWithSuccessors();
200 VPBlockBase
*VPBlockBase::getEnclosingBlockWithPredecessors() {
201 if (!Predecessors
.empty() || !Parent
)
203 assert(Parent
->getEntry() == this &&
204 "Block w/o predecessors not the entry of its parent.");
205 return Parent
->getEnclosingBlockWithPredecessors();
208 void VPBlockBase::deleteCFG(VPBlockBase
*Entry
) {
209 for (VPBlockBase
*Block
: to_vector(vp_depth_first_shallow(Entry
)))
213 VPBasicBlock::iterator
VPBasicBlock::getFirstNonPhi() {
214 iterator It
= begin();
215 while (It
!= end() && It
->isPhi())
220 VPTransformState::VPTransformState(const TargetTransformInfo
*TTI
,
221 ElementCount VF
, unsigned UF
, LoopInfo
*LI
,
222 DominatorTree
*DT
, IRBuilderBase
&Builder
,
223 InnerLoopVectorizer
*ILV
, VPlan
*Plan
,
225 : TTI(TTI
), VF(VF
), CFG(DT
), LI(LI
), Builder(Builder
), ILV(ILV
), Plan(Plan
),
226 LVer(nullptr), TypeAnalysis(CanonicalIVTy
) {}
228 Value
*VPTransformState::get(VPValue
*Def
, const VPLane
&Lane
) {
230 return Def
->getLiveInIRValue();
232 if (hasScalarValue(Def
, Lane
))
233 return Data
.VPV2Scalars
[Def
][Lane
.mapToCacheIndex(VF
)];
235 if (!Lane
.isFirstLane() && vputils::isUniformAfterVectorization(Def
) &&
236 hasScalarValue(Def
, VPLane::getFirstLane())) {
237 return Data
.VPV2Scalars
[Def
][0];
240 assert(hasVectorValue(Def
));
241 auto *VecPart
= Data
.VPV2Vector
[Def
];
242 if (!VecPart
->getType()->isVectorTy()) {
243 assert(Lane
.isFirstLane() && "cannot get lane > 0 for scalar");
246 // TODO: Cache created scalar values.
247 Value
*LaneV
= Lane
.getAsRuntimeExpr(Builder
, VF
);
248 auto *Extract
= Builder
.CreateExtractElement(VecPart
, LaneV
);
249 // set(Def, Extract, Instance);
253 Value
*VPTransformState::get(VPValue
*Def
, bool NeedsScalar
) {
255 assert((VF
.isScalar() || Def
->isLiveIn() || hasVectorValue(Def
) ||
256 !vputils::onlyFirstLaneUsed(Def
) ||
257 (hasScalarValue(Def
, VPLane(0)) &&
258 Data
.VPV2Scalars
[Def
].size() == 1)) &&
259 "Trying to access a single scalar per part but has multiple scalars "
261 return get(Def
, VPLane(0));
264 // If Values have been set for this Def return the one relevant for \p Part.
265 if (hasVectorValue(Def
))
266 return Data
.VPV2Vector
[Def
];
268 auto GetBroadcastInstrs
= [this, Def
](Value
*V
) {
269 bool SafeToHoist
= Def
->isDefinedOutsideLoopRegions();
272 // Place the code for broadcasting invariant variables in the new preheader.
273 IRBuilder
<>::InsertPointGuard
Guard(Builder
);
275 BasicBlock
*LoopVectorPreHeader
=
276 CFG
.VPBB2IRBB
[Plan
->getVectorPreheader()];
277 if (LoopVectorPreHeader
)
278 Builder
.SetInsertPoint(LoopVectorPreHeader
->getTerminator());
281 // Place the code for broadcasting invariant variables in the new preheader.
282 // Broadcast the scalar into all locations in the vector.
283 Value
*Shuf
= Builder
.CreateVectorSplat(VF
, V
, "broadcast");
288 if (!hasScalarValue(Def
, {0})) {
289 assert(Def
->isLiveIn() && "expected a live-in");
290 Value
*IRV
= Def
->getLiveInIRValue();
291 Value
*B
= GetBroadcastInstrs(IRV
);
296 Value
*ScalarValue
= get(Def
, VPLane(0));
297 // If we aren't vectorizing, we can just copy the scalar map values over
298 // to the vector map.
300 set(Def
, ScalarValue
);
304 bool IsUniform
= vputils::isUniformAfterVectorization(Def
);
306 VPLane
LastLane(IsUniform
? 0 : VF
.getKnownMinValue() - 1);
307 // Check if there is a scalar value for the selected lane.
308 if (!hasScalarValue(Def
, LastLane
)) {
309 // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
310 // VPExpandSCEVRecipes can also be uniform.
311 assert((isa
<VPWidenIntOrFpInductionRecipe
, VPScalarIVStepsRecipe
,
312 VPExpandSCEVRecipe
>(Def
->getDefiningRecipe())) &&
313 "unexpected recipe found to be invariant");
318 auto *LastInst
= cast
<Instruction
>(get(Def
, LastLane
));
319 // Set the insert point after the last scalarized instruction or after the
320 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
321 // will directly follow the scalar definitions.
322 auto OldIP
= Builder
.saveIP();
324 isa
<PHINode
>(LastInst
)
325 ? BasicBlock::iterator(LastInst
->getParent()->getFirstNonPHI())
326 : std::next(BasicBlock::iterator(LastInst
));
327 Builder
.SetInsertPoint(&*NewIP
);
329 // However, if we are vectorizing, we need to construct the vector values.
330 // If the value is known to be uniform after vectorization, we can just
331 // broadcast the scalar value corresponding to lane zero. Otherwise, we
332 // construct the vector values using insertelement instructions. Since the
333 // resulting vectors are stored in State, we will only generate the
334 // insertelements once.
335 Value
*VectorValue
= nullptr;
337 VectorValue
= GetBroadcastInstrs(ScalarValue
);
338 set(Def
, VectorValue
);
340 // Initialize packing with insertelements to start from undef.
341 assert(!VF
.isScalable() && "VF is assumed to be non scalable.");
342 Value
*Undef
= PoisonValue::get(VectorType::get(LastInst
->getType(), VF
));
344 for (unsigned Lane
= 0; Lane
< VF
.getKnownMinValue(); ++Lane
)
345 packScalarIntoVectorValue(Def
, Lane
);
346 VectorValue
= get(Def
);
348 Builder
.restoreIP(OldIP
);
352 BasicBlock
*VPTransformState::CFGState::getPreheaderBBFor(VPRecipeBase
*R
) {
353 VPRegionBlock
*LoopRegion
= R
->getParent()->getEnclosingLoopRegion();
354 return VPBB2IRBB
[LoopRegion
->getPreheaderVPBB()];
357 void VPTransformState::addNewMetadata(Instruction
*To
,
358 const Instruction
*Orig
) {
359 // If the loop was versioned with memchecks, add the corresponding no-alias
361 if (LVer
&& isa
<LoadInst
, StoreInst
>(Orig
))
362 LVer
->annotateInstWithNoAlias(To
, Orig
);
365 void VPTransformState::addMetadata(Value
*To
, Instruction
*From
) {
366 // No source instruction to transfer metadata from?
370 if (Instruction
*ToI
= dyn_cast
<Instruction
>(To
)) {
371 propagateMetadata(ToI
, From
);
372 addNewMetadata(ToI
, From
);
376 void VPTransformState::setDebugLocFrom(DebugLoc DL
) {
377 const DILocation
*DIL
= DL
;
378 // When a FSDiscriminator is enabled, we don't need to add the multiply
379 // factors to the discriminators.
381 Builder
.GetInsertBlock()
383 ->shouldEmitDebugInfoForProfiling() &&
384 !EnableFSDiscriminator
) {
385 // FIXME: For scalable vectors, assume vscale=1.
386 unsigned UF
= Plan
->getUF();
388 DIL
->cloneByMultiplyingDuplicationFactor(UF
* VF
.getKnownMinValue());
390 Builder
.SetCurrentDebugLocation(*NewDIL
);
392 LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
393 << DIL
->getFilename() << " Line: " << DIL
->getLine());
395 Builder
.SetCurrentDebugLocation(DIL
);
398 void VPTransformState::packScalarIntoVectorValue(VPValue
*Def
,
399 const VPLane
&Lane
) {
400 Value
*ScalarInst
= get(Def
, Lane
);
401 Value
*VectorValue
= get(Def
);
402 VectorValue
= Builder
.CreateInsertElement(VectorValue
, ScalarInst
,
403 Lane
.getAsRuntimeExpr(Builder
, VF
));
404 set(Def
, VectorValue
);
408 VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState
&CFG
) {
409 // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
410 // Pred stands for Predessor. Prev stands for Previous - last visited/created.
411 BasicBlock
*PrevBB
= CFG
.PrevBB
;
412 BasicBlock
*NewBB
= BasicBlock::Create(PrevBB
->getContext(), getName(),
413 PrevBB
->getParent(), CFG
.ExitBB
);
414 LLVM_DEBUG(dbgs() << "LV: created " << NewBB
->getName() << '\n');
419 void VPBasicBlock::connectToPredecessors(VPTransformState::CFGState
&CFG
) {
420 BasicBlock
*NewBB
= CFG
.VPBB2IRBB
[this];
421 // Hook up the new basic block to its predecessors.
422 for (VPBlockBase
*PredVPBlock
: getHierarchicalPredecessors()) {
423 VPBasicBlock
*PredVPBB
= PredVPBlock
->getExitingBasicBlock();
424 auto &PredVPSuccessors
= PredVPBB
->getHierarchicalSuccessors();
425 BasicBlock
*PredBB
= CFG
.VPBB2IRBB
[PredVPBB
];
427 assert(PredBB
&& "Predecessor basic-block not found building successor.");
428 auto *PredBBTerminator
= PredBB
->getTerminator();
429 LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB
->getName() << '\n');
431 auto *TermBr
= dyn_cast
<BranchInst
>(PredBBTerminator
);
432 if (isa
<UnreachableInst
>(PredBBTerminator
)) {
433 assert(PredVPSuccessors
.size() == 1 &&
434 "Predecessor ending w/o branch must have single successor.");
435 DebugLoc DL
= PredBBTerminator
->getDebugLoc();
436 PredBBTerminator
->eraseFromParent();
437 auto *Br
= BranchInst::Create(NewBB
, PredBB
);
439 } else if (TermBr
&& !TermBr
->isConditional()) {
440 TermBr
->setSuccessor(0, NewBB
);
442 // Set each forward successor here when it is created, excluding
443 // backedges. A backward successor is set when the branch is created.
444 unsigned idx
= PredVPSuccessors
.front() == this ? 0 : 1;
446 (!TermBr
->getSuccessor(idx
) ||
447 (isa
<VPIRBasicBlock
>(this) && TermBr
->getSuccessor(idx
) == NewBB
)) &&
448 "Trying to reset an existing successor block.");
449 TermBr
->setSuccessor(idx
, NewBB
);
451 CFG
.DTU
.applyUpdates({{DominatorTree::Insert
, PredBB
, NewBB
}});
455 void VPIRBasicBlock::execute(VPTransformState
*State
) {
456 assert(getHierarchicalSuccessors().size() <= 2 &&
457 "VPIRBasicBlock can have at most two successors at the moment!");
458 State
->Builder
.SetInsertPoint(IRBB
->getTerminator());
459 State
->CFG
.PrevBB
= IRBB
;
460 State
->CFG
.VPBB2IRBB
[this] = IRBB
;
461 executeRecipes(State
, IRBB
);
462 // Create a branch instruction to terminate IRBB if one was not created yet
464 if (getSingleSuccessor() && isa
<UnreachableInst
>(IRBB
->getTerminator())) {
465 auto *Br
= State
->Builder
.CreateBr(IRBB
);
466 Br
->setOperand(0, nullptr);
467 IRBB
->getTerminator()->eraseFromParent();
470 (getNumSuccessors() == 0 || isa
<BranchInst
>(IRBB
->getTerminator())) &&
471 "other blocks must be terminated by a branch");
474 connectToPredecessors(State
->CFG
);
477 void VPBasicBlock::execute(VPTransformState
*State
) {
478 bool Replica
= bool(State
->Lane
);
479 BasicBlock
*NewBB
= State
->CFG
.PrevBB
; // Reuse it if possible.
481 auto IsReplicateRegion
= [](VPBlockBase
*BB
) {
482 auto *R
= dyn_cast_or_null
<VPRegionBlock
>(BB
);
483 return R
&& R
->isReplicator();
486 // 1. Create an IR basic block.
487 if (this == getPlan()->getVectorPreheader() ||
488 (Replica
&& this == getParent()->getEntry()) ||
489 IsReplicateRegion(getSingleHierarchicalPredecessor())) {
490 // Reuse the previous basic block if the current VPBB is either
491 // * the vector preheader,
492 // * the entry to a replicate region, or
493 // * the exit of a replicate region.
494 State
->CFG
.VPBB2IRBB
[this] = NewBB
;
496 NewBB
= createEmptyBasicBlock(State
->CFG
);
498 State
->Builder
.SetInsertPoint(NewBB
);
499 // Temporarily terminate with unreachable until CFG is rewired.
500 UnreachableInst
*Terminator
= State
->Builder
.CreateUnreachable();
501 // Register NewBB in its loop. In innermost loops its the same for all
503 if (State
->CurrentVectorLoop
)
504 State
->CurrentVectorLoop
->addBasicBlockToLoop(NewBB
, *State
->LI
);
505 State
->Builder
.SetInsertPoint(Terminator
);
507 State
->CFG
.PrevBB
= NewBB
;
508 State
->CFG
.VPBB2IRBB
[this] = NewBB
;
509 connectToPredecessors(State
->CFG
);
512 // 2. Fill the IR basic block with IR instructions.
513 executeRecipes(State
, NewBB
);
516 void VPBasicBlock::dropAllReferences(VPValue
*NewValue
) {
517 for (VPRecipeBase
&R
: Recipes
) {
518 for (auto *Def
: R
.definedValues())
519 Def
->replaceAllUsesWith(NewValue
);
521 for (unsigned I
= 0, E
= R
.getNumOperands(); I
!= E
; I
++)
522 R
.setOperand(I
, NewValue
);
526 void VPBasicBlock::executeRecipes(VPTransformState
*State
, BasicBlock
*BB
) {
527 LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
528 << " in BB:" << BB
->getName() << '\n');
530 State
->CFG
.PrevVPBB
= this;
532 for (VPRecipeBase
&Recipe
: Recipes
)
533 Recipe
.execute(*State
);
535 LLVM_DEBUG(dbgs() << "LV: filled BB:" << *BB
);
538 VPBasicBlock
*VPBasicBlock::splitAt(iterator SplitAt
) {
539 assert((SplitAt
== end() || SplitAt
->getParent() == this) &&
540 "can only split at a position in the same block");
542 SmallVector
<VPBlockBase
*, 2> Succs(successors());
543 // Create new empty block after the block to split.
544 auto *SplitBlock
= new VPBasicBlock(getName() + ".split");
545 VPBlockUtils::insertBlockAfter(SplitBlock
, this);
547 // Finally, move the recipes starting at SplitAt to new block.
548 for (VPRecipeBase
&ToMove
:
549 make_early_inc_range(make_range(SplitAt
, this->end())))
550 ToMove
.moveBefore(*SplitBlock
, SplitBlock
->end());
555 /// Return the enclosing loop region for region \p P. The templated version is
556 /// used to support both const and non-const block arguments.
557 template <typename T
> static T
*getEnclosingLoopRegionForRegion(T
*P
) {
558 if (P
&& P
->isReplicator()) {
560 assert(!cast
<VPRegionBlock
>(P
)->isReplicator() &&
561 "unexpected nested replicate regions");
566 VPRegionBlock
*VPBasicBlock::getEnclosingLoopRegion() {
567 return getEnclosingLoopRegionForRegion(getParent());
570 const VPRegionBlock
*VPBasicBlock::getEnclosingLoopRegion() const {
571 return getEnclosingLoopRegionForRegion(getParent());
574 static bool hasConditionalTerminator(const VPBasicBlock
*VPBB
) {
577 VPBB
->getNumSuccessors() < 2 &&
578 "block with multiple successors doesn't have a recipe as terminator");
582 const VPRecipeBase
*R
= &VPBB
->back();
583 bool IsCondBranch
= isa
<VPBranchOnMaskRecipe
>(R
) ||
584 match(R
, m_BranchOnCond(m_VPValue())) ||
585 match(R
, m_BranchOnCount(m_VPValue(), m_VPValue()));
588 if (VPBB
->getNumSuccessors() >= 2 ||
589 (VPBB
->isExiting() && !VPBB
->getParent()->isReplicator())) {
590 assert(IsCondBranch
&& "block with multiple successors not terminated by "
591 "conditional branch recipe");
598 "block with 0 or 1 successors terminated by conditional branch recipe");
602 VPRecipeBase
*VPBasicBlock::getTerminator() {
603 if (hasConditionalTerminator(this))
608 const VPRecipeBase
*VPBasicBlock::getTerminator() const {
609 if (hasConditionalTerminator(this))
614 bool VPBasicBlock::isExiting() const {
615 return getParent() && getParent()->getExitingBasicBlock() == this;
618 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
619 void VPBlockBase::printSuccessors(raw_ostream
&O
, const Twine
&Indent
) const {
620 if (getSuccessors().empty()) {
621 O
<< Indent
<< "No successors\n";
623 O
<< Indent
<< "Successor(s): ";
625 for (auto *Succ
: getSuccessors())
626 O
<< LS
<< Succ
->getName();
631 void VPBasicBlock::print(raw_ostream
&O
, const Twine
&Indent
,
632 VPSlotTracker
&SlotTracker
) const {
633 O
<< Indent
<< getName() << ":\n";
635 auto RecipeIndent
= Indent
+ " ";
636 for (const VPRecipeBase
&Recipe
: *this) {
637 Recipe
.print(O
, RecipeIndent
, SlotTracker
);
641 printSuccessors(O
, Indent
);
645 static std::pair
<VPBlockBase
*, VPBlockBase
*> cloneFrom(VPBlockBase
*Entry
);
647 // Clone the CFG for all nodes reachable from \p Entry, this includes cloning
648 // the blocks and their recipes. Operands of cloned recipes will NOT be updated.
649 // Remapping of operands must be done separately. Returns a pair with the new
650 // entry and exiting blocks of the cloned region. If \p Entry isn't part of a
651 // region, return nullptr for the exiting block.
652 static std::pair
<VPBlockBase
*, VPBlockBase
*> cloneFrom(VPBlockBase
*Entry
) {
653 DenseMap
<VPBlockBase
*, VPBlockBase
*> Old2NewVPBlocks
;
654 VPBlockBase
*Exiting
= nullptr;
655 bool InRegion
= Entry
->getParent();
656 // First, clone blocks reachable from Entry.
657 for (VPBlockBase
*BB
: vp_depth_first_shallow(Entry
)) {
658 VPBlockBase
*NewBB
= BB
->clone();
659 Old2NewVPBlocks
[BB
] = NewBB
;
660 if (InRegion
&& BB
->getNumSuccessors() == 0) {
661 assert(!Exiting
&& "Multiple exiting blocks?");
665 assert((!InRegion
|| Exiting
) && "regions must have a single exiting block");
667 // Second, update the predecessors & successors of the cloned blocks.
668 for (VPBlockBase
*BB
: vp_depth_first_shallow(Entry
)) {
669 VPBlockBase
*NewBB
= Old2NewVPBlocks
[BB
];
670 SmallVector
<VPBlockBase
*> NewPreds
;
671 for (VPBlockBase
*Pred
: BB
->getPredecessors()) {
672 NewPreds
.push_back(Old2NewVPBlocks
[Pred
]);
674 NewBB
->setPredecessors(NewPreds
);
675 SmallVector
<VPBlockBase
*> NewSuccs
;
676 for (VPBlockBase
*Succ
: BB
->successors()) {
677 NewSuccs
.push_back(Old2NewVPBlocks
[Succ
]);
679 NewBB
->setSuccessors(NewSuccs
);
683 // Verify that the order of predecessors and successors matches in the cloned
685 for (const auto &[OldBB
, NewBB
] :
686 zip(vp_depth_first_shallow(Entry
),
687 vp_depth_first_shallow(Old2NewVPBlocks
[Entry
]))) {
688 for (const auto &[OldPred
, NewPred
] :
689 zip(OldBB
->getPredecessors(), NewBB
->getPredecessors()))
690 assert(NewPred
== Old2NewVPBlocks
[OldPred
] && "Different predecessors");
692 for (const auto &[OldSucc
, NewSucc
] :
693 zip(OldBB
->successors(), NewBB
->successors()))
694 assert(NewSucc
== Old2NewVPBlocks
[OldSucc
] && "Different successors");
698 return std::make_pair(Old2NewVPBlocks
[Entry
],
699 Exiting
? Old2NewVPBlocks
[Exiting
] : nullptr);
702 VPRegionBlock
*VPRegionBlock::clone() {
703 const auto &[NewEntry
, NewExiting
] = cloneFrom(getEntry());
705 new VPRegionBlock(NewEntry
, NewExiting
, getName(), isReplicator());
706 for (VPBlockBase
*Block
: vp_depth_first_shallow(NewEntry
))
707 Block
->setParent(NewRegion
);
711 void VPRegionBlock::dropAllReferences(VPValue
*NewValue
) {
712 for (VPBlockBase
*Block
: vp_depth_first_shallow(Entry
))
713 // Drop all references in VPBasicBlocks and replace all uses with
715 Block
->dropAllReferences(NewValue
);
718 void VPRegionBlock::execute(VPTransformState
*State
) {
719 ReversePostOrderTraversal
<VPBlockShallowTraversalWrapper
<VPBlockBase
*>>
722 if (!isReplicator()) {
723 // Create and register the new vector loop.
724 Loop
*PrevLoop
= State
->CurrentVectorLoop
;
725 State
->CurrentVectorLoop
= State
->LI
->AllocateLoop();
726 BasicBlock
*VectorPH
= State
->CFG
.VPBB2IRBB
[getPreheaderVPBB()];
727 Loop
*ParentLoop
= State
->LI
->getLoopFor(VectorPH
);
729 // Insert the new loop into the loop nest and register the new basic blocks
730 // before calling any utilities such as SCEV that require valid LoopInfo.
732 ParentLoop
->addChildLoop(State
->CurrentVectorLoop
);
734 State
->LI
->addTopLevelLoop(State
->CurrentVectorLoop
);
736 // Visit the VPBlocks connected to "this", starting from it.
737 for (VPBlockBase
*Block
: RPOT
) {
738 LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block
->getName() << '\n');
739 Block
->execute(State
);
742 State
->CurrentVectorLoop
= PrevLoop
;
746 assert(!State
->Lane
&& "Replicating a Region with non-null instance.");
748 // Enter replicating mode.
749 assert(!State
->VF
.isScalable() && "VF is assumed to be non scalable.");
750 State
->Lane
= VPLane(0);
751 for (unsigned Lane
= 0, VF
= State
->VF
.getKnownMinValue(); Lane
< VF
;
753 State
->Lane
= VPLane(Lane
, VPLane::Kind::First
);
754 // Visit the VPBlocks connected to \p this, starting from it.
755 for (VPBlockBase
*Block
: RPOT
) {
756 LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block
->getName() << '\n');
757 Block
->execute(State
);
761 // Exit replicating mode.
765 InstructionCost
VPBasicBlock::cost(ElementCount VF
, VPCostContext
&Ctx
) {
766 InstructionCost Cost
= 0;
767 for (VPRecipeBase
&R
: Recipes
)
768 Cost
+= R
.cost(VF
, Ctx
);
772 InstructionCost
VPRegionBlock::cost(ElementCount VF
, VPCostContext
&Ctx
) {
773 if (!isReplicator()) {
774 InstructionCost Cost
= 0;
775 for (VPBlockBase
*Block
: vp_depth_first_shallow(getEntry()))
776 Cost
+= Block
->cost(VF
, Ctx
);
777 InstructionCost BackedgeCost
=
778 ForceTargetInstructionCost
.getNumOccurrences()
779 ? InstructionCost(ForceTargetInstructionCost
.getNumOccurrences())
780 : Ctx
.TTI
.getCFInstrCost(Instruction::Br
, TTI::TCK_RecipThroughput
);
781 LLVM_DEBUG(dbgs() << "Cost of " << BackedgeCost
<< " for VF " << VF
782 << ": vector loop backedge\n");
783 Cost
+= BackedgeCost
;
787 // Compute the cost of a replicate region. Replicating isn't supported for
788 // scalable vectors, return an invalid cost for them.
789 // TODO: Discard scalable VPlans with replicate recipes earlier after
792 return InstructionCost::getInvalid();
794 // First compute the cost of the conditionally executed recipes, followed by
795 // account for the branching cost, except if the mask is a header mask or
796 // uniform condition.
797 using namespace llvm::VPlanPatternMatch
;
798 VPBasicBlock
*Then
= cast
<VPBasicBlock
>(getEntry()->getSuccessors()[0]);
799 InstructionCost ThenCost
= Then
->cost(VF
, Ctx
);
801 // For the scalar case, we may not always execute the original predicated
802 // block, Thus, scale the block's cost by the probability of executing it.
804 return ThenCost
/ getReciprocalPredBlockProb();
809 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
810 void VPRegionBlock::print(raw_ostream
&O
, const Twine
&Indent
,
811 VPSlotTracker
&SlotTracker
) const {
812 O
<< Indent
<< (isReplicator() ? "<xVFxUF> " : "<x1> ") << getName() << ": {";
813 auto NewIndent
= Indent
+ " ";
814 for (auto *BlockBase
: vp_depth_first_shallow(Entry
)) {
816 BlockBase
->print(O
, NewIndent
, SlotTracker
);
818 O
<< Indent
<< "}\n";
820 printSuccessors(O
, Indent
);
824 VPlan::VPlan(VPBasicBlock
*OriginalPreheader
, VPValue
*TC
,
825 VPBasicBlock
*EntryVectorPreHeader
, VPIRBasicBlock
*ScalarHeader
)
826 : VPlan(OriginalPreheader
, TC
, ScalarHeader
) {
827 VPBlockUtils::connectBlocks(OriginalPreheader
, EntryVectorPreHeader
);
830 VPlan::VPlan(VPBasicBlock
*OriginalPreheader
,
831 VPBasicBlock
*EntryVectorPreHeader
, VPIRBasicBlock
*ScalarHeader
)
832 : VPlan(OriginalPreheader
, ScalarHeader
) {
833 VPBlockUtils::connectBlocks(OriginalPreheader
, EntryVectorPreHeader
);
839 for (VPBlockBase
*Block
: vp_depth_first_shallow(Entry
))
840 Block
->dropAllReferences(&DummyValue
);
842 VPBlockBase::deleteCFG(Entry
);
844 for (VPValue
*VPV
: VPLiveInsToFree
)
846 if (BackedgeTakenCount
)
847 delete BackedgeTakenCount
;
850 VPIRBasicBlock
*VPIRBasicBlock::fromBasicBlock(BasicBlock
*IRBB
) {
851 auto *VPIRBB
= new VPIRBasicBlock(IRBB
);
852 for (Instruction
&I
:
853 make_range(IRBB
->begin(), IRBB
->getTerminator()->getIterator()))
854 VPIRBB
->appendRecipe(new VPIRInstruction(I
));
858 VPlanPtr
VPlan::createInitialVPlan(Type
*InductionTy
,
859 PredicatedScalarEvolution
&PSE
,
860 bool RequiresScalarEpilogueCheck
,
861 bool TailFolded
, Loop
*TheLoop
) {
862 VPIRBasicBlock
*Entry
=
863 VPIRBasicBlock::fromBasicBlock(TheLoop
->getLoopPreheader());
864 VPBasicBlock
*VecPreheader
= new VPBasicBlock("vector.ph");
865 // Connect entry only to vector preheader initially. Entry will also be
866 // connected to the scalar preheader later, during skeleton creation when
867 // runtime guards are added as needed. Note that when executing the VPlan for
868 // an epilogue vector loop, the original entry block here will be replaced by
869 // a new VPIRBasicBlock wrapping the entry to the epilogue vector loop after
870 // generating code for the main vector loop.
871 VPBlockUtils::connectBlocks(Entry
, VecPreheader
);
872 VPIRBasicBlock
*ScalarHeader
=
873 VPIRBasicBlock::fromBasicBlock(TheLoop
->getHeader());
874 auto Plan
= std::make_unique
<VPlan
>(Entry
, ScalarHeader
);
876 // Create SCEV and VPValue for the trip count.
877 // We use the symbolic max backedge-taken-count, which works also when
878 // vectorizing loops with uncountable early exits.
879 const SCEV
*BackedgeTakenCountSCEV
= PSE
.getSymbolicMaxBackedgeTakenCount();
880 assert(!isa
<SCEVCouldNotCompute
>(BackedgeTakenCountSCEV
) &&
881 "Invalid loop count");
882 ScalarEvolution
&SE
= *PSE
.getSE();
883 const SCEV
*TripCount
= SE
.getTripCountFromExitCount(BackedgeTakenCountSCEV
,
884 InductionTy
, TheLoop
);
886 vputils::getOrCreateVPValueForSCEVExpr(*Plan
, TripCount
, SE
);
888 // Create VPRegionBlock, with empty header and latch blocks, to be filled
889 // during processing later.
890 VPBasicBlock
*HeaderVPBB
= new VPBasicBlock("vector.body");
891 VPBasicBlock
*LatchVPBB
= new VPBasicBlock("vector.latch");
892 VPBlockUtils::insertBlockAfter(LatchVPBB
, HeaderVPBB
);
893 auto *TopRegion
= new VPRegionBlock(HeaderVPBB
, LatchVPBB
, "vector loop",
894 false /*isReplicator*/);
896 VPBlockUtils::insertBlockAfter(TopRegion
, VecPreheader
);
897 VPBasicBlock
*MiddleVPBB
= new VPBasicBlock("middle.block");
898 VPBlockUtils::insertBlockAfter(MiddleVPBB
, TopRegion
);
900 VPBasicBlock
*ScalarPH
= new VPBasicBlock("scalar.ph");
901 VPBlockUtils::connectBlocks(ScalarPH
, ScalarHeader
);
902 if (!RequiresScalarEpilogueCheck
) {
903 VPBlockUtils::connectBlocks(MiddleVPBB
, ScalarPH
);
907 // If needed, add a check in the middle block to see if we have completed
908 // all of the iterations in the first vector loop. Three cases:
909 // 1) If (N - N%VF) == N, then we *don't* need to run the remainder.
910 // Thus if tail is to be folded, we know we don't need to run the
911 // remainder and we can set the condition to true.
912 // 2) If we require a scalar epilogue, there is no conditional branch as
913 // we unconditionally branch to the scalar preheader. Do nothing.
914 // 3) Otherwise, construct a runtime check.
915 BasicBlock
*IRExitBlock
= TheLoop
->getUniqueLatchExitBlock();
916 auto *VPExitBlock
= VPIRBasicBlock::fromBasicBlock(IRExitBlock
);
917 // The connection order corresponds to the operands of the conditional branch.
918 VPBlockUtils::insertBlockAfter(VPExitBlock
, MiddleVPBB
);
919 VPBlockUtils::connectBlocks(MiddleVPBB
, ScalarPH
);
921 auto *ScalarLatchTerm
= TheLoop
->getLoopLatch()->getTerminator();
922 // Here we use the same DebugLoc as the scalar loop latch terminator instead
923 // of the corresponding compare because they may have ended up with
924 // different line numbers and we want to avoid awkward line stepping while
925 // debugging. Eg. if the compare has got a line number inside the loop.
926 VPBuilder
Builder(MiddleVPBB
);
929 ? Plan
->getOrAddLiveIn(ConstantInt::getTrue(
930 IntegerType::getInt1Ty(TripCount
->getType()->getContext())))
931 : Builder
.createICmp(CmpInst::ICMP_EQ
, Plan
->getTripCount(),
932 &Plan
->getVectorTripCount(),
933 ScalarLatchTerm
->getDebugLoc(), "cmp.n");
934 Builder
.createNaryOp(VPInstruction::BranchOnCond
, {Cmp
},
935 ScalarLatchTerm
->getDebugLoc());
939 void VPlan::prepareToExecute(Value
*TripCountV
, Value
*VectorTripCountV
,
940 VPTransformState
&State
) {
941 Type
*TCTy
= TripCountV
->getType();
942 // Check if the backedge taken count is needed, and if so build it.
943 if (BackedgeTakenCount
&& BackedgeTakenCount
->getNumUsers()) {
944 IRBuilder
<> Builder(State
.CFG
.PrevBB
->getTerminator());
945 auto *TCMO
= Builder
.CreateSub(TripCountV
, ConstantInt::get(TCTy
, 1),
946 "trip.count.minus.1");
947 BackedgeTakenCount
->setUnderlyingValue(TCMO
);
950 VectorTripCount
.setUnderlyingValue(VectorTripCountV
);
952 IRBuilder
<> Builder(State
.CFG
.PrevBB
->getTerminator());
953 // FIXME: Model VF * UF computation completely in VPlan.
954 assert(VFxUF
.getNumUsers() && "VFxUF expected to always have users");
955 unsigned UF
= getUF();
956 if (VF
.getNumUsers()) {
957 Value
*RuntimeVF
= getRuntimeVF(Builder
, TCTy
, State
.VF
);
958 VF
.setUnderlyingValue(RuntimeVF
);
959 VFxUF
.setUnderlyingValue(
960 UF
> 1 ? Builder
.CreateMul(RuntimeVF
, ConstantInt::get(TCTy
, UF
))
963 VFxUF
.setUnderlyingValue(createStepForVF(Builder
, TCTy
, State
.VF
, UF
));
967 /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
968 /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
969 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
970 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
971 static void replaceVPBBWithIRVPBB(VPBasicBlock
*VPBB
, BasicBlock
*IRBB
) {
972 VPIRBasicBlock
*IRVPBB
= VPIRBasicBlock::fromBasicBlock(IRBB
);
973 for (auto &R
: make_early_inc_range(*VPBB
)) {
974 assert(!R
.isPhi() && "Tried to move phi recipe to end of block");
975 R
.moveBefore(*IRVPBB
, IRVPBB
->end());
978 VPBlockUtils::reassociateBlocks(VPBB
, IRVPBB
);
983 /// Generate the code inside the preheader and body of the vectorized loop.
984 /// Assumes a single pre-header basic-block was created for this. Introduce
985 /// additional basic-blocks as needed, and fill them all.
986 void VPlan::execute(VPTransformState
*State
) {
987 // Initialize CFG state.
988 State
->CFG
.PrevVPBB
= nullptr;
989 State
->CFG
.ExitBB
= State
->CFG
.PrevBB
->getSingleSuccessor();
990 BasicBlock
*VectorPreHeader
= State
->CFG
.PrevBB
;
991 State
->Builder
.SetInsertPoint(VectorPreHeader
->getTerminator());
993 // Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
994 cast
<BranchInst
>(VectorPreHeader
->getTerminator())->setSuccessor(0, nullptr);
995 State
->CFG
.DTU
.applyUpdates(
996 {{DominatorTree::Delete
, VectorPreHeader
, State
->CFG
.ExitBB
}});
998 // Replace regular VPBB's for the vector preheader, middle and scalar
999 // preheader blocks with VPIRBasicBlocks wrapping their IR blocks. The IR
1000 // blocks are created during skeleton creation, so we can only create the
1001 // VPIRBasicBlocks now during VPlan execution rather than earlier during VPlan
1003 BasicBlock
*MiddleBB
= State
->CFG
.ExitBB
;
1004 BasicBlock
*ScalarPh
= MiddleBB
->getSingleSuccessor();
1005 replaceVPBBWithIRVPBB(getVectorPreheader(), VectorPreHeader
);
1006 replaceVPBBWithIRVPBB(getMiddleBlock(), MiddleBB
);
1007 replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh
);
1009 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State
->VF
1010 << ", UF=" << getUF() << '\n');
1011 setName("Final VPlan");
1014 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State
->VF
1015 << ", UF=" << getUF() << '\n');
1016 setName("Final VPlan");
1019 // Disconnect the middle block from its single successor (the scalar loop
1020 // header) in both the CFG and DT. The branch will be recreated during VPlan
1022 auto *BrInst
= new UnreachableInst(MiddleBB
->getContext());
1023 BrInst
->insertBefore(MiddleBB
->getTerminator());
1024 MiddleBB
->getTerminator()->eraseFromParent();
1025 State
->CFG
.DTU
.applyUpdates({{DominatorTree::Delete
, MiddleBB
, ScalarPh
}});
1026 // Disconnect scalar preheader and scalar header, as the dominator tree edge
1027 // will be updated as part of VPlan execution. This allows keeping the DTU
1028 // logic generic during VPlan execution.
1029 State
->CFG
.DTU
.applyUpdates(
1030 {{DominatorTree::Delete
, ScalarPh
, ScalarPh
->getSingleSuccessor()}});
1032 ReversePostOrderTraversal
<VPBlockShallowTraversalWrapper
<VPBlockBase
*>> RPOT(
1034 // Generate code for the VPlan, in parts of the vector skeleton, loop body and
1035 // successor blocks including the middle, exit and scalar preheader blocks.
1036 for (VPBlockBase
*Block
: RPOT
)
1037 Block
->execute(State
);
1039 VPBasicBlock
*LatchVPBB
= getVectorLoopRegion()->getExitingBasicBlock();
1040 BasicBlock
*VectorLatchBB
= State
->CFG
.VPBB2IRBB
[LatchVPBB
];
1042 // Fix the latch value of canonical, reduction and first-order recurrences
1043 // phis in the vector loop.
1044 VPBasicBlock
*Header
= getVectorLoopRegion()->getEntryBasicBlock();
1045 for (VPRecipeBase
&R
: Header
->phis()) {
1046 // Skip phi-like recipes that generate their backedege values themselves.
1047 if (isa
<VPWidenPHIRecipe
>(&R
))
1050 if (isa
<VPWidenPointerInductionRecipe
, VPWidenIntOrFpInductionRecipe
>(&R
)) {
1051 PHINode
*Phi
= nullptr;
1052 if (isa
<VPWidenIntOrFpInductionRecipe
>(&R
)) {
1053 Phi
= cast
<PHINode
>(State
->get(R
.getVPSingleValue()));
1055 auto *WidenPhi
= cast
<VPWidenPointerInductionRecipe
>(&R
);
1056 assert(!WidenPhi
->onlyScalarsGenerated(State
->VF
.isScalable()) &&
1057 "recipe generating only scalars should have been replaced");
1058 auto *GEP
= cast
<GetElementPtrInst
>(State
->get(WidenPhi
));
1059 Phi
= cast
<PHINode
>(GEP
->getPointerOperand());
1062 Phi
->setIncomingBlock(1, VectorLatchBB
);
1064 // Move the last step to the end of the latch block. This ensures
1065 // consistent placement of all induction updates.
1066 Instruction
*Inc
= cast
<Instruction
>(Phi
->getIncomingValue(1));
1067 Inc
->moveBefore(VectorLatchBB
->getTerminator()->getPrevNode());
1069 // Use the steps for the last part as backedge value for the induction.
1070 if (auto *IV
= dyn_cast
<VPWidenIntOrFpInductionRecipe
>(&R
))
1071 Inc
->setOperand(0, State
->get(IV
->getLastUnrolledPartOperand()));
1075 auto *PhiR
= cast
<VPHeaderPHIRecipe
>(&R
);
1076 bool NeedsScalar
= isa
<VPScalarPHIRecipe
>(PhiR
) ||
1077 (isa
<VPReductionPHIRecipe
>(PhiR
) &&
1078 cast
<VPReductionPHIRecipe
>(PhiR
)->isInLoop());
1079 Value
*Phi
= State
->get(PhiR
, NeedsScalar
);
1080 Value
*Val
= State
->get(PhiR
->getBackedgeValue(), NeedsScalar
);
1081 cast
<PHINode
>(Phi
)->addIncoming(Val
, VectorLatchBB
);
1084 State
->CFG
.DTU
.flush();
1087 InstructionCost
VPlan::cost(ElementCount VF
, VPCostContext
&Ctx
) {
1088 // For now only return the cost of the vector loop region, ignoring any other
1089 // blocks, like the preheader or middle blocks.
1090 return getVectorLoopRegion()->cost(VF
, Ctx
);
1093 VPRegionBlock
*VPlan::getVectorLoopRegion() {
1094 // TODO: Cache if possible.
1095 for (VPBlockBase
*B
: vp_depth_first_shallow(getEntry()))
1096 if (auto *R
= dyn_cast
<VPRegionBlock
>(B
))
1101 const VPRegionBlock
*VPlan::getVectorLoopRegion() const {
1102 for (const VPBlockBase
*B
: vp_depth_first_shallow(getEntry()))
1103 if (auto *R
= dyn_cast
<VPRegionBlock
>(B
))
1108 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1109 void VPlan::printLiveIns(raw_ostream
&O
) const {
1110 VPSlotTracker
SlotTracker(this);
1112 if (VF
.getNumUsers() > 0) {
1114 VF
.printAsOperand(O
, SlotTracker
);
1118 if (VFxUF
.getNumUsers() > 0) {
1120 VFxUF
.printAsOperand(O
, SlotTracker
);
1124 if (VectorTripCount
.getNumUsers() > 0) {
1126 VectorTripCount
.printAsOperand(O
, SlotTracker
);
1127 O
<< " = vector-trip-count";
1130 if (BackedgeTakenCount
&& BackedgeTakenCount
->getNumUsers()) {
1132 BackedgeTakenCount
->printAsOperand(O
, SlotTracker
);
1133 O
<< " = backedge-taken count";
1137 if (TripCount
->isLiveIn())
1139 TripCount
->printAsOperand(O
, SlotTracker
);
1140 O
<< " = original trip-count";
1145 void VPlan::print(raw_ostream
&O
) const {
1146 VPSlotTracker
SlotTracker(this);
1148 O
<< "VPlan '" << getName() << "' {";
1152 ReversePostOrderTraversal
<VPBlockShallowTraversalWrapper
<const VPBlockBase
*>>
1154 for (const VPBlockBase
*Block
: RPOT
) {
1156 Block
->print(O
, "", SlotTracker
);
1162 std::string
VPlan::getName() const {
1164 raw_string_ostream
RSO(Out
);
1165 RSO
<< Name
<< " for ";
1167 RSO
<< "VF={" << VFs
[0];
1168 for (ElementCount VF
: drop_begin(VFs
))
1176 RSO
<< "UF={" << UFs
[0];
1177 for (unsigned UF
: drop_begin(UFs
))
1186 void VPlan::printDOT(raw_ostream
&O
) const {
1187 VPlanPrinter
Printer(O
, *this);
1192 void VPlan::dump() const { print(dbgs()); }
1195 static void remapOperands(VPBlockBase
*Entry
, VPBlockBase
*NewEntry
,
1196 DenseMap
<VPValue
*, VPValue
*> &Old2NewVPValues
) {
1197 // Update the operands of all cloned recipes starting at NewEntry. This
1198 // traverses all reachable blocks. This is done in two steps, to handle cycles
1200 ReversePostOrderTraversal
<VPBlockDeepTraversalWrapper
<VPBlockBase
*>>
1202 ReversePostOrderTraversal
<VPBlockDeepTraversalWrapper
<VPBlockBase
*>>
1203 NewDeepRPOT(NewEntry
);
1204 // First, collect all mappings from old to new VPValues defined by cloned
1206 for (const auto &[OldBB
, NewBB
] :
1207 zip(VPBlockUtils::blocksOnly
<VPBasicBlock
>(OldDeepRPOT
),
1208 VPBlockUtils::blocksOnly
<VPBasicBlock
>(NewDeepRPOT
))) {
1209 assert(OldBB
->getRecipeList().size() == NewBB
->getRecipeList().size() &&
1210 "blocks must have the same number of recipes");
1211 for (const auto &[OldR
, NewR
] : zip(*OldBB
, *NewBB
)) {
1212 assert(OldR
.getNumOperands() == NewR
.getNumOperands() &&
1213 "recipes must have the same number of operands");
1214 assert(OldR
.getNumDefinedValues() == NewR
.getNumDefinedValues() &&
1215 "recipes must define the same number of operands");
1216 for (const auto &[OldV
, NewV
] :
1217 zip(OldR
.definedValues(), NewR
.definedValues()))
1218 Old2NewVPValues
[OldV
] = NewV
;
1222 // Update all operands to use cloned VPValues.
1223 for (VPBasicBlock
*NewBB
:
1224 VPBlockUtils::blocksOnly
<VPBasicBlock
>(NewDeepRPOT
)) {
1225 for (VPRecipeBase
&NewR
: *NewBB
)
1226 for (unsigned I
= 0, E
= NewR
.getNumOperands(); I
!= E
; ++I
) {
1227 VPValue
*NewOp
= Old2NewVPValues
.lookup(NewR
.getOperand(I
));
1228 NewR
.setOperand(I
, NewOp
);
1233 VPlan
*VPlan::duplicate() {
1235 const auto &[NewEntry
, __
] = cloneFrom(Entry
);
1237 BasicBlock
*ScalarHeaderIRBB
= getScalarHeader()->getIRBasicBlock();
1238 VPIRBasicBlock
*NewScalarHeader
= cast
<VPIRBasicBlock
>(*find_if(
1239 vp_depth_first_shallow(NewEntry
), [ScalarHeaderIRBB
](VPBlockBase
*VPB
) {
1240 auto *VPIRBB
= dyn_cast
<VPIRBasicBlock
>(VPB
);
1241 return VPIRBB
&& VPIRBB
->getIRBasicBlock() == ScalarHeaderIRBB
;
1243 // Create VPlan, clone live-ins and remap operands in the cloned blocks.
1244 auto *NewPlan
= new VPlan(cast
<VPBasicBlock
>(NewEntry
), NewScalarHeader
);
1245 DenseMap
<VPValue
*, VPValue
*> Old2NewVPValues
;
1246 for (VPValue
*OldLiveIn
: VPLiveInsToFree
) {
1247 Old2NewVPValues
[OldLiveIn
] =
1248 NewPlan
->getOrAddLiveIn(OldLiveIn
->getLiveInIRValue());
1250 Old2NewVPValues
[&VectorTripCount
] = &NewPlan
->VectorTripCount
;
1251 Old2NewVPValues
[&VF
] = &NewPlan
->VF
;
1252 Old2NewVPValues
[&VFxUF
] = &NewPlan
->VFxUF
;
1253 if (BackedgeTakenCount
) {
1254 NewPlan
->BackedgeTakenCount
= new VPValue();
1255 Old2NewVPValues
[BackedgeTakenCount
] = NewPlan
->BackedgeTakenCount
;
1257 assert(TripCount
&& "trip count must be set");
1258 if (TripCount
->isLiveIn())
1259 Old2NewVPValues
[TripCount
] =
1260 NewPlan
->getOrAddLiveIn(TripCount
->getLiveInIRValue());
1261 // else NewTripCount will be created and inserted into Old2NewVPValues when
1262 // TripCount is cloned. In any case NewPlan->TripCount is updated below.
1264 remapOperands(Entry
, NewEntry
, Old2NewVPValues
);
1266 // Initialize remaining fields of cloned VPlan.
1269 // TODO: Adjust names.
1270 NewPlan
->Name
= Name
;
1271 assert(Old2NewVPValues
.contains(TripCount
) &&
1272 "TripCount must have been added to Old2NewVPValues");
1273 NewPlan
->TripCount
= Old2NewVPValues
[TripCount
];
1277 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1279 Twine
VPlanPrinter::getUID(const VPBlockBase
*Block
) {
1280 return (isa
<VPRegionBlock
>(Block
) ? "cluster_N" : "N") +
1281 Twine(getOrCreateBID(Block
));
1284 Twine
VPlanPrinter::getOrCreateName(const VPBlockBase
*Block
) {
1285 const std::string
&Name
= Block
->getName();
1288 return "VPB" + Twine(getOrCreateBID(Block
));
1291 void VPlanPrinter::dump() {
1294 OS
<< "digraph VPlan {\n";
1295 OS
<< "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
1296 if (!Plan
.getName().empty())
1297 OS
<< "\\n" << DOT::EscapeString(Plan
.getName());
1302 raw_string_ostream
SS(Str
);
1303 Plan
.printLiveIns(SS
);
1304 SmallVector
<StringRef
, 0> Lines
;
1305 StringRef(Str
).rtrim('\n').split(Lines
, "\n");
1306 for (auto Line
: Lines
)
1307 OS
<< DOT::EscapeString(Line
.str()) << "\\n";
1311 OS
<< "node [shape=rect, fontname=Courier, fontsize=30]\n";
1312 OS
<< "edge [fontname=Courier, fontsize=30]\n";
1313 OS
<< "compound=true\n";
1315 for (const VPBlockBase
*Block
: vp_depth_first_shallow(Plan
.getEntry()))
1321 void VPlanPrinter::dumpBlock(const VPBlockBase
*Block
) {
1322 if (const VPBasicBlock
*BasicBlock
= dyn_cast
<VPBasicBlock
>(Block
))
1323 dumpBasicBlock(BasicBlock
);
1324 else if (const VPRegionBlock
*Region
= dyn_cast
<VPRegionBlock
>(Block
))
1327 llvm_unreachable("Unsupported kind of VPBlock.");
1330 void VPlanPrinter::drawEdge(const VPBlockBase
*From
, const VPBlockBase
*To
,
1331 bool Hidden
, const Twine
&Label
) {
1332 // Due to "dot" we print an edge between two regions as an edge between the
1333 // exiting basic block and the entry basic of the respective regions.
1334 const VPBlockBase
*Tail
= From
->getExitingBasicBlock();
1335 const VPBlockBase
*Head
= To
->getEntryBasicBlock();
1336 OS
<< Indent
<< getUID(Tail
) << " -> " << getUID(Head
);
1337 OS
<< " [ label=\"" << Label
<< '\"';
1339 OS
<< " ltail=" << getUID(From
);
1341 OS
<< " lhead=" << getUID(To
);
1343 OS
<< "; splines=none";
1347 void VPlanPrinter::dumpEdges(const VPBlockBase
*Block
) {
1348 auto &Successors
= Block
->getSuccessors();
1349 if (Successors
.size() == 1)
1350 drawEdge(Block
, Successors
.front(), false, "");
1351 else if (Successors
.size() == 2) {
1352 drawEdge(Block
, Successors
.front(), false, "T");
1353 drawEdge(Block
, Successors
.back(), false, "F");
1355 unsigned SuccessorNumber
= 0;
1356 for (auto *Successor
: Successors
)
1357 drawEdge(Block
, Successor
, false, Twine(SuccessorNumber
++));
1361 void VPlanPrinter::dumpBasicBlock(const VPBasicBlock
*BasicBlock
) {
1362 // Implement dot-formatted dump by performing plain-text dump into the
1363 // temporary storage followed by some post-processing.
1364 OS
<< Indent
<< getUID(BasicBlock
) << " [label =\n";
1367 raw_string_ostream
SS(Str
);
1368 // Use no indentation as we need to wrap the lines into quotes ourselves.
1369 BasicBlock
->print(SS
, "", SlotTracker
);
1371 // We need to process each line of the output separately, so split
1372 // single-string plain-text dump.
1373 SmallVector
<StringRef
, 0> Lines
;
1374 StringRef(Str
).rtrim('\n').split(Lines
, "\n");
1376 auto EmitLine
= [&](StringRef Line
, StringRef Suffix
) {
1377 OS
<< Indent
<< '"' << DOT::EscapeString(Line
.str()) << "\\l\"" << Suffix
;
1380 // Don't need the "+" after the last line.
1381 for (auto Line
: make_range(Lines
.begin(), Lines
.end() - 1))
1382 EmitLine(Line
, " +\n");
1383 EmitLine(Lines
.back(), "\n");
1386 OS
<< Indent
<< "]\n";
1388 dumpEdges(BasicBlock
);
1391 void VPlanPrinter::dumpRegion(const VPRegionBlock
*Region
) {
1392 OS
<< Indent
<< "subgraph " << getUID(Region
) << " {\n";
1394 OS
<< Indent
<< "fontname=Courier\n"
1395 << Indent
<< "label=\""
1396 << DOT::EscapeString(Region
->isReplicator() ? "<xVFxUF> " : "<x1> ")
1397 << DOT::EscapeString(Region
->getName()) << "\"\n";
1398 // Dump the blocks of the region.
1399 assert(Region
->getEntry() && "Region contains no inner blocks.");
1400 for (const VPBlockBase
*Block
: vp_depth_first_shallow(Region
->getEntry()))
1403 OS
<< Indent
<< "}\n";
1407 void VPlanIngredient::print(raw_ostream
&O
) const {
1408 if (auto *Inst
= dyn_cast
<Instruction
>(V
)) {
1409 if (!Inst
->getType()->isVoidTy()) {
1410 Inst
->printAsOperand(O
, false);
1413 O
<< Inst
->getOpcodeName() << " ";
1414 unsigned E
= Inst
->getNumOperands();
1416 Inst
->getOperand(0)->printAsOperand(O
, false);
1417 for (unsigned I
= 1; I
< E
; ++I
)
1418 Inst
->getOperand(I
)->printAsOperand(O
<< ", ", false);
1421 V
->printAsOperand(O
, false);
1426 bool VPValue::isDefinedOutsideLoopRegions() const {
1427 return !hasDefiningRecipe() ||
1428 !getDefiningRecipe()->getParent()->getEnclosingLoopRegion();
1431 void VPValue::replaceAllUsesWith(VPValue
*New
) {
1432 replaceUsesWithIf(New
, [](VPUser
&, unsigned) { return true; });
1435 void VPValue::replaceUsesWithIf(
1437 llvm::function_ref
<bool(VPUser
&U
, unsigned Idx
)> ShouldReplace
) {
1438 // Note that this early exit is required for correctness; the implementation
1439 // below relies on the number of users for this VPValue to decrease, which
1440 // isn't the case if this == New.
1444 for (unsigned J
= 0; J
< getNumUsers();) {
1445 VPUser
*User
= Users
[J
];
1446 bool RemovedUser
= false;
1447 for (unsigned I
= 0, E
= User
->getNumOperands(); I
< E
; ++I
) {
1448 if (User
->getOperand(I
) != this || !ShouldReplace(*User
, I
))
1452 User
->setOperand(I
, New
);
1454 // If a user got removed after updating the current user, the next user to
1455 // update will be moved to the current position, so we only need to
1456 // increment the index if the number of users did not change.
1462 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1463 void VPValue::printAsOperand(raw_ostream
&OS
, VPSlotTracker
&Tracker
) const {
1464 OS
<< Tracker
.getOrCreateName(this);
1467 void VPUser::printOperands(raw_ostream
&O
, VPSlotTracker
&SlotTracker
) const {
1468 interleaveComma(operands(), O
, [&O
, &SlotTracker
](VPValue
*Op
) {
1469 Op
->printAsOperand(O
, SlotTracker
);
1474 void VPInterleavedAccessInfo::visitRegion(VPRegionBlock
*Region
,
1476 InterleavedAccessInfo
&IAI
) {
1477 ReversePostOrderTraversal
<VPBlockShallowTraversalWrapper
<VPBlockBase
*>>
1478 RPOT(Region
->getEntry());
1479 for (VPBlockBase
*Base
: RPOT
) {
1480 visitBlock(Base
, Old2New
, IAI
);
1484 void VPInterleavedAccessInfo::visitBlock(VPBlockBase
*Block
, Old2NewTy
&Old2New
,
1485 InterleavedAccessInfo
&IAI
) {
1486 if (VPBasicBlock
*VPBB
= dyn_cast
<VPBasicBlock
>(Block
)) {
1487 for (VPRecipeBase
&VPI
: *VPBB
) {
1488 if (isa
<VPWidenPHIRecipe
>(&VPI
))
1490 assert(isa
<VPInstruction
>(&VPI
) && "Can only handle VPInstructions");
1491 auto *VPInst
= cast
<VPInstruction
>(&VPI
);
1493 auto *Inst
= dyn_cast_or_null
<Instruction
>(VPInst
->getUnderlyingValue());
1496 auto *IG
= IAI
.getInterleaveGroup(Inst
);
1500 auto NewIGIter
= Old2New
.find(IG
);
1501 if (NewIGIter
== Old2New
.end())
1502 Old2New
[IG
] = new InterleaveGroup
<VPInstruction
>(
1503 IG
->getFactor(), IG
->isReverse(), IG
->getAlign());
1505 if (Inst
== IG
->getInsertPos())
1506 Old2New
[IG
]->setInsertPos(VPInst
);
1508 InterleaveGroupMap
[VPInst
] = Old2New
[IG
];
1509 InterleaveGroupMap
[VPInst
]->insertMember(
1510 VPInst
, IG
->getIndex(Inst
),
1511 Align(IG
->isReverse() ? (-1) * int(IG
->getFactor())
1512 : IG
->getFactor()));
1514 } else if (VPRegionBlock
*Region
= dyn_cast
<VPRegionBlock
>(Block
))
1515 visitRegion(Region
, Old2New
, IAI
);
1517 llvm_unreachable("Unsupported kind of VPBlock.");
1520 VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan
&Plan
,
1521 InterleavedAccessInfo
&IAI
) {
1523 visitRegion(Plan
.getVectorLoopRegion(), Old2New
, IAI
);
1526 void VPSlotTracker::assignName(const VPValue
*V
) {
1527 assert(!VPValue2Name
.contains(V
) && "VPValue already has a name!");
1528 auto *UV
= V
->getUnderlyingValue();
1529 auto *VPI
= dyn_cast_or_null
<VPInstruction
>(V
->getDefiningRecipe());
1530 if (!UV
&& !(VPI
&& !VPI
->getName().empty())) {
1531 VPValue2Name
[V
] = (Twine("vp<%") + Twine(NextSlot
) + ">").str();
1536 // Use the name of the underlying Value, wrapped in "ir<>", and versioned by
1537 // appending ".Number" to the name if there are multiple uses.
1540 raw_string_ostream
S(Name
);
1541 UV
->printAsOperand(S
, false);
1543 Name
= VPI
->getName();
1545 assert(!Name
.empty() && "Name cannot be empty.");
1546 StringRef Prefix
= UV
? "ir<" : "vp<%";
1547 std::string BaseName
= (Twine(Prefix
) + Name
+ Twine(">")).str();
1549 // First assign the base name for V.
1550 const auto &[A
, _
] = VPValue2Name
.insert({V
, BaseName
});
1551 // Integer or FP constants with different types will result in he same string
1552 // due to stripping types.
1553 if (V
->isLiveIn() && isa
<ConstantInt
, ConstantFP
>(UV
))
1556 // If it is already used by C > 0 other VPValues, increase the version counter
1557 // C and use it for V.
1558 const auto &[C
, UseInserted
] = BaseName2Version
.insert({BaseName
, 0});
1561 A
->second
= (BaseName
+ Twine(".") + Twine(C
->second
)).str();
1565 void VPSlotTracker::assignNames(const VPlan
&Plan
) {
1566 if (Plan
.VF
.getNumUsers() > 0)
1567 assignName(&Plan
.VF
);
1568 if (Plan
.VFxUF
.getNumUsers() > 0)
1569 assignName(&Plan
.VFxUF
);
1570 assignName(&Plan
.VectorTripCount
);
1571 if (Plan
.BackedgeTakenCount
)
1572 assignName(Plan
.BackedgeTakenCount
);
1573 for (VPValue
*LI
: Plan
.VPLiveInsToFree
)
1576 ReversePostOrderTraversal
<VPBlockDeepTraversalWrapper
<const VPBlockBase
*>>
1577 RPOT(VPBlockDeepTraversalWrapper
<const VPBlockBase
*>(Plan
.getEntry()));
1578 for (const VPBasicBlock
*VPBB
:
1579 VPBlockUtils::blocksOnly
<const VPBasicBlock
>(RPOT
))
1583 void VPSlotTracker::assignNames(const VPBasicBlock
*VPBB
) {
1584 for (const VPRecipeBase
&Recipe
: *VPBB
)
1585 for (VPValue
*Def
: Recipe
.definedValues())
1589 std::string
VPSlotTracker::getOrCreateName(const VPValue
*V
) const {
1590 std::string Name
= VPValue2Name
.lookup(V
);
1594 // If no name was assigned, no VPlan was provided when creating the slot
1595 // tracker or it is not reachable from the provided VPlan. This can happen,
1596 // e.g. when trying to print a recipe that has not been inserted into a VPlan
1598 // TODO: Update VPSlotTracker constructor to assign names to recipes &
1599 // VPValues not associated with a VPlan, instead of constructing names ad-hoc
1601 const VPRecipeBase
*DefR
= V
->getDefiningRecipe();
1603 assert((!DefR
|| !DefR
->getParent() || !DefR
->getParent()->getPlan()) &&
1604 "VPValue defined by a recipe in a VPlan?");
1606 // Use the underlying value's name, if there is one.
1607 if (auto *UV
= V
->getUnderlyingValue()) {
1609 raw_string_ostream
S(Name
);
1610 UV
->printAsOperand(S
, false);
1611 return (Twine("ir<") + Name
+ ">").str();
1617 bool LoopVectorizationPlanner::getDecisionAndClampRange(
1618 const std::function
<bool(ElementCount
)> &Predicate
, VFRange
&Range
) {
1619 assert(!Range
.isEmpty() && "Trying to test an empty VF range.");
1620 bool PredicateAtRangeStart
= Predicate(Range
.Start
);
1622 for (ElementCount TmpVF
: VFRange(Range
.Start
* 2, Range
.End
))
1623 if (Predicate(TmpVF
) != PredicateAtRangeStart
) {
1628 return PredicateAtRangeStart
;
1631 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
1632 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
1633 /// of VF's starting at a given VF and extending it as much as possible. Each
1634 /// vectorization decision can potentially shorten this sub-range during
1636 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF
,
1637 ElementCount MaxVF
) {
1638 auto MaxVFTimes2
= MaxVF
* 2;
1639 for (ElementCount VF
= MinVF
; ElementCount::isKnownLT(VF
, MaxVFTimes2
);) {
1640 VFRange SubRange
= {VF
, MaxVFTimes2
};
1641 auto Plan
= buildVPlan(SubRange
);
1642 VPlanTransforms::optimize(*Plan
);
1643 VPlans
.push_back(std::move(Plan
));
1648 VPlan
&LoopVectorizationPlanner::getPlanFor(ElementCount VF
) const {
1649 assert(count_if(VPlans
,
1650 [VF
](const VPlanPtr
&Plan
) { return Plan
->hasVF(VF
); }) ==
1652 "Multiple VPlans for VF.");
1654 for (const VPlanPtr
&Plan
: VPlans
) {
1655 if (Plan
->hasVF(VF
))
1658 llvm_unreachable("No plan found!");
1661 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1662 void LoopVectorizationPlanner::printPlans(raw_ostream
&O
) {
1663 if (VPlans
.empty()) {
1664 O
<< "LV: No VPlans built.\n";
1667 for (const auto &Plan
: VPlans
)
1668 if (PrintVPlansInDotFormat
)
1675 TargetTransformInfo::OperandValueInfo
1676 VPCostContext::getOperandInfo(VPValue
*V
) const {
1680 return TTI::getOperandInfo(V
->getLiveInIRValue());