//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements a set of utility VPlan to VPlan transformations.
///
//===----------------------------------------------------------------------===//

#include "VPlanTransforms.h"
#include "VPRecipeBuilder.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;

using namespace llvm::PatternMatch;

void VPlanTransforms::VPInstructionsToVPRecipes(
    VPlanPtr &Plan,
    function_ref<const InductionDescriptor *(PHINode *)>
        GetIntOrFpInductionDescriptor,
    ScalarEvolution &SE, const TargetLibraryInfo &TLI) {

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    VPRecipeBase *Term = VPBB->getTerminator();
    auto EndIter = Term ? Term->getIterator() : VPBB->end();
    // Introduce each ingredient into VPlan.
    for (VPRecipeBase &Ingredient :
         make_early_inc_range(make_range(VPBB->begin(), EndIter))) {

      VPValue *VPV = Ingredient.getVPSingleValue();
      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());

      VPRecipeBase *NewRecipe = nullptr;
      if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
        auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
        if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) {
          VPValue *Start = Plan->getVPValueOrAddLiveIn(II->getStartValue());
          VPValue *Step =
              vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
          NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II);
        } else {
          Plan->addVPValue(Phi, VPPhi);
          continue;
        }
      } else {
        assert(isa<VPInstruction>(&Ingredient) &&
               "only VPInstructions expected here");
        assert(!isa<PHINode>(Inst) && "phis should be handled above");
        // Create VPWidenMemoryInstructionRecipe for loads and stores.
        if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
          NewRecipe = new VPWidenMemoryInstructionRecipe(
              *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
              false /*Consecutive*/, false /*Reverse*/);
        } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
          NewRecipe = new VPWidenMemoryInstructionRecipe(
              *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
              nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/);
        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
        } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
          NewRecipe = new VPWidenCallRecipe(
              *CI, drop_end(Ingredient.operands()),
              getVectorIntrinsicIDForCall(CI, &TLI), CI->getDebugLoc());
        } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
          NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
        } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
          NewRecipe = new VPWidenCastRecipe(
              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
        } else {
          NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
        }
      }

      NewRecipe->insertBefore(&Ingredient);
      if (NewRecipe->getNumDefinedValues() == 1)
        VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
      else
        assert(NewRecipe->getNumDefinedValues() == 0 &&
               "Only recipes with zero or one defined values expected");
      Ingredient.eraseFromParent();
    }
  }
}

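/// Try to sink recipes that feed the blocks of replicate regions closer to
/// their users. A candidate whose users outside the target block only demand
/// its first lane is duplicated instead of moved. Returns true if any recipe
/// was sunk.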
static bool sinkScalarOperands(VPlan &Plan) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  bool Changed = false;
  // First, collect the operands of all recipes in replicate blocks as seeds
  // for sinking.
  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
    if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
      continue;
    VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
    if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
      continue;
    for (auto &Recipe : *VPBB) {
      for (VPValue *Op : Recipe.operands())
        if (auto *Def =
                dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
          WorkList.insert(std::make_pair(VPBB, Def));
    }
  }

  bool ScalarVFOnly = Plan.hasScalarVFOnly();
  // Try to sink each replicate or scalar IV steps recipe in the worklist.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPBasicBlock *SinkTo;
    VPSingleDefRecipe *SinkCandidate;
    std::tie(SinkTo, SinkCandidate) = WorkList[I];
    if (SinkCandidate->getParent() == SinkTo ||
        SinkCandidate->mayHaveSideEffects() ||
        SinkCandidate->mayReadOrWriteMemory())
      continue;
    if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
      if (!ScalarVFOnly && RepR->isUniform())
        continue;
    } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
      continue;

    bool NeedsDuplicating = false;
    // All recipe users of the sink candidate must be in the same block SinkTo
    // or all users outside of SinkTo must be uniform-after-vectorization
    // (i.e., only the first lane is used). In the latter case, we need to
    // duplicate SinkCandidate.
    auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
                            SinkCandidate](VPUser *U) {
      auto *UI = dyn_cast<VPRecipeBase>(U);
      if (!UI)
        return false;
      if (UI->getParent() == SinkTo)
        return true;
      NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
      // We only know how to duplicate VPReplicateRecipes for now.
      return NeedsDuplicating && isa<VPReplicateRecipe>(SinkCandidate);
    };
    if (!all_of(SinkCandidate->users(), CanSinkWithUser))
      continue;

    if (NeedsDuplicating) {
      if (ScalarVFOnly)
        continue;
      Instruction *I = cast<Instruction>(
          cast<VPReplicateRecipe>(SinkCandidate)->getUnderlyingValue());
      auto *Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true);
      // TODO: add ".cloned" suffix to name of Clone's VPValue.

      Clone->insertBefore(SinkCandidate);
      SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
      });
    }
    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
    for (VPValue *Op : SinkCandidate->operands())
      if (auto *Def =
              dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
        WorkList.insert(std::make_pair(SinkTo, Def));
    Changed = true;
  }
  return Changed;
}

/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
/// the mask.
VPValue *getPredicatedMask(VPRegionBlock *R) {
  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
  if (!EntryBB || EntryBB->size() != 1 ||
      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
    return nullptr;

  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
}

/// If \p R is a triangle region, return the 'then' block of the triangle.
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
  if (EntryBB->getNumSuccessors() != 2)
    return nullptr;

  auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
  auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
  if (!Succ0 || !Succ1)
    return nullptr;

  if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
    return nullptr;
  if (Succ0->getSingleSuccessor() == Succ1)
    return Succ0;
  if (Succ1->getSingleSuccessor() == Succ0)
    return Succ1;
  return nullptr;
}

// Merge replicate regions in their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  SetVector<VPRegionBlock *> DeletedRegions;

  // Collect replicate regions followed by an empty block, followed by another
  // replicate region with matching masks to process front. This is to avoid
  // iterator invalidation issues while merging regions.
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;

    assert(Mask1 && Mask2 && "both regions must have conditions");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  for (VPRegionBlock *Region1 : WorkList) {
    if (DeletedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        auto *UI = dyn_cast<VPRecipeBase>(&U);
        return UI && UI->getParent() == Then2;
      });

      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Finally, remove the first region.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    DeletedRegions.insert(Region1);
  }

  for (VPRegionBlock *ToDelete : DeletedRegions)
    delete ToDelete;
  return !DeletedRegions.empty();
}

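/// Create a new if-then replicate region for the predicated recipe
/// \p PredRecipe: a branch-on-mask entry block, a block holding an unmasked
/// clone of the recipe, and a continue block with an optional
/// VPPredInstPHIRecipe for its users.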
static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BlockInMask = PredRecipe->getMask();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region.
  auto *RecipeWithoutMask = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(),
      make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
      PredRecipe->isUniform());
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);

  VPPredInstPHIRecipe *PHIRecipe = nullptr;
  if (PredRecipe->getNumUsers() != 0) {
    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask);
    PredRecipe->replaceAllUsesWith(PHIRecipe);
    PHIRecipe->setOperand(0, RecipeWithoutMask);
  }
  PredRecipe->eraseFromParent();
  auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  return Region;
}

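/// Wrap each predicated VPReplicateRecipe in \p Plan in its own replicate
/// region, splitting the containing block around the recipe.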
static void addReplicateRegions(VPlan &Plan) {
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    SplitBlock->setName(
        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
    // Record predicated instructions for above packing optimizations.
    VPBlockBase *Region = createReplicateRegion(RepR, Plan);
    Region->setParent(CurrentBlock->getParent());
    VPBlockUtils::disconnectBlocks(CurrentBlock, SplitBlock);
    VPBlockUtils::connectBlocks(CurrentBlock, Region);
    VPBlockUtils::connectBlocks(Region, SplitBlock);
  }
}

void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
  // Convert masked VPReplicateRecipes to if-then region blocks.
  addReplicateRegions(Plan);

  bool ShouldSimplify = true;
  while (ShouldSimplify) {
    ShouldSimplify = sinkScalarOperands(Plan);
    ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
    ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(Plan);
  }
}

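/// Fold VPBasicBlocks into their single predecessor when that predecessor has
/// no other successors, updating region exits as needed. Returns true if any
/// block was removed.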
bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    auto *PredVPBB =
        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    if (PredVPBB && PredVPBB->getNumSuccessors() == 1)
      WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
    auto *ParentRegion = cast_or_null<VPRegionBlock>(VPBB->getParent());
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    for (auto *Succ : to_vector(VPBB->successors())) {
      VPBlockUtils::disconnectBlocks(VPBB, Succ);
      VPBlockUtils::connectBlocks(PredVPBB, Succ);
    }
    delete VPBB;
  }
  return !WorkList.empty();
}

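/// Remove the cast chains recorded for widened integer/FP inductions: the
/// widened induction already produces the casted value, so the last cast in
/// each chain is replaced by the induction recipe itself.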
void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) {
  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
    auto &Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *FindMyCast = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      VPSingleDefRecipe *FoundUserCast = nullptr;
      for (auto *U : FindMyCast->users()) {
        auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
        if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
          FoundUserCast = UserCast;
          break;
        }
      }
      FindMyCast = FoundUserCast;
    }
    FindMyCast->replaceAllUsesWith(IV);
  }
}

void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) {
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
  for (VPUser *U : CanonicalIV->users()) {
    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
    if (WidenNewIV)
      break;
  }

  if (!WidenNewIV)
    return;

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical() ||
        WidenOriginalIV->getScalarType() != WidenNewIV->getScalarType())
      continue;

    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
    // everything WidenNewIV's users need. That is, WidenOriginalIV will
    // generate a vector phi or all users of WidenNewIV demand the first lane
    // only.
    if (any_of(WidenOriginalIV->users(),
               [WidenOriginalIV](VPUser *U) {
                 return !U->usesScalars(WidenOriginalIV);
               }) ||
        vputils::onlyFirstLaneUsed(WidenNewIV)) {
      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
      WidenNewIV->eraseFromParent();
      return;
    }
  }
}

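/// Erase recipes whose results have no users, unless they have side effects;
/// conditional assumes are removed even though they have side effects, since
/// their conditions may be flattened.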
void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());

  for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
    // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      // A user keeps R alive:
      if (any_of(R.definedValues(),
                 [](VPValue *V) { return V->getNumUsers(); }))
        continue;

      // Having side effects keeps R alive, but do remove conditional assume
      // instructions as their conditions may be flattened.
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      bool IsConditionalAssume =
          RepR && RepR->isPredicated() &&
          match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
      if (R.mayHaveSideEffects() && !IsConditionalAssume)
        continue;

      R.eraseFromParent();
    }
  }
}

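/// Introduce recipes computing the scalar steps of the induction described by
/// \p ID in the header block: reuse the canonical IV directly when it matches
/// the induction, otherwise derive it via a VPDerivedIVRecipe, then emit a
/// VPScalarIVStepsRecipe.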
static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
                                    ScalarEvolution &SE, Instruction *TruncI,
                                    Type *IVTy, VPValue *StartV,
                                    VPValue *Step) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  auto IP = HeaderVPBB->getFirstNonPhi();
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  Type *TruncTy = TruncI ? TruncI->getType() : IVTy;
  VPValue *BaseIV = CanonicalIV;
  if (!CanonicalIV->isCanonical(ID.getKind(), StartV, Step, TruncTy)) {
    BaseIV = new VPDerivedIVRecipe(ID, StartV, CanonicalIV, Step,
                                   TruncI ? TruncI->getType() : nullptr);
    HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
  }

  VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
  HeaderVPBB->insert(Steps, IP);
  return Steps;
}

void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
  SmallVector<VPRecipeBase *> ToRemove;
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1));
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!WideIV)
      continue;
    if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
          return U->usesScalars(WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPValue *Steps = createScalarIVSteps(
        Plan, ID, SE, WideIV->getTruncInst(), WideIV->getPHINode()->getType(),
        WideIV->getStartValue(), WideIV->getStepValue());

    // Update scalar users of IV to use Step instead.
    if (!HasOnlyVectorVFs)
      WideIV->replaceAllUsesWith(Steps);
    else
      WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
        return U.usesScalars(WideIV);
      });
  }
}

void VPlanTransforms::removeRedundantExpandSCEVRecipes(VPlan &Plan) {
  DenseMap<const SCEV *, VPValue *> SCEV2VPV;

  for (VPRecipeBase &R :
       make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {
    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpR)
      continue;

    auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
    if (I.second)
      continue;
    ExpR->replaceAllUsesWith(I.first->second);
    ExpR->eraseFromParent();
  }
}

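/// Return true if the BranchOnCond terminator \p Term branches on the
/// negation of an active-lane-mask, so the branch can be simplified when the
/// trip count is known to not exceed VF * UF.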
static bool canSimplifyBranchOnCond(VPInstruction *Term) {
  VPInstruction *Not = dyn_cast<VPInstruction>(Term->getOperand(0));
  if (!Not || Not->getOpcode() != VPInstruction::Not)
    return false;

  VPInstruction *ALM = dyn_cast<VPInstruction>(Not->getOperand(0));
  return ALM && ALM->getOpcode() == VPInstruction::ActiveLaneMask;
}

void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                         unsigned BestUF,
                                         PredicatedScalarEvolution &PSE) {
  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
  VPBasicBlock *ExitingVPBB =
      Plan.getVectorLoopRegion()->getExitingBasicBlock();
  auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back());
  // Try to simplify the branch condition if TC <= VF * UF when preparing to
  // execute the plan for the main vector loop. We only do this if the
  // terminator is:
  //  1. BranchOnCount, or
  //  2. BranchOnCond where the input is Not(ActiveLaneMask).
  if (!Term || (Term->getOpcode() != VPInstruction::BranchOnCount &&
                (Term->getOpcode() != VPInstruction::BranchOnCond ||
                 !canSimplifyBranchOnCond(Term))))
    return;

  Type *IdxTy =
      Plan.getCanonicalIV()->getStartValue()->getLiveInIRValue()->getType();
  const SCEV *TripCount = createTripCountSCEV(IdxTy, PSE);
  ScalarEvolution &SE = *PSE.getSE();
  const SCEV *C =
      SE.getConstant(TripCount->getType(), BestVF.getKnownMinValue() * BestUF);
  if (TripCount->isZero() ||
      !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
    return;

  LLVMContext &Ctx = SE.getContext();
  auto *BOC = new VPInstruction(
      VPInstruction::BranchOnCond,
      {Plan.getVPValueOrAddLiveIn(ConstantInt::getTrue(Ctx))});
  Term->eraseFromParent();
  ExitingVPBB->appendRecipe(BOC);
  Plan.setVF(BestVF);
  Plan.setUF(BestUF);

  // TODO: Further simplifications are possible
  //      1. Replace inductions with constants.
  //      2. Replace vector loop region with VPBasicBlock.
}

#ifndef NDEBUG
static VPRegionBlock *GetReplicateRegion(VPRecipeBase *R) {
  auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
  if (Region && Region->isReplicator()) {
    assert(Region->getNumSuccessors() == 1 &&
           Region->getNumPredecessors() == 1 && "Expected SESE region!");
    assert(R->getParent()->size() == 1 &&
           "A recipe in an original replicator region must be the only "
           "recipe in its block");
    return Region;
  }
  return nullptr;
}
#endif

static bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B,
                              VPDominatorTree &VPDT) {
  if (A == B)
    return false;

  auto LocalComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) {
    for (auto &R : *A->getParent()) {
      if (&R == A)
        return true;
      if (&R == B)
        return false;
    }
    llvm_unreachable("recipe not found");
  };
  const VPBlockBase *ParentA = A->getParent();
  const VPBlockBase *ParentB = B->getParent();
  if (ParentA == ParentB)
    return LocalComesBefore(A, B);

  assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(A)) &&
         "No replicate regions expected at this point");
  assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(B)) &&
         "No replicate regions expected at this point");
  return VPDT.properlyDominates(ParentA, ParentB);
}

/// Sink users of \p FOR after the recipe defining the previous value \p
/// Previous of the recurrence. \returns true if all users of \p FOR could be
/// re-arranged as needed or false if it is not possible.
static bool
sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
                                 VPRecipeBase *Previous,
                                 VPDominatorTree &VPDT) {
  // Collect recipes that need sinking.
  SmallVector<VPRecipeBase *> WorkList;
  SmallPtrSet<VPRecipeBase *, 8> Seen;
  Seen.insert(Previous);
  auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
    // The previous value must not depend on the users of the recurrence phi;
    // in that case, FOR is not a fixed-order recurrence.
    if (SinkCandidate == Previous)
      return false;

    if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
        !Seen.insert(SinkCandidate).second ||
        properlyDominates(Previous, SinkCandidate, VPDT))
      return true;

    if (SinkCandidate->mayHaveSideEffects())
      return false;

    WorkList.push_back(SinkCandidate);
    return true;
  };

  // Recursively sink users of FOR after Previous.
  WorkList.push_back(FOR);
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPRecipeBase *Current = WorkList[I];
    assert(Current->getNumDefinedValues() == 1 &&
           "only recipes with a single defined value expected");

    for (VPUser *User : Current->getVPSingleValue()->users()) {
      if (auto *R = dyn_cast<VPRecipeBase>(User))
        if (!TryToPushSinkCandidate(R))
          return false;
    }
  }

  // Keep recipes to sink ordered by dominance so earlier instructions are
  // processed first.
  sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
    return properlyDominates(A, B, VPDT);
  });

  for (VPRecipeBase *SinkCandidate : WorkList) {
    if (SinkCandidate == FOR)
      continue;

    SinkCandidate->moveAfter(Previous);
    Previous = SinkCandidate;
  }
  return true;
}

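/// For each fixed-order recurrence phi, sink its users past the recipe
/// defining the previous value and introduce a FirstOrderRecurrenceSplice
/// combining the incoming and previous values. Returns false if the users
/// cannot be re-arranged.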
bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
                                                  VPBuilder &Builder) {
  VPDominatorTree VPDT;
  VPDT.recalculate(Plan);

  SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      RecurrencePhis.push_back(FOR);

  for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
    SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
    // Fixed-order recurrences do not contain cycles, so this loop is
    // guaranteed to terminate.
    while (auto *PrevPhi =
               dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
      assert(PrevPhi->getParent() == FOR->getParent());
      assert(SeenPhis.insert(PrevPhi).second);
      Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
    }

    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT))
      return false;

    // Introduce a recipe to combine the incoming and previous values of a
    // fixed-order recurrence.
    VPBasicBlock *InsertBlock = Previous->getParent();
    if (isa<VPHeaderPHIRecipe>(Previous))
      Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
    else
      Builder.setInsertPoint(InsertBlock, std::next(Previous->getIterator()));

    auto *RecurSplice = cast<VPInstruction>(
        Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
                             {FOR, FOR->getBackedgeValue()}));

    FOR->replaceAllUsesWith(RecurSplice);
    // Set the first operand of RecurSplice to FOR again, after replacing
    // all users.
    RecurSplice->setOperand(0, FOR);
  }
  return true;
}

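/// Drop poison-generating (wrap) flags from all recipes in the def-use chains
/// of integer add/mul reduction phis.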
void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!PhiR)
      continue;
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
    RecurKind RK = RdxDesc.getRecurrenceKind();
    if (RK != RecurKind::Add && RK != RecurKind::Mul)
      continue;

    SmallSetVector<VPValue *, 8> Worklist;
    Worklist.insert(PhiR);

    for (unsigned I = 0; I != Worklist.size(); ++I) {
      VPValue *Cur = Worklist[I];
      if (auto *RecWithFlags =
              dyn_cast<VPRecipeWithIRFlags>(Cur->getDefiningRecipe())) {
        RecWithFlags->dropPoisonGeneratingFlags();
      }

      for (VPUser *U : Cur->users()) {
        auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
        if (!UserRecipe)
          continue;
        for (VPValue *V : UserRecipe->definedValues())
          Worklist.insert(V);
      }
    }
  }
}

/// Returns true if \p V is constant one.
static bool isConstantOne(VPValue *V) {
  if (!V->isLiveIn())
    return false;
  auto *C = dyn_cast<ConstantInt>(V->getLiveInIRValue());
  return C && C->isOne();
}

/// Returns the llvm::Instruction opcode for \p R.
static unsigned getOpcodeForRecipe(VPRecipeBase &R) {
  if (auto *WidenR = dyn_cast<VPWidenRecipe>(&R))
    return WidenR->getUnderlyingInstr()->getOpcode();
  if (auto *WidenC = dyn_cast<VPWidenCastRecipe>(&R))
    return WidenC->getOpcode();
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R))
    return RepR->getUnderlyingInstr()->getOpcode();
  if (auto *VPI = dyn_cast<VPInstruction>(&R))
    return VPI->getOpcode();
  return 0;
}

/// Try to simplify recipe \p R.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
  switch (getOpcodeForRecipe(R)) {
  case Instruction::Mul: {
    VPValue *A = R.getOperand(0);
    VPValue *B = R.getOperand(1);
    if (isConstantOne(A))
      return R.getVPSingleValue()->replaceAllUsesWith(B);
    if (isConstantOne(B))
      return R.getVPSingleValue()->replaceAllUsesWith(A);
    break;
  }
  case Instruction::Trunc: {
    VPRecipeBase *Ext = R.getOperand(0)->getDefiningRecipe();
    if (!Ext)
      break;
    unsigned ExtOpcode = getOpcodeForRecipe(*Ext);
    if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
      break;
    VPValue *A = Ext->getOperand(0);
    VPValue *Trunc = R.getVPSingleValue();
    Type *TruncTy = TypeInfo.inferScalarType(Trunc);
    Type *ATy = TypeInfo.inferScalarType(A);
    if (TruncTy == ATy) {
      Trunc->replaceAllUsesWith(A);
    } else {
      // Don't replace a scalarizing recipe with a widened cast.
      if (isa<VPReplicateRecipe>(&R))
        break;
      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
        auto *VPC =
            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
        VPC->insertBefore(&R);
        Trunc->replaceAllUsesWith(VPC);
      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
        VPC->insertBefore(&R);
        Trunc->replaceAllUsesWith(VPC);
      }
    }
#ifndef NDEBUG
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
    VPTypeAnalysis TypeInfo2(TypeInfo.getContext());
    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
    for (VPUser *U : A->users()) {
      auto *R = dyn_cast<VPRecipeBase>(U);
      if (!R)
        continue;
      for (VPValue *VPV : R->definedValues())
        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
    }
#endif
    break;
  }
  default:
    break;
  }
}

/// Try to simplify the recipes in \p Plan.
static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  VPTypeAnalysis TypeInfo(Ctx);
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      simplifyRecipe(R, TypeInfo);
    }
  }
}

/// Narrow the recipes listed in \p MinBWs to their minimal bit width: operands
/// are truncated as needed and results are zero-extended back to their
/// original type; redundant casts are left for later recipe simplification to
/// clean up.
void VPlanTransforms::truncateToMinimalBitwidths(
    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs,
    LLVMContext &Ctx) {
#ifndef NDEBUG
  // Count the processed recipes and cross check the count later with MinBWs
  // size, to make sure all entries in MinBWs have been handled.
  unsigned NumProcessedRecipes = 0;
#endif
  // Keep track of created truncates, so they can be re-used. Note that we
  // cannot use RAUW after creating a new truncate, as this could make other
  // uses have different types for their operands, making them invalidly
  // typed.
  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
  VPTypeAnalysis TypeInfo(Ctx);
  VPBasicBlock *PH = Plan.getEntry();
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
               VPWidenSelectRecipe, VPWidenMemoryInstructionRecipe>(&R))
        continue;
      if (isa<VPWidenMemoryInstructionRecipe>(&R) &&
          cast<VPWidenMemoryInstructionRecipe>(&R)->isStore())
        continue;

      VPValue *ResultVPV = R.getVPSingleValue();
      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
      unsigned NewResSizeInBits = MinBWs.lookup(UI);
      if (!NewResSizeInBits)
        continue;

#ifndef NDEBUG
      NumProcessedRecipes++;
#endif
      // If the value wasn't vectorized, we must maintain the original scalar
      // type. Skip those here, after incrementing NumProcessedRecipes. Also
      // skip casts which do not need to be handled explicitly here, as
      // redundant casts will be removed during recipe simplification.
      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R)) {
#ifndef NDEBUG
        // If any of the operands is a live-in and not used by VPWidenRecipe or
        // VPWidenSelectRecipe, but is in MinBWs, make sure it is counted as
        // processed as well. As MinBWs is currently constructed, there is no
        // information about whether recipes are widened or replicated, and in
        // case they are replicated the operands are not truncated. Counting
        // them here ensures we do not miss any recipes in MinBWs.
        // TODO: Remove once the analysis is done on VPlan.
        for (VPValue *Op : R.operands()) {
          if (!Op->isLiveIn())
            continue;
          auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue());
          if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) &&
              all_of(Op->users(), [](VPUser *U) {
                return !isa<VPWidenRecipe, VPWidenSelectRecipe>(U);
              })) {
            // Add an entry to ProcessedTruncs to avoid counting the same
            // operand multiple times.
            ProcessedTruncs[Op] = nullptr;
            NumProcessedRecipes += 1;
          }
        }
#endif
        continue;
      }

      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
      assert(OldResTy->isIntegerTy() && "only integer types supported");
      if (OldResSizeInBits == NewResSizeInBits)
        continue;
      assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
      (void)OldResSizeInBits;

      auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);

      // Any wrapping introduced by shrinking this operation shouldn't be
      // considered undefined behavior. So, we can't unconditionally copy
      // arithmetic wrapping flags to VPW.
      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
        VPW->dropPoisonGeneratingFlags();

      // Extend result to original width.
      auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
      Ext->insertAfter(&R);
      ResultVPV->replaceAllUsesWith(Ext);
      Ext->setOperand(0, ResultVPV);

      if (isa<VPWidenMemoryInstructionRecipe>(&R)) {
        assert(!cast<VPWidenMemoryInstructionRecipe>(&R)->isStore() &&
               "stores cannot be narrowed");
        continue;
      }

      // Shrink operands by introducing truncates as needed.
      unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
        auto *Op = R.getOperand(Idx);
        unsigned OpSizeInBits =
            TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
        if (OpSizeInBits == NewResSizeInBits)
          continue;
        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
        auto [ProcessedIter, IterIsEmpty] =
            ProcessedTruncs.insert({Op, nullptr});
        VPWidenCastRecipe *NewOp =
            IterIsEmpty
                ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
                : ProcessedIter->second;
        R.setOperand(Idx, NewOp);
        if (!IterIsEmpty)
          continue;
        ProcessedIter->second = NewOp;
        if (!Op->isLiveIn()) {
          NewOp->insertBefore(&R);
        } else {
          PH->appendRecipe(NewOp);
#ifndef NDEBUG
          auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue());
          bool IsContained = MinBWs.contains(OpInst);
          NumProcessedRecipes += IsContained;
#endif
        }
      }
    }
  }

  assert(MinBWs.size() == NumProcessedRecipes &&
         "some entries in MinBWs haven't been processed");
}

void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
  removeRedundantCanonicalIVs(Plan);
  removeRedundantInductionCasts(Plan);

  optimizeInductions(Plan, SE);
  simplifyRecipes(Plan, SE.getContext());
  removeDeadRecipes(Plan);

  createAndOptimizeReplicateRegions(Plan);

  removeRedundantExpandSCEVRecipes(Plan);
  mergeBlocksIntoPredecessors(Plan);
}

// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
// the loop terminator with a branch-on-cond recipe with the negated
// active-lane-mask as operand. Note that this turns the loop into an
// uncountable one. Only the existing terminator is replaced, all other
// existing recipes/users remain unchanged, except for poison-generating flags
// being dropped from the canonical IV increment. Return the created
// VPActiveLaneMaskPHIRecipe.
//
// The function uses the following definitions:
//
//  %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
//    calculate-trip-count-minus-VF (original TC) : original TC
//  %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
//     CanonicalIVPhi : CanonicalIVIncrement
//  %StartV is the canonical induction start value.
//
// The function adds the following recipes:
//
// vector.ph:
//   ...
//   %TripCount = calculate-trip-count-minus-VF (original TC)
//       [if DataWithControlFlowWithoutRuntimeCheck]
//   %EntryInc = canonical-iv-increment-for-part %StartV
//   %EntryALM = active-lane-mask %EntryInc, %TripCount
//
// vector.body:
//   ...
//   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
//   ...
//   %InLoopInc = canonical-iv-increment-for-part %IncrementValue
//   %ALM = active-lane-mask %InLoopInc, TripCount
//   %Negated = Not %ALM
//   branch-on-cond %Negated
//
static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
  auto *CanonicalIVPHI = Plan.getCanonicalIV();
  VPValue *StartV = CanonicalIVPHI->getStartValue();

  auto *CanonicalIVIncrement =
      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
  // TODO: Check if dropping the flags is needed if
  // !DataAndControlFlowWithoutRuntimeCheck.
  CanonicalIVIncrement->dropPoisonGeneratingFlags();
  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
  // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
  // we have to take unrolling into account. Each part needs to start at
  //   Part * VF
  auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
  VPBuilder Builder(VecPreheader);

  // Create the ActiveLaneMask instruction using the correct start values.
  VPValue *TC = Plan.getTripCount();

  VPValue *TripCount, *IncrementValue;
  if (!DataAndControlFlowWithoutRuntimeCheck) {
    // When the loop is guarded by a runtime overflow check for the loop
    // induction variable increment by VF, we can increment the value before
    // the get.active.lane mask and use the unmodified tripcount.
    IncrementValue = CanonicalIVIncrement;
    TripCount = TC;
  } else {
    // When avoiding a runtime check, the active.lane.mask inside the loop
    // uses a modified trip count and the induction variable increment is
    // done after the active.lane.mask intrinsic is called.
    IncrementValue = CanonicalIVPHI;
    TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
                                     {TC}, DL);
  }
  auto *EntryIncrement = Builder.createOverflowingOp(
      VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
      "index.part.next");

  // Create the active lane mask instruction in the VPlan preheader.
  auto *EntryALM =
      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
                           DL, "active.lane.mask.entry");

  // Now create the ActiveLaneMaskPhi recipe in the main loop using the
  // preheader ActiveLaneMask instruction.
  auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
  LaneMaskPhi->insertAfter(CanonicalIVPHI);

  // Create the active lane mask for the next iteration of the loop before the
  // original terminator.
  VPRecipeBase *OriginalTerminator = EB->getTerminator();
  Builder.setInsertPoint(OriginalTerminator);
  auto *InLoopIncrement =
      Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
                                  {IncrementValue}, {false, false}, DL);
  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
                                   {InLoopIncrement, TripCount}, DL,
                                   "active.lane.mask.next");
  LaneMaskPhi->addOperand(ALM);

  // Replace the original terminator with BranchOnCond. We have to invert the
  // mask here because a true condition means jumping to the exit block.
  auto *NotMask = Builder.createNot(ALM, DL);
  Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
  OriginalTerminator->eraseFromParent();
  return LaneMaskPhi;
}

/// Replace (ICMP_ULE, WideCanonicalIV, backedge-taken-count) compares with an
/// active-lane-mask; when \p UseActiveLaneMaskForControlFlow is set, the loop
/// exit branch is also rewritten to be controlled by the lane mask.
void VPlanTransforms::addActiveLaneMask(
    VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
    bool DataAndControlFlowWithoutRuntimeCheck) {
  assert((!DataAndControlFlowWithoutRuntimeCheck ||
          UseActiveLaneMaskForControlFlow) &&
         "DataAndControlFlowWithoutRuntimeCheck implies "
         "UseActiveLaneMaskForControlFlow");

  auto FoundWidenCanonicalIVUser =
      find_if(Plan.getCanonicalIV()->users(),
              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
  assert(FoundWidenCanonicalIVUser &&
         "Must have widened canonical IV when tail folding!");
  auto *WideCanonicalIV =
      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
  VPSingleDefRecipe *LaneMask;
  if (UseActiveLaneMaskForControlFlow) {
    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
        Plan, DataAndControlFlowWithoutRuntimeCheck);
  } else {
    LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask,
                                 {WideCanonicalIV, Plan.getTripCount()},
                                 nullptr, "active.lane.mask");
    LaneMask->insertAfter(WideCanonicalIV);
  }

  // Walk users of WideCanonicalIV and replace all compares of the form
  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
  // active-lane-mask.
  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
  for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
    auto *CompareToReplace = dyn_cast<VPInstruction>(U);
    if (!CompareToReplace ||
        CompareToReplace->getOpcode() != Instruction::ICmp ||
        CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
        CompareToReplace->getOperand(1) != BTC)
      continue;

    assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
           "WidenCanonicalIV must be the first operand of the compare");
    CompareToReplace->replaceAllUsesWith(LaneMask);
    CompareToReplace->eraseFromParent();
  }
}