//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements a set of utility VPlan to VPlan transformations.
///
//===----------------------------------------------------------------------===//

#include "VPlanTransforms.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;

void VPlanTransforms::VPInstructionsToVPRecipes(
    VPlanPtr &Plan,
    function_ref<const InductionDescriptor *(PHINode *)>
        GetIntOrFpInductionDescriptor,
    ScalarEvolution &SE, const TargetLibraryInfo &TLI) {

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan->getVectorLoopRegion());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    // Skip blocks outside region
    if (!VPBB->getParent())
      break;
    VPRecipeBase *Term = VPBB->getTerminator();
    auto EndIter = Term ? Term->getIterator() : VPBB->end();
    // Introduce each ingredient into VPlan.
    for (VPRecipeBase &Ingredient :
         make_early_inc_range(make_range(VPBB->begin(), EndIter))) {

      VPValue *VPV = Ingredient.getVPSingleValue();
      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());

      VPRecipeBase *NewRecipe = nullptr;
      if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
        auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
        const auto *II = GetIntOrFpInductionDescriptor(Phi);
        if (!II)
          continue;

        VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
        VPValue *Step =
            vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
        NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II);
      } else {
        assert(isa<VPInstruction>(&Ingredient) &&
               "only VPInstructions expected here");
        assert(!isa<PHINode>(Inst) && "phis should be handled above");
        // Create VPWidenMemoryRecipe for loads and stores.
        if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
          NewRecipe = new VPWidenLoadRecipe(
              *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
              false /*Consecutive*/, false /*Reverse*/,
              Ingredient.getDebugLoc());
        } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
          NewRecipe = new VPWidenStoreRecipe(
              *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
              nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
              Ingredient.getDebugLoc());
        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
        } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
          NewRecipe = new VPWidenIntrinsicRecipe(
              *CI, getVectorIntrinsicIDForCall(CI, &TLI),
              {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
              CI->getDebugLoc());
        } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
          NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
        } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
          NewRecipe = new VPWidenCastRecipe(
              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
        } else {
          NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
        }
      }

      NewRecipe->insertBefore(&Ingredient);
      if (NewRecipe->getNumDefinedValues() == 1)
        VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
      else
        assert(NewRecipe->getNumDefinedValues() == 0 &&
               "Only recipes with zero or one defined values expected");
      Ingredient.eraseFromParent();
    }
  }
}

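/// Sink recipes that feed only a replicate region's conditional ("then")
/// block into that block, so their scalar values are computed only when the
/// region's mask is true. If a recipe also has users outside the region that
/// only demand its first lane, a clone is kept outside. A rough sketch
/// (recipe and block names are illustrative):
///
///   vector.body:
///     REPLICATE %gep = getelementptr %base, %iv   ; only used by the store
///     ...
///   pred.store.if:
///     REPLICATE store %val, %gep    ; %gep is sunk here, next to its user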
static bool sinkScalarOperands(VPlan &Plan) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  bool Changed = false;
  // First, collect the operands of all recipes in replicate blocks as seeds
  // for sinking.
  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
    if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
      continue;
    VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
    if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
      continue;
    for (auto &Recipe : *VPBB) {
      for (VPValue *Op : Recipe.operands())
        if (auto *Def =
                dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
          WorkList.insert(std::make_pair(VPBB, Def));
    }
  }

  bool ScalarVFOnly = Plan.hasScalarVFOnly();
  // Try to sink each replicate or scalar IV steps recipe in the worklist.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPBasicBlock *SinkTo;
    VPSingleDefRecipe *SinkCandidate;
    std::tie(SinkTo, SinkCandidate) = WorkList[I];
    if (SinkCandidate->getParent() == SinkTo ||
        SinkCandidate->mayHaveSideEffects() ||
        SinkCandidate->mayReadOrWriteMemory())
      continue;
    if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
      if (!ScalarVFOnly && RepR->isUniform())
        continue;
    } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
      continue;

    bool NeedsDuplicating = false;
    // All recipe users of the sink candidate must be in the same block SinkTo
    // or all users outside of SinkTo must be uniform-after-vectorization
    // (i.e., only the first lane is used). In the latter case, we need to
    // duplicate SinkCandidate.
    auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
                            SinkCandidate](VPUser *U) {
      auto *UI = cast<VPRecipeBase>(U);
      if (UI->getParent() == SinkTo)
        return true;
      NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
      // We only know how to duplicate VPReplicateRecipes for now.
      return NeedsDuplicating && isa<VPReplicateRecipe>(SinkCandidate);
    };
    if (!all_of(SinkCandidate->users(), CanSinkWithUser))
      continue;

    if (NeedsDuplicating) {
      if (ScalarVFOnly)
        continue;
      Instruction *I = SinkCandidate->getUnderlyingInstr();
      auto *Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true);
      // TODO: add ".cloned" suffix to name of Clone's VPValue.

      Clone->insertBefore(SinkCandidate);
      SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
      });
    }
    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
    for (VPValue *Op : SinkCandidate->operands())
      if (auto *Def =
              dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
        WorkList.insert(std::make_pair(SinkTo, Def));
    Changed = true;
  }
  return Changed;
}

/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
/// the mask.
VPValue *getPredicatedMask(VPRegionBlock *R) {
  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
  if (!EntryBB || EntryBB->size() != 1 ||
      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
    return nullptr;

  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
}

/// If \p R is a triangle region, return the 'then' block of the triangle.
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
  if (EntryBB->getNumSuccessors() != 2)
    return nullptr;

  auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
  auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
  if (!Succ0 || !Succ1)
    return nullptr;

  if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
    return nullptr;
  if (Succ0->getSingleSuccessor() == Succ1)
    return Succ0;
  if (Succ1->getSingleSuccessor() == Succ0)
    return Succ1;
  return nullptr;
}

// Merge replicate regions in their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
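//
// For example (sketch, %m is the shared mask):
//
//   [pred.load region, mask %m] -> (empty VPBB) -> [pred.store region, mask %m]
//
// becomes a single replicate region guarded by %m, with the first region's
// recipes moved into the second region's "then" block.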
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  SetVector<VPRegionBlock *> DeletedRegions;

  // Collect replicate regions followed by an empty block, followed by another
  // replicate region with matching masks to process front. This is to avoid
  // iterator invalidation issues while merging regions.
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;

    assert(Mask1 && Mask2 && "both regions must have conditions");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  for (VPRegionBlock *Region1 : WorkList) {
    if (DeletedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() == Then2;
      });

      // Remove phi recipes that are unused after merging the regions.
      if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
        Phi1ToMove.eraseFromParent();
        continue;
      }
      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Finally, remove the first region.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    DeletedRegions.insert(Region1);
  }

  for (VPRegionBlock *ToDelete : DeletedRegions)
    delete ToDelete;
  return !DeletedRegions.empty();
}

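/// Create a replicate region for \p PredRecipe, a predicated replicate
/// recipe, guarded by its mask. The resulting triangle looks roughly like
/// (block names are derived from the instruction's opcode):
///
///        pred.<opcode>.entry        ; BRANCH-ON-MASK
///         /               \
///   pred.<opcode>.if       |        ; unmasked REPLICATE recipe
///         \               /
///        pred.<opcode>.continue     ; optional VPPredInstPHIRecipe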
static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BlockInMask = PredRecipe->getMask();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region.
  auto *RecipeWithoutMask = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(),
      make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
      PredRecipe->isUniform());
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);

  VPPredInstPHIRecipe *PHIRecipe = nullptr;
  if (PredRecipe->getNumUsers() != 0) {
    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask);
    PredRecipe->replaceAllUsesWith(PHIRecipe);
    PHIRecipe->setOperand(0, RecipeWithoutMask);
  }
  PredRecipe->eraseFromParent();
  auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  return Region;
}

static void addReplicateRegions(VPlan &Plan) {
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    SplitBlock->setName(
        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
    // Record predicated instructions for above packing optimizations.
    VPBlockBase *Region = createReplicateRegion(RepR, Plan);
    Region->setParent(CurrentBlock->getParent());
    VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);
  }
}

/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor.
static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    // Don't fold the blocks in the skeleton of the Plan into their single
    // predecessors for now.
    // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
    if (!VPBB->getParent())
      continue;
    auto *PredVPBB =
        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    if (!PredVPBB || PredVPBB->getNumSuccessors() != 1)
      continue;
    WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
    auto *ParentRegion = cast_or_null<VPRegionBlock>(VPBB->getParent());
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    for (auto *Succ : to_vector(VPBB->successors())) {
      VPBlockUtils::disconnectBlocks(VPBB, Succ);
      VPBlockUtils::connectBlocks(PredVPBB, Succ);
    }
  }
  return !WorkList.empty();
}

void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
  // Convert masked VPReplicateRecipes to if-then region blocks.
  addReplicateRegions(Plan);

  bool ShouldSimplify = true;
  while (ShouldSimplify) {
    ShouldSimplify = sinkScalarOperands(Plan);
    ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
    ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
  }
}

/// Remove redundant casts of inductions.
///
/// Such redundant casts are casts of induction variables that can be ignored,
/// because we already proved that the casted phi is equal to the uncasted phi
/// in the vectorized loop. There is no need to vectorize the cast - the same
/// value can be used for both the phi and casts in the vector loop.
static void removeRedundantInductionCasts(VPlan &Plan) {
  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
    auto &Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *FindMyCast = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      VPSingleDefRecipe *FoundUserCast = nullptr;
      for (auto *U : FindMyCast->users()) {
        auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
        if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
          FoundUserCast = UserCast;
          break;
        }
      }
      FindMyCast = FoundUserCast;
    }
    FindMyCast->replaceAllUsesWith(IV);
  }
}

/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
/// recipe, if it exists.
static void removeRedundantCanonicalIVs(VPlan &Plan) {
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
  for (VPUser *U : CanonicalIV->users()) {
    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
    if (WidenNewIV)
      break;
  }

  if (!WidenNewIV)
    return;

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
      continue;

    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
    // everything WidenNewIV's users need. That is, WidenOriginalIV will
    // generate a vector phi or all users of WidenNewIV demand the first lane
    // only.
    if (any_of(WidenOriginalIV->users(),
               [WidenOriginalIV](VPUser *U) {
                 return !U->usesScalars(WidenOriginalIV);
               }) ||
        vputils::onlyFirstLaneUsed(WidenNewIV)) {
      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
      WidenNewIV->eraseFromParent();
      return;
    }
  }
}

/// Returns true if \p R is dead and can be removed.
static bool isDeadRecipe(VPRecipeBase &R) {
  using namespace llvm::PatternMatch;
  // Do remove conditional assume instructions as their conditions may be
  // flattened.
  auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
  bool IsConditionalAssume =
      RepR && RepR->isPredicated() &&
      match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
  if (IsConditionalAssume)
    return true;

  if (R.mayHaveSideEffects())
    return false;

  // Recipe is dead if no user keeps the recipe alive.
  return all_of(R.definedValues(),
                [](VPValue *V) { return V->getNumUsers() == 0; });
}

void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());

  for (VPBasicBlock *VPBB :
       reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
    // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (isDeadRecipe(R))
        R.eraseFromParent();
    }
  }
}

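/// Create a scalar-steps recipe for the induction described by \p Kind,
/// \p FPBinOp, \p StartV and \p Step, based on the canonical IV. A derived IV
/// and truncates of the IV and step are introduced as needed; e.g. for an i32
/// induction derived from the i64 canonical IV, the emitted recipes are
/// roughly (illustrative dump):
///
///   vp<%1> = DERIVED-IV ir<%start> + vp<%canonical-iv> * ir<%step>
///   vp<%2> = SCALAR-CAST trunc vp<%1> to i32
///   vp<%3> = SCALAR-STEPS vp<%2>, ir<%step>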
static VPScalarIVStepsRecipe *
createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
                    Instruction::BinaryOps InductionOpcode,
                    FPMathOperator *FPBinOp, Instruction *TruncI,
                    VPValue *StartV, VPValue *Step, VPBuilder &Builder) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPSingleDefRecipe *BaseIV = CanonicalIV;
  if (!CanonicalIV->isCanonical(Kind, StartV, Step)) {
    BaseIV = Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step);
  }

  // Truncate base induction if needed.
  Type *CanonicalIVType = CanonicalIV->getScalarType();
  VPTypeAnalysis TypeInfo(CanonicalIVType);
  Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
  if (TruncI) {
    Type *TruncTy = TruncI->getType();
    assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
    BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy);
    ResultTy = TruncTy;
  }

  // Truncate step if needed.
  Type *StepTy = TypeInfo.inferScalarType(Step);
  if (ResultTy != StepTy) {
    assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    VPBasicBlock *VecPreheader =
        cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(VecPreheader);
    Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy);
  }
  return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step);
}

/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
/// VPWidenPointerInductionRecipe will generate vectors only. If some users
/// require vectors while others require scalars, the scalar uses need to
/// extract the scalars from the generated vectors (note that this is
/// different to how int/fp inductions are handled). Also optimize
/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
/// providing them scalar steps built on the canonical scalar IV and updating
/// the original IV's users. This is an optional optimization to reduce the
/// need for vector extracts.
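///
/// For a pointer induction with only scalar uses, the rewrite is roughly:
///
///   WIDEN-POINTER-INDUCTION %ptr.iv = %start, %step
/// =>
///   %steps    = SCALAR-STEPS ir<0>, %step
///   %next.gep = ptradd %start, %steps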
static void legalizeAndOptimizeInductions(VPlan &Plan) {
  SmallVector<VPRecipeBase *> ToRemove;
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1));
  VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    // Replace wide pointer inductions which have only their scalars used by
    // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
    if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
      if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
        continue;

      const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
      VPValue *StartV =
          Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
      VPValue *StepV = PtrIV->getOperand(1);
      VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
          Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
          nullptr, StartV, StepV, Builder);

      VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
                                             PtrIV->getDebugLoc(), "next.gep");

      PtrIV->replaceAllUsesWith(PtrAdd);
      continue;
    }

    // Replace widened induction with scalar steps for users that only use
    // scalars.
    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!WideIV)
      continue;
    if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
          return U->usesScalars(WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
        Plan, ID.getKind(), ID.getInductionOpcode(),
        dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
        Builder);

    // Update scalar users of IV to use Step instead.
    if (!HasOnlyVectorVFs)
      WideIV->replaceAllUsesWith(Steps);
    else
      WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
        return U.usesScalars(WideIV);
      });
  }
}

/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
/// them with already existing recipes expanding the same SCEV expression.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
  DenseMap<const SCEV *, VPValue *> SCEV2VPV;

  for (VPRecipeBase &R :
       make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {
    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpR)
      continue;

    auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
    if (I.second)
      continue;
    ExpR->replaceAllUsesWith(I.first->second);
    ExpR->eraseFromParent();
  }
}

static void recursivelyDeleteDeadRecipes(VPValue *V) {
  SmallVector<VPValue *> WorkList;
  SmallPtrSet<VPValue *, 8> Seen;
  WorkList.push_back(V);

  while (!WorkList.empty()) {
    VPValue *Cur = WorkList.pop_back_val();
    if (!Seen.insert(Cur).second)
      continue;
    VPRecipeBase *R = Cur->getDefiningRecipe();
    if (!R)
      continue;
    if (!isDeadRecipe(*R))
      continue;
    WorkList.append(R->op_begin(), R->op_end());
    R->eraseFromParent();
  }
}

void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                         unsigned BestUF,
                                         PredicatedScalarEvolution &PSE) {
  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
  VPBasicBlock *ExitingVPBB =
      Plan.getVectorLoopRegion()->getExitingBasicBlock();
  auto *Term = &ExitingVPBB->back();
  // Try to simplify the branch condition if TC <= VF * UF when preparing to
  // execute the plan for the main vector loop. We only do this if the
  // terminator is:
  //  1. BranchOnCount, or
  //  2. BranchOnCond where the input is Not(ActiveLaneMask).
  using namespace llvm::VPlanPatternMatch;
  if (!match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) &&
      !match(Term,
             m_BranchOnCond(m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue())))))
    return;

  ScalarEvolution &SE = *PSE.getSE();
  const SCEV *TripCount =
      vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
  assert(!isa<SCEVCouldNotCompute>(TripCount) &&
         "Trip count SCEV must be computable");
  ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
  const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
  if (TripCount->isZero() ||
      !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
    return;

  LLVMContext &Ctx = SE.getContext();
  auto *BOC = new VPInstruction(
      VPInstruction::BranchOnCond,
      {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc());

  SmallVector<VPValue *> PossiblyDead(Term->operands());
  Term->eraseFromParent();
  for (VPValue *Op : PossiblyDead)
    recursivelyDeleteDeadRecipes(Op);
  ExitingVPBB->appendRecipe(BOC);

  // TODO: Further simplifications are possible
  //      1. Replace inductions with constants.
  //      2. Replace vector loop region with VPBasicBlock.
}

/// Sink users of \p FOR after the recipe defining the previous value \p
/// Previous of the recurrence. \returns true if all users of \p FOR could be
/// re-arranged as needed or false if it is not possible.
static bool
sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
                                 VPRecipeBase *Previous,
                                 VPDominatorTree &VPDT) {
  // Collect recipes that need sinking.
  SmallVector<VPRecipeBase *> WorkList;
  SmallPtrSet<VPRecipeBase *, 8> Seen;
  Seen.insert(Previous);
  auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
    // The previous value must not depend on the users of the recurrence phi.
    // In that case, FOR is not a fixed order recurrence.
    if (SinkCandidate == Previous)
      return false;

    if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
        !Seen.insert(SinkCandidate).second ||
        VPDT.properlyDominates(Previous, SinkCandidate))
      return true;

    if (SinkCandidate->mayHaveSideEffects())
      return false;

    WorkList.push_back(SinkCandidate);
    return true;
  };

  // Recursively sink users of FOR after Previous.
  WorkList.push_back(FOR);
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPRecipeBase *Current = WorkList[I];
    assert(Current->getNumDefinedValues() == 1 &&
           "only recipes with a single defined value expected");

    for (VPUser *User : Current->getVPSingleValue()->users()) {
      if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
        return false;
    }
  }

  // Keep recipes to sink ordered by dominance so earlier instructions are
  // processed first.
  sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
    return VPDT.properlyDominates(A, B);
  });

  for (VPRecipeBase *SinkCandidate : WorkList) {
    if (SinkCandidate == FOR)
      continue;

    SinkCandidate->moveAfter(Previous);
    Previous = SinkCandidate;
  }
  return true;
}

/// Try to hoist \p Previous and its operands before all users of \p FOR.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
                                        VPRecipeBase *Previous,
                                        VPDominatorTree &VPDT) {
  if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
    return false;

  // Collect recipes that need hoisting.
  SmallVector<VPRecipeBase *> HoistCandidates;
  SmallPtrSet<VPRecipeBase *, 8> Visited;
  VPRecipeBase *HoistPoint = nullptr;
  // Find the closest hoist point by looking at all users of FOR and selecting
  // the recipe dominating all other users.
  for (VPUser *U : FOR->users()) {
    auto *R = cast<VPRecipeBase>(U);
    if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
      HoistPoint = R;
  }
  assert(all_of(FOR->users(),
                [&VPDT, HoistPoint](VPUser *U) {
                  auto *R = cast<VPRecipeBase>(U);
                  return HoistPoint == R ||
                         VPDT.properlyDominates(HoistPoint, R);
                }) &&
         "HoistPoint must dominate all users of FOR");

  auto NeedsHoisting = [HoistPoint, &VPDT,
                        &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
    VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
    if (!HoistCandidate)
      return nullptr;
    VPRegionBlock *EnclosingLoopRegion =
        HoistCandidate->getParent()->getEnclosingLoopRegion();
    assert((!HoistCandidate->getParent()->getParent() ||
            HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
           "CFG in VPlan should still be flat, without replicate regions");
    // Hoist candidate was already visited, no need to hoist.
    if (!Visited.insert(HoistCandidate).second)
      return nullptr;

    // Candidate is outside loop region or a header phi, dominates FOR users
    // w/o hoisting.
    if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
      return nullptr;

    // If we reached a recipe that dominates HoistPoint, we don't need to
    // hoist the recipe.
    if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
      return nullptr;
    return HoistCandidate;
  };
  auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
    // Avoid hoisting candidates with side-effects, as we do not yet analyze
    // associated dependencies.
    return !HoistCandidate->mayHaveSideEffects();
  };

  if (!NeedsHoisting(Previous->getVPSingleValue()))
    return true;

  // Recursively try to hoist Previous and its operands before all users of
  // FOR.
  HoistCandidates.push_back(Previous);

  for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
    VPRecipeBase *Current = HoistCandidates[I];
    assert(Current->getNumDefinedValues() == 1 &&
           "only recipes with a single defined value expected");
    if (!CanHoist(Current))
      return false;

    for (VPValue *Op : Current->operands()) {
      // If we reach FOR, it means the original Previous depends on some other
      // recurrence that in turn depends on FOR. If that is the case, we would
      // also need to hoist recipes involving the other FOR, which may break
      // dependencies.
      if (Op == FOR)
        return false;

      if (auto *R = NeedsHoisting(Op))
        HoistCandidates.push_back(R);
    }
  }

  // Order recipes to hoist by dominance so earlier instructions are processed
  // first.
  sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
    return VPDT.properlyDominates(A, B);
  });

  for (VPRecipeBase *HoistCandidate : HoistCandidates) {
    HoistCandidate->moveBefore(*HoistPoint->getParent(),
                               HoistPoint->getIterator());
  }

  return true;
}

bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
                                                  VPBuilder &LoopBuilder) {
  VPDominatorTree VPDT;
  VPDT.recalculate(Plan);

  SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      RecurrencePhis.push_back(FOR);

  for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
    SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
    // Fixed-order recurrences do not contain cycles, so this loop is
    // guaranteed to terminate.
    while (auto *PrevPhi =
               dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
      assert(PrevPhi->getParent() == FOR->getParent());
      assert(SeenPhis.insert(PrevPhi).second);
      Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
    }

    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
        !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
      return false;

    // Introduce a recipe to combine the incoming and previous values of a
    // fixed-order recurrence.
    VPBasicBlock *InsertBlock = Previous->getParent();
    if (isa<VPHeaderPHIRecipe>(Previous))
      LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
    else
      LoopBuilder.setInsertPoint(InsertBlock,
                                 std::next(Previous->getIterator()));

    auto *RecurSplice = cast<VPInstruction>(
        LoopBuilder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
                                 {FOR, FOR->getBackedgeValue()}));

    FOR->replaceAllUsesWith(RecurSplice);
    // Set the first operand of RecurSplice to FOR again, after replacing
    // all users.
    RecurSplice->setOperand(0, FOR);
  }
  return true;
}

static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
  SetVector<VPUser *> Users(V->user_begin(), V->user_end());
  for (unsigned I = 0; I != Users.size(); ++I) {
    VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
    if (isa<VPHeaderPHIRecipe>(Cur))
      continue;
    for (VPValue *V : Cur->definedValues())
      Users.insert(V->user_begin(), V->user_end());
  }
  return Users.takeVector();
}

void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!PhiR)
      continue;
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
    RecurKind RK = RdxDesc.getRecurrenceKind();
    if (RK != RecurKind::Add && RK != RecurKind::Mul)
      continue;

    for (VPUser *U : collectUsersRecursively(PhiR))
      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
        RecWithFlags->dropPoisonGeneratingFlags();
      }
  }
}

/// Try to simplify recipe \p R.
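/// Currently handled simplifications include, for example:
///   * redundant or unnormalized blend recipes,
///   * trunc (zext/sext A) -> A or a single narrower cast,
///   * (X && Y) || (X && !Y) -> X,
///   * mul A, 1 -> A.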
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
  using namespace llvm::VPlanPatternMatch;

  if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
    // Try to remove redundant blend recipes.
    SmallPtrSet<VPValue *, 4> UniqueValues;
    if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
      UniqueValues.insert(Blend->getIncomingValue(0));
    for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
      if (!match(Blend->getMask(I), m_False()))
        UniqueValues.insert(Blend->getIncomingValue(I));

    if (UniqueValues.size() == 1) {
      Blend->replaceAllUsesWith(*UniqueValues.begin());
      Blend->eraseFromParent();
      return;
    }

    if (Blend->isNormalized())
      return;

    // Normalize the blend so its first incoming value is used as the initial
    // value with the others blended into it.

    unsigned StartIndex = 0;
    for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
      // If a value's mask is used only by the blend then it can be deadcoded.
      // TODO: Find the most expensive mask that can be deadcoded, or a mask
      // that's used by multiple blends where it can be removed from them all.
      VPValue *Mask = Blend->getMask(I);
      if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
        StartIndex = I;
        break;
      }
    }

    SmallVector<VPValue *, 4> OperandsWithMask;
    OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));

    for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
      if (I == StartIndex)
        continue;
      OperandsWithMask.push_back(Blend->getIncomingValue(I));
      OperandsWithMask.push_back(Blend->getMask(I));
    }

    auto *NewBlend = new VPBlendRecipe(
        cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
    NewBlend->insertBefore(&R);

    VPValue *DeadMask = Blend->getMask(StartIndex);
    Blend->replaceAllUsesWith(NewBlend);
    Blend->eraseFromParent();
    recursivelyDeleteDeadRecipes(DeadMask);
    return;
  }

  VPValue *A;
  if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
    VPValue *Trunc = R.getVPSingleValue();
    Type *TruncTy = TypeInfo.inferScalarType(Trunc);
    Type *ATy = TypeInfo.inferScalarType(A);
    if (TruncTy == ATy) {
      Trunc->replaceAllUsesWith(A);
    } else {
      // Don't replace a scalarizing recipe with a widened cast.
      if (isa<VPReplicateRecipe>(&R))
        return;
      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {

        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                 ? Instruction::SExt
                                 : Instruction::ZExt;
        auto *VPC =
            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
          // UnderlyingExt has distinct return type, used to retain legacy cost.
          VPC->setUnderlyingValue(UnderlyingExt);
        }
        VPC->insertBefore(&R);
        Trunc->replaceAllUsesWith(VPC);
      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
        VPC->insertBefore(&R);
        Trunc->replaceAllUsesWith(VPC);
      }
    }
#ifndef NDEBUG
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
    VPTypeAnalysis TypeInfo2(
        R.getParent()->getPlan()->getCanonicalIV()->getScalarType());
    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
    for (VPUser *U : A->users()) {
      auto *R = cast<VPRecipeBase>(U);
      for (VPValue *VPV : R->definedValues())
        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
    }
#endif
  }

  // Simplify (X && Y) || (X && !Y) -> X.
  // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
  // && (Y || Z) and (X || !X) into true. This requires queuing newly created
  // recipes to be visited during simplification.
  VPValue *X, *Y, *X1, *Y1;
  if (match(&R,
            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                         m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
      X == X1 && Y == Y1) {
    R.getVPSingleValue()->replaceAllUsesWith(X);
    R.eraseFromParent();
    return;
  }

  if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
    return R.getVPSingleValue()->replaceAllUsesWith(A);
}

/// Move loop-invariant recipes out of the vector loop region in \p Plan.
static void licm(VPlan &Plan) {
  VPBasicBlock *Preheader = Plan.getVectorPreheader();

  // Return true if we do not know how to (mechanically) hoist a given recipe
  // out of a loop region. Does not address legality concerns such as aliasing
  // or speculation safety.
  auto CannotHoistRecipe = [](VPRecipeBase &R) {
    // Allocas cannot be hoisted.
    auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
    return RepR && RepR->getOpcode() == Instruction::Alloca;
  };

  // Hoist any loop invariant recipes from the vector loop region to the
  // preheader. Perform a shallow traversal of the vector loop region, to
  // exclude recipes in replicate regions.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(LoopRegion->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (CannotHoistRecipe(R))
        continue;
      // TODO: Relax checks in the future, e.g. we could also hoist reads, if
      // their memory location is not modified in the vector loop.
      if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
          any_of(R.operands(), [](VPValue *Op) {
            return !Op->isDefinedOutsideLoopRegions();
          }))
        continue;
      R.moveBefore(*Preheader, Preheader->end());
    }
  }
}

/// Try to simplify the recipes in \p Plan.
static void simplifyRecipes(VPlan &Plan) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
  VPTypeAnalysis TypeInfo(CanonicalIVType);
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      simplifyRecipe(R, TypeInfo);
    }
  }
}

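/// Narrow recipes listed in \p MinBWs to their minimal bit width. For
/// example, an i32 add whose result is known to need only 8 bits becomes
/// roughly:
///
///   %a.t  = trunc %a to i8
///   %b.t  = trunc %b to i8
///   %add  = add i8 %a.t, %b.t
///   %res  = zext i8 %add to i32   ; users continue to see the original type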
void VPlanTransforms::truncateToMinimalBitwidths(
    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
#ifndef NDEBUG
  // Count the processed recipes and cross check the count later with MinBWs
  // size, to make sure all entries in MinBWs have been handled.
  unsigned NumProcessedRecipes = 0;
#endif
  // Keep track of created truncates, so they can be re-used. Note that we
  // cannot use RAUW after creating a new truncate, as this could make
  // other uses have different types for their operands, making them invalidly
  // typed.
  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
  Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
  VPTypeAnalysis TypeInfo(CanonicalIVType);
  VPBasicBlock *PH = Plan.getVectorPreheader();
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
               VPWidenSelectRecipe, VPWidenLoadRecipe>(&R))
        continue;

      VPValue *ResultVPV = R.getVPSingleValue();
      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
      unsigned NewResSizeInBits = MinBWs.lookup(UI);
      if (!NewResSizeInBits)
        continue;

#ifndef NDEBUG
      NumProcessedRecipes++;
#endif
      // If the value wasn't vectorized, we must maintain the original scalar
      // type. Skip those here, after incrementing NumProcessedRecipes. Also
      // skip casts which do not need to be handled explicitly here, as
      // redundant casts will be removed during recipe simplification.
      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R)) {
#ifndef NDEBUG
        // If any of the operands is a live-in and not used by VPWidenRecipe or
        // VPWidenSelectRecipe, but in MinBWs, make sure it is counted as
        // processed as well. When MinBWs is currently constructed, there is no
        // information about whether recipes are widened or replicated and in
        // case they are replicated the operands are not truncated. Counting
        // them here ensures we do not miss any recipes in MinBWs.
        // TODO: Remove once the analysis is done on VPlan.
        for (VPValue *Op : R.operands()) {
          if (!Op->isLiveIn())
            continue;
          auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue());
          if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) &&
              all_of(Op->users(), [](VPUser *U) {
                return !isa<VPWidenRecipe, VPWidenSelectRecipe>(U);
              })) {
            // Add an entry to ProcessedTruncs to avoid counting the same
            // operand multiple times.
            ProcessedTruncs[Op] = nullptr;
            NumProcessedRecipes += 1;
          }
        }
#endif
        continue;
      }

      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
      assert(OldResTy->isIntegerTy() && "only integer types supported");
      (void)OldResSizeInBits;

      LLVMContext &Ctx = CanonicalIVType->getContext();
      auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);

      // Any wrapping introduced by shrinking this operation shouldn't be
      // considered undefined behavior. So, we can't unconditionally copy
      // arithmetic wrapping flags to VPW.
      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
        VPW->dropPoisonGeneratingFlags();

      using namespace llvm::VPlanPatternMatch;
      if (OldResSizeInBits != NewResSizeInBits &&
          !match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue()))) {
        // Extend result to original width.
        auto *Ext =
            new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
        Ext->insertAfter(&R);
        ResultVPV->replaceAllUsesWith(Ext);
        Ext->setOperand(0, ResultVPV);
        assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
      } else {
        assert(
            match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) &&
            "Only ICmps should not need extending the result.");
      }

      assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
      if (isa<VPWidenLoadRecipe>(&R))
        continue;

      // Shrink operands by introducing truncates as needed.
      unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
        auto *Op = R.getOperand(Idx);
        unsigned OpSizeInBits =
            TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
        if (OpSizeInBits == NewResSizeInBits)
          continue;
        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
        auto [ProcessedIter, IterIsEmpty] =
            ProcessedTruncs.insert({Op, nullptr});
        VPWidenCastRecipe *NewOp =
            IterIsEmpty
                ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
                : ProcessedIter->second;
        R.setOperand(Idx, NewOp);
        if (!IterIsEmpty)
          continue;
        ProcessedIter->second = NewOp;
        if (!Op->isLiveIn()) {
          NewOp->insertBefore(&R);
        } else {
          PH->appendRecipe(NewOp);
#ifndef NDEBUG
          auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue());
          bool IsContained = MinBWs.contains(OpInst);
          NumProcessedRecipes += IsContained;
#endif
        }
      }
    }
  }

#ifndef NDEBUG
  assert(MinBWs.size() == NumProcessedRecipes &&
         "some entries in MinBWs haven't been processed");
#endif
}

void VPlanTransforms::optimize(VPlan &Plan) {
  removeRedundantCanonicalIVs(Plan);
  removeRedundantInductionCasts(Plan);

  simplifyRecipes(Plan);
  legalizeAndOptimizeInductions(Plan);
  removeDeadRecipes(Plan);

  createAndOptimizeReplicateRegions(Plan);

  removeRedundantExpandSCEVRecipes(Plan);
  mergeBlocksIntoPredecessors(Plan);

  licm(Plan);
}

// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
// the loop terminator with a branch-on-cond recipe with the negated
// active-lane-mask as operand. Note that this turns the loop into an
// uncountable one. Only the existing terminator is replaced, all other
// existing recipes/users remain unchanged, except for poison-generating flags
// being dropped from the canonical IV increment. Return the created
// VPActiveLaneMaskPHIRecipe.
//
// The function uses the following definitions:
//
//  %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
//    calculate-trip-count-minus-VF (original TC) : original TC
//  %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
//     CanonicalIVPhi : CanonicalIVIncrement
//  %StartV is the canonical induction start value.
//
// The function adds the following recipes:
//
// vector.ph:
//   %TripCount = calculate-trip-count-minus-VF (original TC)
//       [if DataWithControlFlowWithoutRuntimeCheck]
//   %EntryInc = canonical-iv-increment-for-part %StartV
//   %EntryALM = active-lane-mask %EntryInc, %TripCount
//   ...
//
// vector.body:
//   ...
//   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
//   ...
//   %InLoopInc = canonical-iv-increment-for-part %IncrementValue
//   %ALM = active-lane-mask %InLoopInc, TripCount
//   %Negated = Not %ALM
//   branch-on-cond %Negated
//
static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
  auto *CanonicalIVPHI = Plan.getCanonicalIV();
  VPValue *StartV = CanonicalIVPHI->getStartValue();

  auto *CanonicalIVIncrement =
      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
  // TODO: Check if dropping the flags is needed if
  // !DataAndControlFlowWithoutRuntimeCheck.
  CanonicalIVIncrement->dropPoisonGeneratingFlags();
  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
  // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
  // we have to take unrolling into account. Each part needs to start at
  //   Part * VF
  auto *VecPreheader = Plan.getVectorPreheader();
  VPBuilder Builder(VecPreheader);

  // Create the ActiveLaneMask instruction using the correct start values.
  VPValue *TC = Plan.getTripCount();

  VPValue *TripCount, *IncrementValue;
  if (!DataAndControlFlowWithoutRuntimeCheck) {
    // When the loop is guarded by a runtime overflow check for the loop
    // induction variable increment by VF, we can increment the value before
    // the get.active.lane mask and use the unmodified tripcount.
    IncrementValue = CanonicalIVIncrement;
    TripCount = TC;
  } else {
    // When avoiding a runtime check, the active.lane.mask inside the loop
    // uses a modified trip count and the induction variable increment is
    // done after the active.lane.mask intrinsic is called.
    IncrementValue = CanonicalIVPHI;
    TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
                                     {TC}, DL);
  }
  auto *EntryIncrement = Builder.createOverflowingOp(
      VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
      "index.part.next");

  // Create the active lane mask instruction in the VPlan preheader.
  auto *EntryALM =
      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
                           DL, "active.lane.mask.entry");

  // Now create the ActiveLaneMaskPhi recipe in the main loop using the
  // preheader ActiveLaneMask instruction.
  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
  LaneMaskPhi->insertAfter(CanonicalIVPHI);

  // Create the active lane mask for the next iteration of the loop before the
  // original terminator.
  VPRecipeBase *OriginalTerminator = EB->getTerminator();
  Builder.setInsertPoint(OriginalTerminator);
  auto *InLoopIncrement =
      Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
                                  {IncrementValue}, {false, false}, DL);
  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
                                   {InLoopIncrement, TripCount}, DL,
                                   "active.lane.mask.next");
  LaneMaskPhi->addOperand(ALM);

  // Replace the original terminator with BranchOnCond. We have to invert the
  // mask here because a true condition means jumping to the exit block.
  auto *NotMask = Builder.createNot(ALM, DL);
  Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
  OriginalTerminator->eraseFromParent();
  return LaneMaskPhi;
}

/// Collect all VPValues representing a header mask through the (ICMP_ULE,
/// WideCanonicalIV, backedge-taken-count) pattern.
/// TODO: Introduce explicit recipe for header-mask instead of searching
/// for the header-mask pattern manually.
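///
/// The pattern currently recognized is roughly:
///   %wide.iv = WIDEN-CANONICAL-INDUCTION %canonical.iv
///   %mask    = icmp ule %wide.iv, %backedge.taken.count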
static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
  SmallVector<VPValue *> WideCanonicalIVs;
  auto *FoundWidenCanonicalIVUser =
      find_if(Plan.getCanonicalIV()->users(),
              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
  assert(count_if(Plan.getCanonicalIV()->users(),
                  [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=
             1 &&
         "Must have at most one VPWideCanonicalIVRecipe");
  if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
    auto *WideCanonicalIV =
        cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
    WideCanonicalIVs.push_back(WideCanonicalIV);
  }

  // Also include VPWidenIntOrFpInductionRecipes that represent a widened
  // version of the canonical induction.
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (WidenOriginalIV && WidenOriginalIV->isCanonical())
      WideCanonicalIVs.push_back(WidenOriginalIV);
  }

  // Walk users of wide canonical IVs and collect all compares of the form
  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
  SmallVector<VPValue *> HeaderMasks;
  for (auto *Wide : WideCanonicalIVs) {
    for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
      auto *HeaderMask = dyn_cast<VPInstruction>(U);
      if (!HeaderMask || !vputils::isHeaderMask(HeaderMask, Plan))
        continue;

      assert(HeaderMask->getOperand(0) == Wide &&
             "WidenCanonicalIV must be the first operand of the compare");
      HeaderMasks.push_back(HeaderMask);
    }
  }
  return HeaderMasks;
}

void VPlanTransforms::addActiveLaneMask(
    VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
    bool DataAndControlFlowWithoutRuntimeCheck) {
  assert((!DataAndControlFlowWithoutRuntimeCheck ||
          UseActiveLaneMaskForControlFlow) &&
         "DataAndControlFlowWithoutRuntimeCheck implies "
         "UseActiveLaneMaskForControlFlow");

  auto *FoundWidenCanonicalIVUser =
      find_if(Plan.getCanonicalIV()->users(),
              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
  assert(FoundWidenCanonicalIVUser &&
         "Must have widened canonical IV when tail folding!");
  auto *WideCanonicalIV =
      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
  VPSingleDefRecipe *LaneMask;
  if (UseActiveLaneMaskForControlFlow) {
    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
        Plan, DataAndControlFlowWithoutRuntimeCheck);
  } else {
    VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
    LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
                              {WideCanonicalIV, Plan.getTripCount()}, nullptr,
                              "active.lane.mask");
  }

  // Walk users of WideCanonicalIV and replace all compares of the form
  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
  // active-lane-mask.
  for (VPValue *HeaderMask : collectAllHeaderMasks(Plan))
    HeaderMask->replaceAllUsesWith(LaneMask);
}

/// Replace recipes with their EVL variants.
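/// For example, a widened load guarded by the header mask becomes roughly:
///   WIDEN ir<%l> = load %addr, %header.mask
/// =>
///   WIDEN ir<%l> = vp.load %addr, %other.mask, vp<%evl>
/// where the header-mask component is dropped from the mask (nullptr if it
/// was the only mask).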
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
  using namespace llvm::VPlanPatternMatch;
  Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
  VPTypeAnalysis TypeInfo(CanonicalIVType);
  LLVMContext &Ctx = CanonicalIVType->getContext();
  SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
  for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
    for (VPUser *U : collectUsersRecursively(HeaderMask)) {
      auto *CurRecipe = cast<VPRecipeBase>(U);
      auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
        assert(OrigMask && "Unmasked recipe when folding tail");
        return HeaderMask == OrigMask ? nullptr : OrigMask;
      };

      VPRecipeBase *NewRecipe =
          TypeSwitch<VPRecipeBase *, VPRecipeBase *>(CurRecipe)
              .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
                VPValue *NewMask = GetNewMask(L->getMask());
                return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
              })
              .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
                VPValue *NewMask = GetNewMask(S->getMask());
                return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
              })
              .Case<VPWidenRecipe>([&](VPWidenRecipe *W) -> VPRecipeBase * {
                unsigned Opcode = W->getOpcode();
                if (!Instruction::isBinaryOp(Opcode) &&
                    !Instruction::isUnaryOp(Opcode))
                  return nullptr;
                return new VPWidenEVLRecipe(*W, EVL);
              })
              .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
                VPValue *NewMask = GetNewMask(Red->getCondOp());
                return new VPReductionEVLRecipe(*Red, EVL, NewMask);
              })
              .Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
                SmallVector<VPValue *> Ops(Sel->operands());
                Ops.push_back(&EVL);
                return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
                                                  TypeInfo.inferScalarType(Sel),
                                                  Sel->getDebugLoc());
              })
              .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
                VPValue *LHS, *RHS;
                // Transform select with a header mask condition
                //   select(header_mask, LHS, RHS)
                // into vector predication merge.
                //   vp.merge(all-true, LHS, RHS, EVL)
                if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
                                         m_VPValue(RHS))))
                  return nullptr;
                // Use all true as the condition because this transformation is
                // limited to selects whose condition is a header mask.
                VPValue *AllTrue =
                    Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
                return new VPWidenIntrinsicRecipe(
                    Intrinsic::vp_merge, {AllTrue, LHS, RHS, &EVL},
                    TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
              })
              .Default([&](VPRecipeBase *R) { return nullptr; });

      if (!NewRecipe)
        continue;

      [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues();
      assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
             "New recipe must define the same number of values as the "
             "original.");
      assert(
          NumDefVal <= 1 &&
          "Only supports recipes with a single definition or without users.");
      NewRecipe->insertBefore(CurRecipe);
      if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(NewRecipe)) {
        VPValue *CurVPV = CurRecipe->getVPSingleValue();
        CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
      }
      CurRecipe->eraseFromParent();
    }
    recursivelyDeleteDeadRecipes(HeaderMask);
  }
}

/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
/// replaces all uses except the canonical IV increment of
/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
/// is used only for loop iterations counting after this transformation.
///
/// The function uses the following definitions:
///  %StartV is the canonical induction start value.
///
/// The function adds the following recipes:
///
/// vector.ph:
/// ...
///
/// vector.body:
/// ...
/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
///                                               [ %NextEVLIV, %vector.body ]
/// %AVL = sub original TC, %EVLPhi
/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
/// ...
/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
/// ...
///
/// If MaxSafeElements is provided, the function adds the following recipes:
/// vector.ph:
/// ...
///
/// vector.body:
/// ...
/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
///                                               [ %NextEVLIV, %vector.body ]
/// %AVL = sub original TC, %EVLPhi
/// %cmp = cmp ult %AVL, MaxSafeElements
/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
/// ...
/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
/// ...
///
bool VPlanTransforms::tryAddExplicitVectorLength(
    VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
  VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  // The transform updates all users of inductions to work based on EVL,
  // instead of the VF directly. At the moment, widened inductions cannot be
  // updated, so bail out if the plan contains any.
  bool ContainsWidenInductions = any_of(Header->phis(), [](VPRecipeBase &Phi) {
    return isa<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(
        &Phi);
  });
  if (ContainsWidenInductions)
    return false;

  auto *CanonicalIVPHI = Plan.getCanonicalIV();
  VPValue *StartV = CanonicalIVPHI->getStartValue();

  // Create the ExplicitVectorLengthPhi recipe in the main loop.
  auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
  EVLPhi->insertAfter(CanonicalIVPHI);
  VPBuilder Builder(Header, Header->getFirstNonPhi());
  // Compute original TC - IV as the AVL (application vector length).
  VPValue *AVL = Builder.createNaryOp(
      Instruction::Sub, {Plan.getTripCount(), EVLPhi}, DebugLoc(), "avl");
  if (MaxSafeElements) {
    // Clamp the AVL to MaxSafeElements (the maximum safe dependence distance)
    // so a single iteration never processes more elements than is safe.
    VPValue *AVLSafe = Plan.getOrAddLiveIn(
        ConstantInt::get(CanonicalIVPHI->getScalarType(), *MaxSafeElements));
    VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
    AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
  }
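  // Compute the explicit vector length for this iteration from the (possibly
  // clamped) AVL.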
  auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
                                     DebugLoc());

  auto *CanonicalIVIncrement =
      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
  VPSingleDefRecipe *OpVPEVL = VPEVL;
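  // ExplicitVectorLength yields an i32; truncate or zero-extend it to the
  // canonical IV type before folding it into the IV increment.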
  if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits();
      IVSize != 32) {
    OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc
                                                 : Instruction::ZExt,
                                     OpVPEVL, CanonicalIVPHI->getScalarType());
    OpVPEVL->insertBefore(CanonicalIVIncrement);
  }
  auto *NextEVLIV =
      new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi},
                        {CanonicalIVIncrement->hasNoUnsignedWrap(),
                         CanonicalIVIncrement->hasNoSignedWrap()},
                        CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
  NextEVLIV->insertBefore(CanonicalIVIncrement);
  EVLPhi->addOperand(NextEVLIV);
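
  // With the EVL-based IV in place, rewrite the recipes guarded by the header
  // mask into their EVL-predicated variants.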
  transformRecipestoEVLRecipes(Plan, *VPEVL);

  // Replace all uses of VPCanonicalIVPHIRecipe by
  // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
  CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
  CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
  // TODO: support unroll factor > 1.
  Plan.setUF(1);
  return true;
}

void VPlanTransforms::dropPoisonGeneratingRecipes(
    VPlan &Plan, function_ref<bool(BasicBlock *)> BlockNeedsPredication) {
  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.pop_back_val();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryRecipe>(CurRec) || isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) || isa<VPHeaderPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. If the underlying instruction has poison-generating flags,
      // drop them directly.
      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
        VPValue *A, *B;
        using namespace llvm::VPlanPatternMatch;
        // Dropping disjoint from an OR may yield incorrect results, as some
        // analysis may have converted it to an Add implicitly (e.g. SCEV used
        // for dependence analysis). Instead, replace it with an equivalent Add.
        // This is possible as all users of the disjoint OR only access lanes
        // where the operands are disjoint or poison otherwise.
        if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
            RecWithFlags->isDisjoint()) {
          VPBuilder Builder(RecWithFlags);
          VPInstruction *New = Builder.createOverflowingOp(
              Instruction::Add, {A, B}, {false, false},
              RecWithFlags->getDebugLoc());
          New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
          RecWithFlags->replaceAllUsesWith(New);
          RecWithFlags->eraseFromParent();
          CurRec = New;
        } else
          RecWithFlags->dropPoisonGeneratingFlags();
      } else {
        Instruction *Instr = dyn_cast_or_null<Instruction>(
            CurRec->getVPSingleValue()->getUnderlyingValue());
        (void)Instr;
        assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
               "found instruction with poison generating flags not covered by "
               "VPRecipeWithIRFlags");
      }

      // Add new definitions to the worklist.
      for (VPValue *Operand : CurRec->operands())
        if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
          Worklist.push_back(OpDef);
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe or
  // VPInterleaveRecipe.
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
        if (AddrDef && WidenRec->isConsecutive() &&
            BlockNeedsPredication(UnderlyingInstr.getParent()))
          CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |= BlockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
        }
      }
    }
  }
}

void VPlanTransforms::createInterleaveGroups(
    VPlan &Plan,
    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
        &InterleaveGroups,
    VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed) {
  if (InterleaveGroups.empty())
    return;

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  VPDominatorTree VPDT;
  VPDT.recalculate(Plan);
  for (const auto *IG : InterleaveGroups) {
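    // Collect the values stored by the group's members; they become the
    // stored-value operands of the single VPInterleaveRecipe.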
    SmallVector<VPValue *, 4> StoredValues;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
        auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
        StoredValues.push_back(StoreR->getStoredValue());
      }

    bool NeedsMaskForGaps =
        IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;

    Instruction *IRInsertPos = IG->getInsertPos();
    auto *InsertPos =
        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));

    // Get or create the start address for the interleave group.
    auto *Start =
        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
    VPValue *Addr = Start->getAddr();
    VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
    if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
      // TODO: Hoist Addr's defining recipe (and any operands as needed) to
      // InsertPos or sink loads above zero members to join it.
      bool InBounds = false;
      if (auto *Gep = dyn_cast<GetElementPtrInst>(
              getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
        InBounds = Gep->isInBounds();

      // We cannot re-use the address of member zero because it does not
      // dominate the insert position. Instead, use the address of the insert
      // position and create a PtrAdd adjusting it to the address of member
      // zero.
      assert(IG->getIndex(IRInsertPos) != 0 &&
             "index of insert position shouldn't be zero");
      auto &DL = IRInsertPos->getDataLayout();
      APInt Offset(32,
                   DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
                       IG->getIndex(IRInsertPos),
                   /*IsSigned=*/true);
      VPValue *OffsetVPV = Plan.getOrAddLiveIn(
          ConstantInt::get(IRInsertPos->getParent()->getContext(), -Offset));
      VPBuilder B(InsertPos);
      Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
                      : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
    }
    auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
                                        InsertPos->getMask(), NeedsMaskForGaps);
    VPIG->insertBefore(InsertPos);
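
    // Wire every member's original recipe to the corresponding value defined
    // by the interleave recipe, then erase the now-redundant member recipes.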
    unsigned J = 0;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(i)) {
        VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
        if (!Member->getType()->isVoidTy()) {
          VPValue *OriginalV = MemberR->getVPSingleValue();
          OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
          J++;
        }
        MemberR->eraseFromParent();
      }
  }
}
->eraseFromParent();