//===- LoopVectorizationLegality.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides loop vectorization legality analysis. Original code
// resided in LoopVectorize.cpp for a long time.
//
// At this point, it is implemented as a utility class, not as an analysis
// pass. It should be easy to create an analysis pass around it if there
// is a need (but D45420 needs to happen first).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

using namespace llvm;
using namespace PatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
static cl::opt<bool>
    EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                       cl::desc("Enable if-conversion during vectorization."));

static cl::opt<bool>
    AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false),
                           cl::Hidden,
                           cl::desc("Enable recognition of non-constant strided "
                                    "pointer induction variables."));

static cl::opt<bool>
    HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
                         cl::desc("Allow enabling loop hints to reorder "
                                  "FP operations during vectorization."));

// TODO: Move size-based thresholds out of legality checking, make cost based
// decisions instead of hard thresholds.
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed."));

static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed with a "
             "vectorize(enable) pragma"));

static cl::opt<LoopVectorizeHints::ScalableForceKind>
    ForceScalableVectorization(
        "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified),
        cl::Hidden,
        cl::desc("Control whether the compiler can use scalable vectors to "
                 "vectorize a loop"),
        cl::values(
            clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
                       "Scalable vectorization is disabled."),
            clEnumValN(LoopVectorizeHints::SK_PreferScalable, "preferred",
                       "Scalable vectorization is available and favored when "
                       "the cost is inconclusive."),
            clEnumValN(LoopVectorizeHints::SK_PreferScalable, "on",
                       "Scalable vectorization is available and favored when "
                       "the cost is inconclusive.")));

static cl::opt<bool> EnableHistogramVectorization(
    "enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
    cl::desc("Enables autovectorization of some loops containing histograms"));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
bool LoopVectorizeHints::Hint::validate(unsigned Val) {
  switch (Kind) {
  case HK_WIDTH:
    return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
  case HK_INTERLEAVE:
    return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
  case HK_FORCE:
    return (Val <= 1);
  case HK_ISVECTORIZED:
  case HK_PREDICATE:
  case HK_SCALABLE:
    return (Val == 0 || Val == 1);
  }
  return false;
}
LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
                                       bool InterleaveOnlyWhenForced,
                                       OptimizationRemarkEmitter &ORE,
                                       const TargetTransformInfo *TTI)
    : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
      Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
      Force("vectorize.enable", FK_Undefined, HK_FORCE),
      IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
      Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
      TheLoop(L), ORE(ORE) {
  // Populate values with existing loop metadata.
  getHintsFromMetadata();

  // force-vector-interleave overrides DisableInterleaving.
  if (VectorizerParams::isInterleaveForced())
    Interleave.Value = VectorizerParams::VectorizationInterleave;

  // If the metadata doesn't explicitly specify whether to enable scalable
  // vectorization, then decide based on the following criteria (increasing
  // level of priority):
  //  - Target default
  //  - Metadata width
  //  - Force option (always overrides)
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
    if (TTI)
      Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
                                                          : SK_FixedWidthOnly;

    if (Width.Value)
      // If the width is set, but the metadata says nothing about the scalable
      // property, then assume it concerns only a fixed-width UserVF.
      // If width is not set, the flag takes precedence.
      Scalable.Value = SK_FixedWidthOnly;
  }

  // If the flag is set to force any use of scalable vectors, override the loop
  // hints.
  if (ForceScalableVectorization.getValue() !=
      LoopVectorizeHints::SK_Unspecified)
    Scalable.Value = ForceScalableVectorization.getValue();

  // Scalable vectorization is disabled if no preference is specified.
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
    Scalable.Value = SK_FixedWidthOnly;

  if (IsVectorized.Value != 1)
    // If the vectorization width and interleaving count are both 1 then
    // consider the loop to have been already vectorized because there's
    // nothing more that we can do.
    IsVectorized.Value =
        getWidth() == ElementCount::getFixed(1) && getInterleave() == 1;
  LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == 1) dbgs()
             << "LV: Interleaving disabled by the pass manager\n");
}
void LoopVectorizeHints::setAlreadyVectorized() {
  LLVMContext &Context = TheLoop->getHeader()->getContext();

  MDNode *IsVectorizedMD = MDNode::get(
      Context,
      {MDString::get(Context, "llvm.loop.isvectorized"),
       ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
  MDNode *LoopID = TheLoop->getLoopID();
  MDNode *NewLoopID =
      makePostTransformationMetadata(Context, LoopID,
                                     {Twine(Prefix(), "vectorize.").str(),
                                      Twine(Prefix(), "interleave.").str()},
                                     {IsVectorizedMD});
  TheLoop->setLoopID(NewLoopID);

  // Update internal cache.
  IsVectorized.Value = 1;
}
bool LoopVectorizeHints::allowVectorization(
    Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
  if (getForce() == LoopVectorizeHints::FK_Disabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (getIsVectorized() == 1) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
    // FIXME: Add interleave.disable metadata. This will allow
    // vectorize.disable to be used without disabling the pass and errors
    // to differentiate between disabled vectorization and a width of 1.
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
                                        "AllDisabled", L->getStartLoc(),
                                        L->getHeader())
             << "loop not vectorized: vectorization and interleaving are "
                "explicitly disabled, or the loop has already been "
                "vectorized";
    });
    return false;
  }

  return true;
}
void LoopVectorizeHints::emitRemarkWithHints() const {
  using namespace ore;

  ORE.emit([&]() {
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
                                      TheLoop->getStartLoc(),
                                      TheLoop->getHeader())
             << "loop not vectorized: vectorization is explicitly disabled";

    OptimizationRemarkMissed R(LV_NAME, "MissedDetails", TheLoop->getStartLoc(),
                               TheLoop->getHeader());
    R << "loop not vectorized";
    if (Force.Value == LoopVectorizeHints::FK_Enabled) {
      R << " (Force=" << NV("Force", true);
      if (Width.Value != 0)
        R << ", Vector Width=" << NV("VectorWidth", getWidth());
      if (getInterleave() != 0)
        R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
      R << ")";
    }
    return R;
  });
}
const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
  if (getWidth() == ElementCount::getFixed(1))
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Disabled)
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
    return LV_NAME;
  return OptimizationRemarkAnalysis::AlwaysPrint;
}
bool LoopVectorizeHints::allowReordering() const {
  // Allow the vectorizer to change the order of operations if enabling
  // loop hints are provided
  ElementCount EC = getWidth();
  return HintsAllowReordering &&
         (getForce() == LoopVectorizeHints::FK_Enabled ||
          EC.getKnownMinValue() > 1);
}
void LoopVectorizeHints::getHintsFromMetadata() {
  MDNode *LoopID = TheLoop->getLoopID();
  if (!LoopID)
    return;

  // First operand should refer to the loop id itself.
  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
    const MDString *S = nullptr;
    SmallVector<Metadata *, 4> Args;

    // The expected hint is either a MDString or a MDNode with the first
    // operand a MDString.
    if (const MDNode *MD = dyn_cast<MDNode>(MDO)) {
      if (!MD || MD->getNumOperands() == 0)
        continue;
      S = dyn_cast<MDString>(MD->getOperand(0));
      for (unsigned Idx = 1; Idx < MD->getNumOperands(); ++Idx)
        Args.push_back(MD->getOperand(Idx));
    } else {
      S = dyn_cast<MDString>(MDO);
      assert(Args.size() == 0 && "too many arguments for MDString");
    }

    if (!S)
      continue;

    // Check if the hint starts with the loop metadata prefix.
    StringRef Name = S->getString();
    if (Args.size() == 1)
      setHint(Name, Args[0]);
  }
}
void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
  if (!Name.starts_with(Prefix()))
    return;
  Name = Name.substr(Prefix().size(), StringRef::npos);

  const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
  if (!C)
    return;
  unsigned Val = C->getZExtValue();

  Hint *Hints[] = {&Width,        &Interleave, &Force,
                   &IsVectorized, &Predicate,  &Scalable};
  for (auto *H : Hints) {
    if (Name == H->Name) {
      if (H->validate(Val))
        H->Value = Val;
      else
        LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
      break;
    }
  }
}
// Return true if the inner loop \p Lp is uniform with regard to the outer loop
// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
// executing the inner loop will execute the same iterations). This check is
// very constrained for now but it will be relaxed in the future. \p Lp is
// considered uniform if it meets all the following conditions:
//   1) it has a canonical IV (starting from 0 and with stride 1),
//   2) its latch terminator is a conditional branch and,
//   3) its latch condition is a compare instruction whose operands are the
//      canonical IV and an OuterLp invariant.
// This check doesn't take into account the uniformity of other conditions not
// related to the loop latch because they don't affect the loop uniformity.
//
// NOTE: We decided to keep all these checks and their associated documentation
// together so that we can easily have a picture of the current supported loop
// nests. However, some of the current checks don't depend on \p OuterLp and
// would be redundantly executed for each \p Lp if we invoked this function for
// different candidate outer loops. This is not the case for now because we
// don't currently have the infrastructure to evaluate multiple candidate outer
// loops and \p OuterLp will be a fixed parameter while we only support explicit
// outer loop vectorization. It's also very likely that these checks go away
// before introducing the aforementioned infrastructure. However, if this is not
// the case, we should move the \p OuterLp independent checks to a separate
// function that is only executed once for each \p Lp.
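// As an illustrative sketch (not taken from an actual test), in the nest
//   for (i = 0; i < N; ++i)       // OuterLp
//     for (j = 0; j < M; ++j)     // Lp, with M invariant in OuterLp
//       A[i][j] = 0;
// every vector lane of the outer loop executes the same j-iterations, so Lp
// is uniform with regard to OuterLp; if the inner bound depended on i, the
// lanes would run different trip counts and the check below would fail.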
static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
  assert(Lp->getLoopLatch() && "Expected loop with a single latch.");

  // If Lp is the outer loop, it's uniform by definition.
  if (Lp == OuterLp)
    return true;
  assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");

  // 1.
  PHINode *IV = Lp->getCanonicalInductionVariable();
  if (!IV) {
    LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
    return false;
  }

  // 2.
  BasicBlock *Latch = Lp->getLoopLatch();
  auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
  if (!LatchBr || LatchBr->isUnconditional()) {
    LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
    return false;
  }

  // 3.
  auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
  if (!LatchCmp) {
    LLVM_DEBUG(
        dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
    return false;
  }

  Value *CondOp0 = LatchCmp->getOperand(0);
  Value *CondOp1 = LatchCmp->getOperand(1);
  Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
  if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
      !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
    LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
    return false;
  }

  return true;
}
// Return true if \p Lp and all its nested loops are uniform with regard to \p
// OuterLp.
static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
  if (!isUniformLoop(Lp, OuterLp))
    return false;

  // Check if nested loops are uniform.
  for (Loop *SubLp : *Lp)
    if (!isUniformLoopNest(SubLp, OuterLp))
      return false;

  return true;
}
static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty);

  // It is possible that chars or shorts overflow when we ask for the loop's
  // trip count; work around this by changing the type size.
  if (Ty->getScalarSizeInBits() < 32)
    return Type::getInt32Ty(Ty->getContext());

  return Ty;
}

static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
  Ty0 = convertPointerToIntegerType(DL, Ty0);
  Ty1 = convertPointerToIntegerType(DL, Ty1);
  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
    return Ty0;
  return Ty1;
}
/// Check that the instruction has outside loop users and is not an
/// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
                               SmallPtrSetImpl<Value *> &AllowedExit) {
  // Reductions, Inductions and non-header phis are allowed to have exit users.
  // All other instructions must not have external users.
  if (!AllowedExit.count(Inst))
    // Check that all of the users of the loop are inside the BB.
    for (User *U : Inst->users()) {
      Instruction *UI = cast<Instruction>(U);
      // This user may be a reduction exit value.
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
        return true;
      }
    }
  return false;
}
/// Returns true if A and B have the same pointer operand or the same SCEVs
/// for their addresses.
static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
                               StoreInst *B) {
  // Compare the stores themselves.
  if (A == B)
    return true;

  // Otherwise compare pointers
  Value *APtr = A->getPointerOperand();
  Value *BPtr = B->getPointerOperand();
  if (APtr == BPtr)
    return true;

  // Otherwise compare address SCEVs
  return SE->getSCEV(APtr) == SE->getSCEV(BPtr);
}
int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
                                                Value *Ptr) const {
  // FIXME: Currently, the set of symbolic strides is sometimes queried before
  // it's collected. This happens from canVectorizeWithIfConvert, when the
  // pointer is checked to reference consecutive elements suitable for a
  // masked access.
  const auto &Strides =
      LAI ? LAI->getSymbolicStrides() : DenseMap<Value *, const SCEV *>();

  bool CanAddPredicate = !llvm::shouldOptimizeForSize(
      TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
                            CanAddPredicate, false).value_or(0);
  if (Stride == 1 || Stride == -1)
    return Stride;
  return 0;
}

bool LoopVectorizationLegality::isInvariant(Value *V) const {
  return LAI->isInvariant(V);
}
namespace {
/// A rewriter to build the SCEVs for each of the VF lanes in the expected
/// vectorized loop, which can then be compared to detect their uniformity. This
/// is done by replacing the AddRec SCEVs of the original scalar loop (TheLoop)
/// with new AddRecs where the step is multiplied by StepMultiplier and Offset *
/// Step is added. Also checks if all sub-expressions are analyzable w.r.t.
/// uniformity.
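///
/// For example (illustrative only): with VF = 4, the AddRec {Start,+,Step}
/// for lane I is rewritten to {Start + I * Step,+,4 * Step}. If the rewritten
/// expressions for lanes 1..VF-1 all equal the lane-0 expression, the value
/// is uniform across the vectorized iteration.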
class SCEVAddRecForUniformityRewriter
    : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
  /// Multiplier to be applied to the step of AddRecs in TheLoop.
  unsigned StepMultiplier;

  /// Offset to be added to the AddRecs in TheLoop.
  unsigned Offset;

  /// Loop for which to rewrite AddRecs.
  Loop *TheLoop;

  /// Are any sub-expressions not analyzable w.r.t. uniformity?
  bool CannotAnalyze = false;

  bool canAnalyze() const { return !CannotAnalyze; }
public:
  SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
                                  unsigned Offset, Loop *TheLoop)
      : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
        TheLoop(TheLoop) {}

  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
    assert(Expr->getLoop() == TheLoop &&
           "addrec outside of TheLoop must be invariant and should have been "
           "handled earlier");
    // Build a new AddRec by multiplying the step by StepMultiplier and
    // incrementing the start by Offset * step.
    Type *Ty = Expr->getType();
    const SCEV *Step = Expr->getStepRecurrence(SE);
    if (!SE.isLoopInvariant(Step, TheLoop)) {
      CannotAnalyze = true;
      return Expr;
    }
    const SCEV *NewStep =
        SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
    const SCEV *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
    const SCEV *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
    return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
  }

  const SCEV *visit(const SCEV *S) {
    if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop))
      return S;
    return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
  }

  const SCEV *visitUnknown(const SCEVUnknown *S) {
    if (SE.isLoopInvariant(S, TheLoop))
      return S;
    // The value could vary across iterations.
    CannotAnalyze = true;
    return S;
  }

  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
    // Could not analyze the expression.
    CannotAnalyze = true;
    return S;
  }

  static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
                             unsigned StepMultiplier, unsigned Offset,
                             Loop *TheLoop) {
    /// Bail out if the expression does not contain an UDiv expression.
    /// Uniform values which are not loop invariant require operations to strip
    /// out the lowest bits. For now just look for UDivs and use it to avoid
    /// re-writing UDIV-free expressions for other lanes to limit compile time.
    if (!SCEVExprContains(S,
                          [](const SCEV *S) { return isa<SCEVUDivExpr>(S); }))
      return SE.getCouldNotCompute();

    SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
                                             TheLoop);
    const SCEV *Result = Rewriter.visit(S);

    if (Rewriter.canAnalyze())
      return Result;
    return SE.getCouldNotCompute();
  }
};
} // namespace
bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
  // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
  // never considered uniform.
  auto *SE = PSE.getSE();
  if (!SE->isSCEVable(V->getType()))
    return false;
  const SCEV *S = SE->getSCEV(V);

  // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
  // lane 0 matches the expressions for all other lanes.
  unsigned FixedVF = VF.getKnownMinValue();
  const SCEV *FirstLaneExpr =
      SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
  if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
    return false;

  // Make sure the expressions for lanes FixedVF-1..1 match the expression for
  // lane 0. We check lanes in reverse order for compile-time, as frequently
  // checking the last lane is sufficient to rule out uniformity.
  return all_of(reverse(seq<unsigned>(1, FixedVF)), [&](unsigned I) {
    const SCEV *IthLaneExpr =
        SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, I, TheLoop);
    return FirstLaneExpr == IthLaneExpr;
  });
}
bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
                                               ElementCount VF) const {
  Value *Ptr = getLoadStorePointerOperand(&I);
  if (!Ptr)
    return false;
  // Note: There's nothing inherent which prevents predicated loads and
  // stores from being uniform. The current lowering simply doesn't handle
  // it; in particular, the cost model distinguishes scatter/gather from
  // scalar w/predication, and we currently rely on the scalar path.
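  // As an illustrative example: an unconditional load of a loop-invariant
  // address inside the loop counts as a uniform memory op here, while the
  // same load placed under a condition does not, and is instead handled by
  // the predicated (gather/scatter or scalarized) paths.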
  return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
}
bool LoopVectorizationLegality::canVectorizeOuterLoop() {
  assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
  // Store the result and return it at the end instead of exiting early, in case
  // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  for (BasicBlock *BB : TheLoop->blocks()) {
    // Check whether the BB terminator is a BranchInst. Any other terminator is
    // not supported yet.
    auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
    if (!Br) {
      reportVectorizationFailure("Unsupported basic block terminator",
                                 "loop control flow is not understood by vectorizer",
                                 "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

    // Check whether the BranchInst is a supported one. Only unconditional
    // branches, conditional branches with an outer loop invariant condition or
    // backedges are supported.
    // FIXME: We skip these checks when VPlan predication is enabled as we
    // want to allow divergent branches. This whole check will be removed
    // once VPlan predication is on by default.
    if (Br && Br->isConditional() &&
        !TheLoop->isLoopInvariant(Br->getCondition()) &&
        !LI->isLoopHeader(Br->getSuccessor(0)) &&
        !LI->isLoopHeader(Br->getSuccessor(1))) {
      reportVectorizationFailure("Unsupported conditional branch",
                                 "loop control flow is not understood by vectorizer",
                                 "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }
  }

  // Check whether inner loops are uniform. At this point, we only support
  // simple outer loops scenarios with uniform nested loops.
  if (!isUniformLoopNest(TheLoop /*loop nest*/,
                         TheLoop /*context outer loop*/)) {
    reportVectorizationFailure("Outer loop contains divergent loops",
                               "loop control flow is not understood by vectorizer",
                               "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check whether we are able to set up outer loop induction.
  if (!setupOuterLoopInductions()) {
    reportVectorizationFailure("Unsupported outer loop Phi(s)",
                               "Unsupported outer loop Phi(s)",
                               "UnsupportedPhi", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}
void LoopVectorizationLegality::addInductionPhi(
    PHINode *Phi, const InductionDescriptor &ID,
    SmallPtrSetImpl<Value *> &AllowedExit) {
  Inductions[Phi] = ID;

  // In case this induction also comes with casts that we know we can ignore
  // in the vectorized loop body, record them here. All casts could be recorded
  // here for ignoring, but suffices to record only the first (as it is the
  // only one that may be used outside the cast sequence).
  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (!Casts.empty())
    InductionCastsToIgnore.insert(*Casts.begin());

  Type *PhiTy = Phi->getType();
  const DataLayout &DL = Phi->getDataLayout();

  // Get the widest type.
  if (!PhiTy->isFloatingPointTy()) {
    if (!WidestIndTy)
      WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
    else
      WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
  }

  // Int inductions are special because we only allow one IV.
  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
      ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
      isa<Constant>(ID.getStartValue()) &&
      cast<Constant>(ID.getStartValue())->isNullValue()) {

    // Use the phi node with the widest type as induction. Use the last
    // one if there are multiple (no good reason for doing this other
    // than it is expedient). We've checked that it begins at zero and
    // steps by one, so this is a canonical induction variable.
    if (!PrimaryInduction || PhiTy == WidestIndTy)
      PrimaryInduction = Phi;
  }

  // Both the PHI node itself, and the "post-increment" value feeding
  // back into the PHI node may have external users.
  // We can allow those uses, except if the SCEVs we have for them rely
  // on predicates that only hold within the loop, since allowing the exit
  // currently means re-using this SCEV outside the loop (see PR33706 for more
  // details).
  if (PSE.getPredicate().isAlwaysTrue()) {
    AllowedExit.insert(Phi);
    AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
  }

  LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
}
bool LoopVectorizationLegality::setupOuterLoopInductions() {
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if a given Phi is a supported induction.
  auto IsSupportedPhi = [&](PHINode &Phi) -> bool {
    InductionDescriptor ID;
    if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
        ID.getKind() == InductionDescriptor::IK_IntInduction) {
      addInductionPhi(&Phi, ID, AllowedExit);
      return true;
    }
    // Bail out for any Phi in the outer loop header that is not a supported
    // induction.
    LLVM_DEBUG(
        dbgs() << "LV: Found unsupported PHI for outer loop vectorization.\n");
    return false;
  };

  return llvm::all_of(Header->phis(), IsSupportedPhi);
}
/// Checks if a function is scalarizable according to the TLI, in
/// the sense that it should be vectorized and then expanded in
/// multiple scalar calls. This is represented in the
/// TLI via mappings that do not specify a vector name, as in the
/// following example:
///
///    const VecDesc VecIntrinsics[] = {
///      {"llvm.phx.abs.i32", "", 4}
///    };
static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
  const StringRef ScalarName = CI.getCalledFunction()->getName();
  bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
  // Check that all known VFs are not associated to a vector
  // function, i.e. the vector name is empty.
  if (Scalarize) {
    ElementCount WidestFixedVF, WidestScalableVF;
    TLI.getWidestVF(ScalarName, WidestFixedVF, WidestScalableVF);
    for (ElementCount VF = ElementCount::getFixed(2);
         ElementCount::isKnownLE(VF, WidestFixedVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    for (ElementCount VF = ElementCount::getScalable(1);
         ElementCount::isKnownLE(VF, WidestScalableVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    assert((WidestScalableVF.isZero() || !Scalarize) &&
           "Caller may decide to scalarize a variant using a scalable VF");
  }
  return Scalarize;
}
bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *Header = TheLoop->getHeader();

  // For each block in the loop.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for hazards.
    for (Instruction &I : *BB) {
      if (auto *Phi = dyn_cast<PHINode>(&I)) {
        Type *PhiTy = Phi->getType();
        // Check that this PHI type is allowed.
        if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
            !PhiTy->isPointerTy()) {
          reportVectorizationFailure("Found a non-int non-pointer PHI",
                                     "loop control flow is not understood by vectorizer",
                                     "CFGNotUnderstood", ORE, TheLoop);
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to select during if-conversion. No need to check if
        // the PHIs in this block are induction or reduction variables.
        if (BB != Header) {
          // Non-header phi nodes that have outside uses can be vectorized. Add
          // them to the list of allowed exits.
          // Unsafe cyclic dependencies with header phis are identified during
          // legalization for reduction, induction and fixed order
          // recurrences.
          AllowedExit.insert(&I);
          continue;
        }

        // We only allow if-converted PHIs with exactly two incoming values.
        if (Phi->getNumIncomingValues() != 2) {
          reportVectorizationFailure("Found an invalid PHI",
                                     "loop control flow is not understood by vectorizer",
                                     "CFGNotUnderstood", ORE, TheLoop, Phi);
          return false;
        }

        RecurrenceDescriptor RedDes;
        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
                                                 DT, PSE.getSE())) {
          Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
          AllowedExit.insert(RedDes.getLoopExitInstr());
          Reductions[Phi] = RedDes;
          continue;
        }

        // We prevent matching non-constant strided pointer IVs to preserve
        // historical vectorizer behavior after a generalization of the
        // IVDescriptor code. The intent is to remove this check, but we
        // have to fix issues around code quality for such loops first.
        auto IsDisallowedStridedPointerInduction =
            [](const InductionDescriptor &ID) {
              if (AllowStridedPointerIVs)
                return false;
              return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
                     ID.getConstIntStepValue() == nullptr;
            };

        // TODO: Instead of recording the AllowedExit, it would be good to
        // record the complementary set: NotAllowedExit. These include (but may
        // not be limited to):
        // 1. Reduction phis as they represent the one-before-last value, which
        // is not available when vectorized
        // 2. Induction phis and increment when SCEV predicates cannot be used
        // outside the loop - see addInductionPhi
        // 3. Non-Phis with outside uses when SCEV predicates cannot be used
        // outside the loop - see call to hasOutsideLoopUser in the non-phi
        // handling below
        // 4. FixedOrderRecurrence phis that can possibly be handled by
        // extraction.
        // By recording these, we can then reason about ways to vectorize each
        // of these NotAllowedExit.
        InductionDescriptor ID;
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
            !IsDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          Requirements->addExactFPMathInst(ID.getExactFPMathInst());
          continue;
        }

        if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
          AllowedExit.insert(Phi);
          FixedOrderRecurrences.insert(Phi);
          continue;
        }

        // As a last resort, coerce the PHI to an AddRec expression
        // and re-try classifying it as an induction PHI.
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
            !IsDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          continue;
        }

        reportVectorizationFailure("Found an unidentified PHI",
                                   "value that could not be identified as "
                                   "reduction is used outside the loop",
                                   "NonReductionValueUsedOutsideLoop", ORE,
                                   TheLoop, Phi);
        return false;
      } // end of PHI handling
      // We handle calls that:
      //   * Are debug info intrinsics.
      //   * Have a mapping to an IR intrinsic.
      //   * Have a vector version available.
      auto *CI = dyn_cast<CallInst>(&I);

      if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
          !isa<DbgInfoIntrinsic>(CI) &&
          !(CI->getCalledFunction() && TLI &&
            (!VFDatabase::getMappings(*CI).empty() ||
             isTLIScalarize(*TLI, *CI)))) {
        // If the call is a recognized math library call, it is likely that
        // we can vectorize it given loosened floating-point constraints.
        LibFunc Func;
        bool IsMathLibCall =
            TLI && CI->getCalledFunction() &&
            CI->getType()->isFloatingPointTy() &&
            TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
            TLI->hasOptimizedCodeGen(Func);

        if (IsMathLibCall) {
          // TODO: Ideally, we should not use clang-specific language here,
          // but it's hard to provide meaningful yet generic advice.
          // Also, should this be guarded by allowExtraAnalysis() and/or be part
          // of the returned info from isFunctionVectorizable()?
          reportVectorizationFailure(
              "Found a non-intrinsic callsite",
              "library call cannot be vectorized. "
              "Try compiling with -fno-math-errno, -ffast-math, "
              "or similar flags",
              "CantVectorizeLibcall", ORE, TheLoop, CI);
        } else {
          reportVectorizationFailure("Found a non-intrinsic callsite",
                                     "call instruction cannot be vectorized",
                                     "CantVectorizeLibcall", ORE, TheLoop, CI);
        }
        return false;
      }

      // Some intrinsics have scalar arguments and should be the same in order
      // for them to be vectorized (i.e. loop invariant).
      if (CI) {
        auto *SE = PSE.getSE();
        Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
        for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
          if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx)) {
            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)),
                                     TheLoop)) {
              reportVectorizationFailure("Found unvectorizable intrinsic",
                                         "intrinsic instruction cannot be vectorized",
                                         "CantVectorizeIntrinsic", ORE,
                                         TheLoop, CI);
              return false;
            }
          }
      }

      // If we found a vectorized variant of a function, note that so LV can
      // make better decisions about maximum VF.
      if (CI && !VFDatabase::getMappings(*CI).empty())
        VecCallVariantsFound = true;

      // Check that the instruction return type is vectorizable.
      // We can't vectorize casts from vector type to scalar type.
      // Also, we can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(I.getType()) &&
           !I.getType()->isVoidTy()) ||
          (isa<CastInst>(I) &&
           !VectorType::isValidElementType(I.getOperand(0)->getType())) ||
          isa<ExtractElementInst>(I)) {
        reportVectorizationFailure("Found unvectorizable type",
                                   "instruction return type cannot be vectorized",
                                   "CantVectorizeInstructionReturnType", ORE,
                                   TheLoop, &I);
        return false;
      }
      // Check that the stored type is vectorizable.
      if (auto *ST = dyn_cast<StoreInst>(&I)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T)) {
          reportVectorizationFailure("Store instruction cannot be vectorized",
                                     "store instruction cannot be vectorized",
                                     "CantVectorizeStore", ORE, TheLoop, ST);
          return false;
        }

        // For nontemporal stores, check that a nontemporal vector version is
        // supported on the target.
        if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
          // Arbitrarily try a vector of 2 elements.
          auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of stored type");
          if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
            reportVectorizationFailure(
                "nontemporal store instruction cannot be vectorized",
                "nontemporal store instruction cannot be vectorized",
                "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
            return false;
          }
        }

      } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
        if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
          // For nontemporal loads, check that a nontemporal vector version is
          // supported on the target (arbitrarily try a vector of 2 elements).
          auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of load type");
          if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
            reportVectorizationFailure(
                "nontemporal load instruction cannot be vectorized",
                "nontemporal load instruction cannot be vectorized",
                "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
            return false;
          }
        }

        // FP instructions can allow unsafe algebra, thus vectorizable by
        // non-IEEE-754 compliant SIMD units.
        // This applies to floating-point math operations and calls, not memory
        // operations, shuffles, or casts, as they don't change precision or
        // semantics.
      } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
                 !I.hasAllowReassoc()) {
        LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
        Hints->setPotentiallyUnsafe();
      }
      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
        // We can safely vectorize loops where instructions within the loop are
        // used outside the loop only if the SCEV predicates within the loop
        // are the same as outside the loop. Allowing the exit means reusing
        // the SCEV outside the loop.
        if (PSE.getPredicate().isAlwaysTrue()) {
          AllowedExit.insert(&I);
          continue;
        }
        reportVectorizationFailure("Value cannot be used outside the loop",
                                   "value cannot be used outside the loop",
                                   "ValueUsedOutsideLoop", ORE, TheLoop, &I);
        return false;
      }
    } // next instr.
  }

  if (!PrimaryInduction) {
    if (Inductions.empty()) {
      reportVectorizationFailure("Did not find one integer induction var",
                                 "loop induction variable could not be identified",
                                 "NoInductionVariable", ORE, TheLoop);
      return false;
    }
    if (!WidestIndTy) {
      reportVectorizationFailure("Did not find one integer induction var",
                                 "integer loop induction variable could not be identified",
                                 "NoIntegerInductionVariable", ORE, TheLoop);
      return false;
    }
    LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
  }

  // Now we know the widest induction type, check if our found induction
  // is the same size. If it's not, unset it here and InnerLoopVectorizer
  // will create another.
  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
    PrimaryInduction = nullptr;

  return true;
}
/// Find histogram operations that match high-level code in loops:
/// \code
/// buckets[indices[i]]+=step;
/// \endcode
///
/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
/// array the computed histogram. It uses a BinOp to sum all counts, storing
/// them using a loop-variant index Load from the 'indices' input array.
///
/// On successful matches it updates the STATISTIC 'HistogramsDetected',
/// regardless of hardware support. When there is support, it additionally
/// stores the BinOp/Load pairs in \p HistogramCounts, as well as the pointers
/// used to update the histogram in \p HistogramPtrs.
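///
/// A sketch of the IR shape being matched (value names are illustrative only):
/// \code
///   %idx = load i64, ptr %indices.gep
///   %gep = getelementptr inbounds i32, ptr %buckets, i64 %idx
///   %old = load i32, ptr %gep
///   %inc = add i32 %old, %step      ; %step must be loop invariant
///   store i32 %inc, ptr %gep
/// \endcode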
static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
                          const PredicatedScalarEvolution &PSE,
                          SmallVectorImpl<HistogramInfo> &Histograms) {

  // Store value must come from a Binary Operation.
  Instruction *HPtrInstr = nullptr;
  BinaryOperator *HBinOp = nullptr;
  if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr))))
    return false;

  // BinOp must be an Add or a Sub modifying the bucket value by a
  // loop invariant amount.
  // FIXME: We assume the loop invariant term is on the RHS.
  //        Fine for an immediate/constant, but maybe not a generic value?
  Value *HIncVal = nullptr;
  if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
      !match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))))
    return false;

  // Make sure the increment value is loop invariant.
  if (!TheLoop->isLoopInvariant(HIncVal))
    return false;

  // The address to store is calculated through a GEP Instruction.
  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(HPtrInstr);
  if (!GEP)
    return false;

  // Restrict address calculation to constant indices except for the last term.
  Value *HIdx = nullptr;
  for (Value *Index : GEP->indices()) {
    if (HIdx)
      return false;
    if (!isa<ConstantInt>(Index))
      HIdx = Index;
  }

  if (!HIdx)
    return false;

  // Check that the index is calculated by loading from another array. Ignore
  // any extensions.
  // FIXME: Support indices from other sources than a linear load from memory?
  //        We're currently trying to match an operation looping over an array
  //        of indices, but there could be additional levels of indirection
  //        in place, or possibly some additional calculation to form the index
  //        from the loaded data.
  Value *VPtrVal;
  if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Load(m_Value(VPtrVal)))))
    return false;

  // Make sure the index address varies in this loop, not an outer loop.
  const auto *AR = dyn_cast<SCEVAddRecExpr>(PSE.getSE()->getSCEV(VPtrVal));
  if (!AR || AR->getLoop() != TheLoop)
    return false;

  // Ensure we'll have the same mask by checking that all parts of the histogram
  // (gather load, update, scatter store) are in the same block.
  LoadInst *IndexedLoad = cast<LoadInst>(HBinOp->getOperand(0));
  BasicBlock *LdBB = IndexedLoad->getParent();
  if (LdBB != HBinOp->getParent() || LdBB != HSt->getParent())
    return false;

  LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");

  // Store the operations that make up the histogram.
  Histograms.emplace_back(IndexedLoad, HBinOp, HSt);
  return true;
}
bool LoopVectorizationLegality::canVectorizeIndirectUnsafeDependences() {
  // For now, we only support an IndirectUnsafe dependency that calculates
  // a histogram.
  if (!EnableHistogramVectorization)
    return false;

  // Find a single IndirectUnsafe dependency.
  const MemoryDepChecker::Dependence *IUDep = nullptr;
  const MemoryDepChecker &DepChecker = LAI->getDepChecker();
  const auto *Deps = DepChecker.getDependences();
  // If there were too many dependences, LAA abandons recording them. We can't
  // proceed safely if we don't know what the dependences are.
  if (!Deps)
    return false;

  for (const MemoryDepChecker::Dependence &Dep : *Deps) {
    // Ignore dependencies that are either known to be safe or can be
    // checked at runtime.
    if (MemoryDepChecker::Dependence::isSafeForVectorization(Dep.Type) !=
        MemoryDepChecker::VectorizationSafetyStatus::Unsafe)
      continue;

    // We're only interested in IndirectUnsafe dependencies here, where the
    // address might come from a load from memory. We also only want to handle
    // one such dependency, at least for now.
    if (Dep.Type != MemoryDepChecker::Dependence::IndirectUnsafe || IUDep)
      return false;

    IUDep = &Dep;
  }
  if (!IUDep)
    return false;

  // For now only normal loads and stores are supported.
  LoadInst *LI = dyn_cast<LoadInst>(IUDep->getSource(DepChecker));
  StoreInst *SI = dyn_cast<StoreInst>(IUDep->getDestination(DepChecker));
  if (!LI || !SI)
    return false;

  LLVM_DEBUG(dbgs() << "LV: Checking for a histogram on: " << *SI << "\n");
  return findHistogram(LI, SI, TheLoop, LAI->getPSE(), Histograms);
}
bool LoopVectorizationLegality::canVectorizeMemory() {
  LAI = &LAIs.getInfo(*TheLoop);
  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
  if (LAR) {
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
                                        "loop not vectorized: ", *LAR);
    });
  }

  if (!LAI->canVectorizeMemory())
    return canVectorizeIndirectUnsafeDependences();

  if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
    reportVectorizationFailure("We don't allow storing to uniform addresses",
                               "write to a loop invariant address could not "
                               "be vectorized",
                               "CantVectorizeStoreToLoopInvariantAddress", ORE,
                               TheLoop);
    return false;
  }

  // We can vectorize stores to invariant address when final reduction value is
  // guaranteed to be stored at the end of the loop. Also, if decision to
  // vectorize loop is made, runtime checks are added so as to make sure that
  // invariant address won't alias with any other objects.
  if (!LAI->getStoresToInvariantAddresses().empty()) {
    // For each invariant address, check if last stored value is unconditional
    // and the address is not calculated inside the loop.
    for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
      if (!isInvariantStoreOfReduction(SI))
        continue;

      if (blockNeedsPredication(SI->getParent())) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write of conditional recurring variant value to a loop "
            "invariant address could not be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }

      // Invariant address should be defined outside of loop. LICM pass usually
      // makes sure it happens, but in rare cases it does not, we do not want
      // to overcomplicate vectorization to support this case.
      if (Instruction *Ptr = dyn_cast<Instruction>(SI->getPointerOperand())) {
        if (TheLoop->contains(Ptr)) {
          reportVectorizationFailure(
              "Invariant address is calculated inside the loop",
              "write to a loop invariant address could not "
              "be vectorized",
              "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
          return false;
        }
      }
    }

    if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
      // For each invariant address, check its last stored value is the result
      // of one of our reductions.
      //
      // We do not check if dependence with loads exists because that is already
      // checked via hasLoadStoreDependenceInvolvingLoopInvariantAddress.
      ScalarEvolution *SE = PSE.getSE();
      SmallVector<StoreInst *, 4> UnhandledStores;
      for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
        if (isInvariantStoreOfReduction(SI)) {
          // Earlier stores to this address are effectively deadcode.
          // With opaque pointers it is possible for one pointer to be used with
          // different sizes of stored values:
          //    store i32 0, ptr %x
          //    store i8 0, ptr %x
          // The latest store doesn't completely overwrite the first one in the
          // example. That is why we have to make sure that types of stored
          // values are the same.
          // TODO: Check that bitwidth of unhandled store is smaller than the
          // one that overwrites it and add a test.
          erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
            return storeToSameAddress(SE, SI, I) &&
                   I->getValueOperand()->getType() ==
                       SI->getValueOperand()->getType();
          });
          continue;
        }
        UnhandledStores.push_back(SI);
      }

      bool IsOK = UnhandledStores.empty();
      // TODO: we should also validate against InvariantMemSets.
      if (!IsOK) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write to a loop invariant address could not "
            "be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }
    }
  }

  PSE.addPredicate(LAI->getPSE().getPredicate());
  return true;
}
bool LoopVectorizationLegality::canVectorizeFPMath(
    bool EnableStrictReductions) {

  // First check if there is any ExactFP math or if we allow reassociations
  if (!Requirements->getExactFPInst() || Hints->allowReordering())
    return true;

  // If the above is false, we have ExactFPMath & do not allow reordering.
  // If the EnableStrictReductions flag is set, first check if we have any
  // Exact FP induction vars, which we cannot vectorize.
  if (!EnableStrictReductions ||
      any_of(getInductionVars(), [&](auto &Induction) -> bool {
        InductionDescriptor IndDesc = Induction.second;
        return IndDesc.getExactFPMathInst();
      }))
    return false;

  // We can now only vectorize if all reductions with Exact FP math also
  // have the isOrdered flag set, which indicates that we can move the
  // reduction operations in-loop.
  return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
  }));
}
*SI
) {
1313 return any_of(getReductionVars(), [&](auto &Reduction
) -> bool {
1314 const RecurrenceDescriptor
&RdxDesc
= Reduction
.second
;
1315 return RdxDesc
.IntermediateStore
== SI
;
1319 bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value
*V
) {
1320 return any_of(getReductionVars(), [&](auto &Reduction
) -> bool {
1321 const RecurrenceDescriptor
&RdxDesc
= Reduction
.second
;
1322 if (!RdxDesc
.IntermediateStore
)
1325 ScalarEvolution
*SE
= PSE
.getSE();
1326 Value
*InvariantAddress
= RdxDesc
.IntermediateStore
->getPointerOperand();
1327 return V
== InvariantAddress
||
1328 SE
->getSCEV(V
) == SE
->getSCEV(InvariantAddress
);
bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
  Value *In0 = const_cast<Value *>(V);
  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  if (!PN)
    return false;

  return Inductions.count(PN);
}

const InductionDescriptor *
LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
  if (!isInductionPhi(Phi))
    return nullptr;
  auto &ID = getInductionVars().find(Phi)->second;
  if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
      ID.getKind() == InductionDescriptor::IK_FpInduction)
    return &ID;
  return nullptr;
}

const InductionDescriptor *
LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const {
  if (!isInductionPhi(Phi))
    return nullptr;
  auto &ID = getInductionVars().find(Phi)->second;
  if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
    return &ID;
  return nullptr;
}

bool LoopVectorizationLegality::isCastedInductionVariable(
    const Value *V) const {
  auto *Inst = dyn_cast<Instruction>(V);
  return (Inst && InductionCastsToIgnore.count(Inst));
}

bool LoopVectorizationLegality::isInductionVariable(const Value *V) const {
  return isInductionPhi(V) || isCastedInductionVariable(V);
}

bool LoopVectorizationLegality::isFixedOrderRecurrence(
    const PHINode *Phi) const {
  return FixedOrderRecurrences.count(Phi);
}
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
  // When vectorizing early exits, create predicates for the latch block only.
  // The early exiting block must be a direct predecessor of the latch at the
  // moment.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  if (hasUncountableEarlyExit()) {
    assert(
        is_contained(predecessors(Latch), getUncountableEarlyExitingBlock()) &&
        "Uncountable exiting block must be a direct predecessor of latch");
    return BB == Latch;
  }
  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
bool LoopVectorizationLegality::blockCanBePredicated(
    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
    SmallPtrSetImpl<const Instruction *> &MaskedOp) const {
  for (Instruction &I : *BB) {
    // We can predicate blocks with calls to assume, as long as we drop them in
    // case we flatten the CFG via predication.
    if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
      MaskedOp.insert(&I);
      continue;
    }

    // Do not let llvm.experimental.noalias.scope.decl block the vectorization.
    // TODO: there might be cases that it should block the vectorization. Let's
    // ignore those for now.
    if (isa<NoAliasScopeDeclInst>(&I))
      continue;

    // We can allow masked calls if there's at least one vector variant, even
    // if we end up scalarizing due to the cost model calculations.
    // TODO: Allow other calls if they have appropriate attributes... readonly
    // and argmemonly?
    if (CallInst *CI = dyn_cast<CallInst>(&I))
      if (VFDatabase::hasMaskedVariant(*CI)) {
        MaskedOp.insert(CI);
        continue;
      }

    // Loads are handled via masking (or speculated if safe to do so.)
    if (auto *LI = dyn_cast<LoadInst>(&I)) {
      if (!SafePtrs.count(LI->getPointerOperand()))
        MaskedOp.insert(LI);
      continue;
    }

    // Predicated store requires some form of masking:
    // 1) masked store HW instruction,
    // 2) emulation via load-blend-store (only if safe and legal to do so,
    //    be aware of the race conditions), or
    // 3) element-by-element predicate check and scalar store.
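    // As an illustrative example, a guarded store such as
    //   if (c) A[i] = x;
    // conceptually becomes a store of x to &A[i] under mask c once the block
    // is flattened; which of the three forms above is actually used is
    // decided later based on the cost model and target support.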
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      MaskedOp.insert(SI);
      continue;
    }

    if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
      return false;
  }

  return true;
}
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion) {
    reportVectorizationFailure("If-conversion is disabled",
                               "if-conversion is disabled",
                               "IfConversionDisabled", ORE, TheLoop);
    return false;
  }

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");

  // A list of pointers which are known to be dereferenceable within scope of
  // the loop body for each iteration of the loop which executes. That is,
  // the memory pointed to can be dereferenced (with the access size implied by
  // the value's type) unconditionally within the loop header without
  // introducing a new fault.
  SmallPtrSet<Value *, 8> SafePointers;

  // Collect safe addresses.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB)) {
      for (Instruction &I : *BB)
        if (auto *Ptr = getLoadStorePointerOperand(&I))
          SafePointers.insert(Ptr);
      continue;
    }

    // For a block which requires predication, an address may be safe to access
    // in the loop w/o predication if we can prove dereferenceability facts
    // sufficient to ensure it'll never fault within the loop. For the moment,
    // we restrict this to loads; stores are more complicated due to
    // concurrency restrictions.
    ScalarEvolution &SE = *PSE.getSE();
    SmallVector<const SCEVPredicate *, 4> Predicates;
    for (Instruction &I : *BB) {
      LoadInst *LI = dyn_cast<LoadInst>(&I);
      // Pass the Predicates pointer to isDereferenceableAndAlignedInLoop so
      // that it will consider loops that need guarding by SCEV checks. The
      // vectoriser will generate these checks if we decide to vectorise.
      if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) &&
          isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT, AC,
                                            &Predicates))
        SafePointers.insert(LI->getPointerOperand());
      Predicates.clear();
    }
  }

  // Collect the blocks that need predication.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // We support only branches and switch statements as terminators inside the
    // loop.
    if (isa<SwitchInst>(BB->getTerminator())) {
      if (TheLoop->isLoopExiting(BB)) {
        reportVectorizationFailure("Loop contains an unsupported switch",
                                   "loop contains an unsupported switch",
                                   "LoopContainsUnsupportedSwitch", ORE,
                                   TheLoop, BB->getTerminator());
        return false;
      }
    } else if (!isa<BranchInst>(BB->getTerminator())) {
      reportVectorizationFailure("Loop contains an unsupported terminator",
                                 "loop contains an unsupported terminator",
                                 "LoopContainsUnsupportedTerminator", ORE,
                                 TheLoop, BB->getTerminator());
      return false;
    }

    // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB) &&
        !blockCanBePredicated(BB, SafePointers, MaskedOp)) {
      reportVectorizationFailure(
          "Control flow cannot be substituted for a select",
          "control flow cannot be substituted for a select", "NoCFGForSelect",
          ORE, TheLoop, BB->getTerminator());
      return false;
    }
  }

  // We can if-convert this loop.
  return true;
}
// Helper function to canVectorizeLoopNestCFG.
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
                                                    bool UseVPlanNativePath) {
  assert((UseVPlanNativePath || Lp->isInnermost()) &&
         "VPlan-native path is not enabled.");

  // TODO: ORE should be improved to show more accurate information when an
  // outer loop can't be vectorized because a nested loop is not understood or
  // legal. Something like: "outer_loop_location: loop not vectorized:
  // (inner_loop_location) loop control flow is not understood by vectorizer".

  // Store the result and return it at the end instead of exiting early, in case
  // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  // We must have a loop in canonical form. Loops with indirectbr in them cannot
  // be canonicalized.
  if (!Lp->getLoopPreheader()) {
    reportVectorizationFailure("Loop doesn't have a legal pre-header",
                               "loop control flow is not understood by vectorizer",
                               "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // We must have a single backedge.
  if (Lp->getNumBackEdges() != 1) {
    reportVectorizationFailure("The loop must have a single backedge",
                               "loop control flow is not understood by vectorizer",
                               "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}
bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
    Loop *Lp, bool UseVPlanNativePath) {
  // Store the result and return it at the end instead of exiting early, in case
  // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Recursively check whether the loop control flow of nested loops is
  // understood.
  for (Loop *SubLp : *Lp)
    if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

  return Result;
}
bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
  BasicBlock *LatchBB = TheLoop->getLoopLatch();
  if (!LatchBB) {
    reportVectorizationFailure("Loop does not have a latch",
                               "Cannot vectorize early exit loop",
                               "NoLatchEarlyExit", ORE, TheLoop);
    return false;
  }

  if (Reductions.size() || FixedOrderRecurrences.size()) {
    reportVectorizationFailure(
        "Found reductions or recurrences in early-exit loop",
        "Cannot vectorize early exit loop with reductions or recurrences",
        "RecurrencesInEarlyExitLoop", ORE, TheLoop);
    return false;
  }

  SmallVector<BasicBlock *, 8> ExitingBlocks;
  TheLoop->getExitingBlocks(ExitingBlocks);

  // Keep a record of all the exiting blocks.
  SmallVector<const SCEVPredicate *, 4> Predicates;
  for (BasicBlock *BB : ExitingBlocks) {
    const SCEV *EC =
        PSE.getSE()->getPredicatedExitCount(TheLoop, BB, &Predicates);
    if (isa<SCEVCouldNotCompute>(EC)) {
      UncountableExitingBlocks.push_back(BB);

      SmallVector<BasicBlock *, 2> Succs(successors(BB));
      if (Succs.size() != 2) {
        reportVectorizationFailure(
            "Early exiting block does not have exactly two successors",
            "Incorrect number of successors from early exiting block",
            "EarlyExitTooManySuccessors", ORE, TheLoop);
        return false;
      }

      BasicBlock *ExitBlock;
      if (!TheLoop->contains(Succs[0]))
        ExitBlock = Succs[0];
      else {
        assert(!TheLoop->contains(Succs[1]));
        ExitBlock = Succs[1];
      }

      UncountableExitBlocks.push_back(ExitBlock);
    } else
      CountableExitingBlocks.push_back(BB);
  }
  // We can safely ignore the predicates here because when vectorizing the loop
  // the PredicatedScalarEvolution class will keep track of all predicates
  // for each exiting block anyway. This happens when calling
  // PSE.getSymbolicMaxBackedgeTakenCount() below.
  Predicates.clear();

  // We only support one uncountable early exit.
  if (getUncountableExitingBlocks().size() != 1) {
    reportVectorizationFailure(
        "Loop has too many uncountable exits",
        "Cannot vectorize early exit loop with more than one early exit",
        "TooManyUncountableEarlyExits", ORE, TheLoop);
    return false;
  }

  // The only supported early exit loops so far are ones where the early
  // exiting block is a unique predecessor of the latch block.
  BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
  if (LatchPredBB != getUncountableEarlyExitingBlock()) {
    reportVectorizationFailure("Early exit is not the latch predecessor",
                               "Cannot vectorize early exit loop",
                               "EarlyExitNotLatchPredecessor", ORE, TheLoop);
    return false;
  }

  // The latch block must have a countable exit.
  if (isa<SCEVCouldNotCompute>(
          PSE.getSE()->getPredicatedExitCount(TheLoop, LatchBB, &Predicates))) {
    reportVectorizationFailure(
        "Cannot determine exact exit count for latch block",
        "Cannot vectorize early exit loop",
        "UnknownLatchExitCountEarlyExitLoop", ORE, TheLoop);
    return false;
  }

  assert(llvm::is_contained(CountableExitingBlocks, LatchBB) &&
         "Latch block not found in list of countable exits!");
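
  // The accepted shape is therefore roughly (names illustrative only):
  //
  //   header:                           ; uncountable early exiting block
  //     %v = load ...
  //     %cmp = icmp eq ... %v, %x
  //     br i1 %cmp, label %found, label %latch
  //   latch:                            ; countable exit
  //     %iv.next = add nuw ... %iv, 1
  //     %done = icmp eq ... %iv.next, %n
  //     br i1 %done, label %exit, label %header
  //
  // i.e. the single uncountable exiting block is the unique predecessor of
  // the latch, and the latch itself exits on a computable trip count.
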
  // Check to see if there are instructions that could potentially generate
  // exceptions or have side-effects.
  auto IsSafeOperation = [](Instruction *I) -> bool {
    switch (I->getOpcode()) {
    case Instruction::Load:
    case Instruction::Store:
    case Instruction::PHI:
    case Instruction::Br:
      // These are checked separately.
      return true;
    default:
      return isSafeToSpeculativelyExecute(I);
    }
  };

  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (I.mayWriteToMemory()) {
        // We don't support writes to memory.
        reportVectorizationFailure(
            "Writes to memory unsupported in early exit loops",
            "Cannot vectorize early exit loop with writes to memory",
            "WritesInEarlyExitLoop", ORE, TheLoop);
        return false;
      } else if (!IsSafeOperation(&I)) {
        reportVectorizationFailure("Early exit loop contains operations that "
                                   "cannot be speculatively executed",
                                   "Early exit loop contains operations that "
                                   "cannot be speculatively executed",
                                   "UnsafeOperationsEarlyExitLoop", ORE,
                                   TheLoop, &I);
        return false;
      }
    }

  // The vectoriser cannot handle loads that occur after the early exit block.
  assert(LatchBB->getUniquePredecessor() == getUncountableEarlyExitingBlock() &&
         "Expected latch predecessor to be the early exiting block");

  // TODO: Handle loops that may fault.
  Predicates.clear();
  if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
                                     &Predicates)) {
    reportVectorizationFailure(
        "Loop may fault",
        "Cannot vectorize potentially faulting early exit loop",
        "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
    return false;
  }

  [[maybe_unused]] const SCEV *SymbolicMaxBTC =
      PSE.getSymbolicMaxBackedgeTakenCount();
  // Since we have an exact exit count for the latch and the early exit
  // dominates the latch, then this should guarantee a computed SCEV value.
  assert(!isa<SCEVCouldNotCompute>(SymbolicMaxBTC) &&
         "Failed to get symbolic expression for backedge taken count");
  LLVM_DEBUG(dbgs() << "LV: Found an early exit loop with symbolic max "
                       "backedge taken count: "
                    << *SymbolicMaxBTC << '\n');
  return true;
}
bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
  // Store the result and return it at the end instead of exiting early, in case
  // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
  bool Result = true;

  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  // Check whether the loop-related control flow in the loop nest is expected by
  // vectorizer.
  if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
    if (DoExtraAnalysis) {
      LLVM_DEBUG(dbgs() << "LV: legality check failed: loop nest");
      Result = false;
    } else {
      return false;
    }
  }

  // We need to have a loop header.
  LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
                    << '\n');

  // Specific checks for outer loops. We skip the remaining legal checks at this
  // point because they don't support outer loops.
  if (!TheLoop->isInnermost()) {
    assert(UseVPlanNativePath && "VPlan-native path is not enabled.");

    if (!canVectorizeOuterLoop()) {
      reportVectorizationFailure("Unsupported outer loop",
                                 "unsupported outer loop",
                                 "UnsupportedOuterLoop", ORE, TheLoop);
      // TODO: Implement DoExtraAnalysis when subsequent legal checks support
      // outer loops.
      return false;
    }

    LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
    return Result;
  }

  assert(TheLoop->isInnermost() && "Inner loop expected.");
  // Check if we can if-convert non-single-bb loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  HasUncountableEarlyExit = false;
  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    HasUncountableEarlyExit = true;
    if (!isVectorizableEarlyExitLoop()) {
      UncountableExitingBlocks.clear();
      HasUncountableEarlyExit = false;
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeMemory()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
                    << (LAI->getRuntimePointerChecking()->Need
                            ? " (with a runtime bound check)"
                            : "")
                    << "!\n");
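
  // Each predicate accumulated in PSE (for example an assumption that an
  // induction variable does not wrap, or that a symbolic stride is one) has
  // to be materialized as a runtime check in front of the vector loop, so cap
  // how many of them we are willing to emit.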
  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
    SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;

  if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization not profitable "
                         "due to SCEVThreshold");
    reportVectorizationFailure("Too many SCEV checks needed",
        "Too many SCEV assumptions need to be made and checked at runtime",
        "TooManySCEVRunTimeChecks", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Okay! We've done all the tests. If any have failed, return false. Otherwise
  // we can vectorize, and at this point we don't have any other mem analysis
  // which may limit our maximum vectorization factor, so just return true with
  // no restrictions.
  return Result;
}
bool LoopVectorizationLegality::canFoldTailByMasking() const {

  LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");

  SmallPtrSet<const Value *, 8> ReductionLiveOuts;

  for (const auto &Reduction : getReductionVars())
    ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());

  // TODO: handle non-reduction outside users when tail is folded by masking.
  for (auto *AE : AllowedExit) {
    // Check that all users of allowed exit values are inside the loop or
    // are the live-out of a reduction.
    if (ReductionLiveOuts.count(AE))
      continue;
    for (User *U : AE->users()) {
      Instruction *UI = cast<Instruction>(U);
      if (TheLoop->contains(UI))
        continue;
      LLVM_DEBUG(
          dbgs()
          << "LV: Cannot fold tail by masking, loop has an outside user for "
          << *UI << "\n");
      return false;
    }
  }

  for (const auto &Entry : getInductionVars()) {
    PHINode *OrigPhi = Entry.first;
    for (User *U : OrigPhi->users()) {
      auto *UI = cast<Instruction>(U);
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop IV has an "
                             "outside user for "
                          << *UI << "\n");
        return false;
      }
    }
  }

  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Check all blocks for predication, including those that ordinarily do not
  // need predication such as the header block.
  SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");

  return true;
}
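
// For illustration: with a trip count of 7 and VF = 4, folding the tail by
// masking runs two masked vector iterations (the second with its last lane
// disabled) instead of one vector iteration plus a scalar remainder loop.
// This is only sound when every block can be predicated and nothing but
// reduction live-outs are used outside the loop, which is what the checks
// above establish.
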
void LoopVectorizationLegality::prepareToFoldTailByMasking() {
  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Mark all blocks for predication, including those that ordinarily do not
  // need predication such as the header block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePointers, MaskedOp);
    assert(R && "Must be able to predicate block when tail-folding.");