[Alignment] fix dubious min function alignment
[llvm-complete.git] / lib / Transforms / Vectorize / LoopVectorizationLegality.cpp
blobabd19391c0cc78c9195117f4d5186bb14785bb36
1 //===- LoopVectorizationLegality.cpp --------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file provides loop vectorization legality analysis. Original code
10 // resided in LoopVectorize.cpp for a long time.
12 // At this point, it is implemented as a utility class, not as an analysis
13 // pass. It should be easy to create an analysis pass around it if there
14 // is a need (but D45420 needs to happen first).
16 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
17 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
18 #include "llvm/Analysis/VectorUtils.h"
19 #include "llvm/IR/IntrinsicInst.h"
21 using namespace llvm;
23 #define LV_NAME "loop-vectorize"
24 #define DEBUG_TYPE LV_NAME
26 extern cl::opt<bool> EnableVPlanPredication;
28 static cl::opt<bool>
29 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
30 cl::desc("Enable if-conversion during vectorization."));
32 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
33 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
34 cl::desc("The maximum allowed number of runtime memory checks with a "
35 "vectorize(enable) pragma."));
37 static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
38 "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
39 cl::desc("The maximum number of SCEV checks allowed."));
41 static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
42 "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
43 cl::desc("The maximum number of SCEV checks allowed with a "
44 "vectorize(enable) pragma"));
46 /// Maximum vectorization interleave count.
47 static const unsigned MaxInterleaveFactor = 16;
49 namespace llvm {
51 bool LoopVectorizeHints::Hint::validate(unsigned Val) {
52 switch (Kind) {
53 case HK_WIDTH:
54 return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
55 case HK_UNROLL:
56 return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
57 case HK_FORCE:
58 return (Val <= 1);
59 case HK_ISVECTORIZED:
60 case HK_PREDICATE:
61 return (Val == 0 || Val == 1);
63 return false;
66 LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
67 bool InterleaveOnlyWhenForced,
68 OptimizationRemarkEmitter &ORE)
69 : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
70 Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),
71 Force("vectorize.enable", FK_Undefined, HK_FORCE),
72 IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
73 Predicate("vectorize.predicate.enable", 0, HK_PREDICATE), TheLoop(L),
74 ORE(ORE) {
75 // Populate values with existing loop metadata.
76 getHintsFromMetadata();
78 // force-vector-interleave overrides DisableInterleaving.
79 if (VectorizerParams::isInterleaveForced())
80 Interleave.Value = VectorizerParams::VectorizationInterleave;
82 if (IsVectorized.Value != 1)
83 // If the vectorization width and interleaving count are both 1 then
84 // consider the loop to have been already vectorized because there's
85 // nothing more that we can do.
86 IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
87 LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs()
88 << "LV: Interleaving disabled by the pass manager\n");
91 void LoopVectorizeHints::setAlreadyVectorized() {
92 LLVMContext &Context = TheLoop->getHeader()->getContext();
94 MDNode *IsVectorizedMD = MDNode::get(
95 Context,
96 {MDString::get(Context, "llvm.loop.isvectorized"),
97 ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
98 MDNode *LoopID = TheLoop->getLoopID();
99 MDNode *NewLoopID =
100 makePostTransformationMetadata(Context, LoopID,
101 {Twine(Prefix(), "vectorize.").str(),
102 Twine(Prefix(), "interleave.").str()},
103 {IsVectorizedMD});
104 TheLoop->setLoopID(NewLoopID);
106 // Update internal cache.
107 IsVectorized.Value = 1;
110 bool LoopVectorizeHints::allowVectorization(
111 Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
112 if (getForce() == LoopVectorizeHints::FK_Disabled) {
113 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
114 emitRemarkWithHints();
115 return false;
118 if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
119 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
120 emitRemarkWithHints();
121 return false;
124 if (getIsVectorized() == 1) {
125 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
126 // FIXME: Add interleave.disable metadata. This will allow
127 // vectorize.disable to be used without disabling the pass and errors
128 // to differentiate between disabled vectorization and a width of 1.
129 ORE.emit([&]() {
130 return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
131 "AllDisabled", L->getStartLoc(),
132 L->getHeader())
133 << "loop not vectorized: vectorization and interleaving are "
134 "explicitly disabled, or the loop has already been "
135 "vectorized";
137 return false;
140 return true;
143 void LoopVectorizeHints::emitRemarkWithHints() const {
144 using namespace ore;
146 ORE.emit([&]() {
147 if (Force.Value == LoopVectorizeHints::FK_Disabled)
148 return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
149 TheLoop->getStartLoc(),
150 TheLoop->getHeader())
151 << "loop not vectorized: vectorization is explicitly disabled";
152 else {
153 OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
154 TheLoop->getStartLoc(), TheLoop->getHeader());
155 R << "loop not vectorized";
156 if (Force.Value == LoopVectorizeHints::FK_Enabled) {
157 R << " (Force=" << NV("Force", true);
158 if (Width.Value != 0)
159 R << ", Vector Width=" << NV("VectorWidth", Width.Value);
160 if (Interleave.Value != 0)
161 R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
162 R << ")";
164 return R;
169 const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
170 if (getWidth() == 1)
171 return LV_NAME;
172 if (getForce() == LoopVectorizeHints::FK_Disabled)
173 return LV_NAME;
174 if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
175 return LV_NAME;
176 return OptimizationRemarkAnalysis::AlwaysPrint;
179 void LoopVectorizeHints::getHintsFromMetadata() {
180 MDNode *LoopID = TheLoop->getLoopID();
181 if (!LoopID)
182 return;
184 // First operand should refer to the loop id itself.
185 assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
186 assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
188 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
189 const MDString *S = nullptr;
190 SmallVector<Metadata *, 4> Args;
192 // The expected hint is either a MDString or a MDNode with the first
193 // operand a MDString.
194 if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
195 if (!MD || MD->getNumOperands() == 0)
196 continue;
197 S = dyn_cast<MDString>(MD->getOperand(0));
198 for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
199 Args.push_back(MD->getOperand(i));
200 } else {
201 S = dyn_cast<MDString>(LoopID->getOperand(i));
202 assert(Args.size() == 0 && "too many arguments for MDString");
205 if (!S)
206 continue;
208 // Check if the hint starts with the loop metadata prefix.
209 StringRef Name = S->getString();
210 if (Args.size() == 1)
211 setHint(Name, Args[0]);
215 void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
216 if (!Name.startswith(Prefix()))
217 return;
218 Name = Name.substr(Prefix().size(), StringRef::npos);
220 const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
221 if (!C)
222 return;
223 unsigned Val = C->getZExtValue();
225 Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate};
226 for (auto H : Hints) {
227 if (Name == H->Name) {
228 if (H->validate(Val))
229 H->Value = Val;
230 else
231 LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
232 break;
237 bool LoopVectorizationRequirements::doesNotMeet(
238 Function *F, Loop *L, const LoopVectorizeHints &Hints) {
239 const char *PassName = Hints.vectorizeAnalysisPassName();
240 bool Failed = false;
241 if (UnsafeAlgebraInst && !Hints.allowReordering()) {
242 ORE.emit([&]() {
243 return OptimizationRemarkAnalysisFPCommute(
244 PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(),
245 UnsafeAlgebraInst->getParent())
246 << "loop not vectorized: cannot prove it is safe to reorder "
247 "floating-point operations";
249 Failed = true;
252 // Test if runtime memcheck thresholds are exceeded.
253 bool PragmaThresholdReached =
254 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
255 bool ThresholdReached =
256 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
257 if ((ThresholdReached && !Hints.allowReordering()) ||
258 PragmaThresholdReached) {
259 ORE.emit([&]() {
260 return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
261 L->getStartLoc(),
262 L->getHeader())
263 << "loop not vectorized: cannot prove it is safe to reorder "
264 "memory operations";
266 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
267 Failed = true;
270 return Failed;
273 // Return true if the inner loop \p Lp is uniform with regard to the outer loop
274 // \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
275 // executing the inner loop will execute the same iterations). This check is
276 // very constrained for now but it will be relaxed in the future. \p Lp is
277 // considered uniform if it meets all the following conditions:
278 // 1) it has a canonical IV (starting from 0 and with stride 1),
279 // 2) its latch terminator is a conditional branch and,
280 // 3) its latch condition is a compare instruction whose operands are the
281 // canonical IV and an OuterLp invariant.
282 // This check doesn't take into account the uniformity of other conditions not
283 // related to the loop latch because they don't affect the loop uniformity.
285 // NOTE: We decided to keep all these checks and its associated documentation
286 // together so that we can easily have a picture of the current supported loop
287 // nests. However, some of the current checks don't depend on \p OuterLp and
288 // would be redundantly executed for each \p Lp if we invoked this function for
289 // different candidate outer loops. This is not the case for now because we
290 // don't currently have the infrastructure to evaluate multiple candidate outer
291 // loops and \p OuterLp will be a fixed parameter while we only support explicit
292 // outer loop vectorization. It's also very likely that these checks go away
293 // before introducing the aforementioned infrastructure. However, if this is not
294 // the case, we should move the \p OuterLp independent checks to a separate
295 // function that is only executed once for each \p Lp.
296 static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
297 assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
299 // If Lp is the outer loop, it's uniform by definition.
300 if (Lp == OuterLp)
301 return true;
302 assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
304 // 1.
305 PHINode *IV = Lp->getCanonicalInductionVariable();
306 if (!IV) {
307 LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
308 return false;
311 // 2.
312 BasicBlock *Latch = Lp->getLoopLatch();
313 auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
314 if (!LatchBr || LatchBr->isUnconditional()) {
315 LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
316 return false;
319 // 3.
320 auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
321 if (!LatchCmp) {
322 LLVM_DEBUG(
323 dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
324 return false;
327 Value *CondOp0 = LatchCmp->getOperand(0);
328 Value *CondOp1 = LatchCmp->getOperand(1);
329 Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
330 if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
331 !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
332 LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
333 return false;
336 return true;
339 // Return true if \p Lp and all its nested loops are uniform with regard to \p
340 // OuterLp.
341 static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
342 if (!isUniformLoop(Lp, OuterLp))
343 return false;
345 // Check if nested loops are uniform.
346 for (Loop *SubLp : *Lp)
347 if (!isUniformLoopNest(SubLp, OuterLp))
348 return false;
350 return true;
353 /// Check whether it is safe to if-convert this phi node.
355 /// Phi nodes with constant expressions that can trap are not safe to if
356 /// convert.
357 static bool canIfConvertPHINodes(BasicBlock *BB) {
358 for (PHINode &Phi : BB->phis()) {
359 for (Value *V : Phi.incoming_values())
360 if (auto *C = dyn_cast<Constant>(V))
361 if (C->canTrap())
362 return false;
364 return true;
367 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
368 if (Ty->isPointerTy())
369 return DL.getIntPtrType(Ty);
371 // It is possible that char's or short's overflow when we ask for the loop's
372 // trip count, work around this by changing the type size.
373 if (Ty->getScalarSizeInBits() < 32)
374 return Type::getInt32Ty(Ty->getContext());
376 return Ty;
379 static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
380 Ty0 = convertPointerToIntegerType(DL, Ty0);
381 Ty1 = convertPointerToIntegerType(DL, Ty1);
382 if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
383 return Ty0;
384 return Ty1;
387 /// Check that the instruction has outside loop users and is not an
388 /// identified reduction variable.
389 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
390 SmallPtrSetImpl<Value *> &AllowedExit) {
391 // Reductions, Inductions and non-header phis are allowed to have exit users. All
392 // other instructions must not have external users.
393 if (!AllowedExit.count(Inst))
394 // Check that all of the users of the loop are inside the BB.
395 for (User *U : Inst->users()) {
396 Instruction *UI = cast<Instruction>(U);
397 // This user may be a reduction exit value.
398 if (!TheLoop->contains(UI)) {
399 LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
400 return true;
403 return false;
406 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
407 const ValueToValueMap &Strides =
408 getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
410 int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
411 if (Stride == 1 || Stride == -1)
412 return Stride;
413 return 0;
416 bool LoopVectorizationLegality::isUniform(Value *V) {
417 return LAI->isUniform(V);
420 bool LoopVectorizationLegality::canVectorizeOuterLoop() {
421 assert(!TheLoop->empty() && "We are not vectorizing an outer loop.");
422 // Store the result and return it at the end instead of exiting early, in case
423 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
424 bool Result = true;
425 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
427 for (BasicBlock *BB : TheLoop->blocks()) {
428 // Check whether the BB terminator is a BranchInst. Any other terminator is
429 // not supported yet.
430 auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
431 if (!Br) {
432 reportVectorizationFailure("Unsupported basic block terminator",
433 "loop control flow is not understood by vectorizer",
434 "CFGNotUnderstood", ORE, TheLoop);
435 if (DoExtraAnalysis)
436 Result = false;
437 else
438 return false;
441 // Check whether the BranchInst is a supported one. Only unconditional
442 // branches, conditional branches with an outer loop invariant condition or
443 // backedges are supported.
444 // FIXME: We skip these checks when VPlan predication is enabled as we
445 // want to allow divergent branches. This whole check will be removed
446 // once VPlan predication is on by default.
447 if (!EnableVPlanPredication && Br && Br->isConditional() &&
448 !TheLoop->isLoopInvariant(Br->getCondition()) &&
449 !LI->isLoopHeader(Br->getSuccessor(0)) &&
450 !LI->isLoopHeader(Br->getSuccessor(1))) {
451 reportVectorizationFailure("Unsupported conditional branch",
452 "loop control flow is not understood by vectorizer",
453 "CFGNotUnderstood", ORE, TheLoop);
454 if (DoExtraAnalysis)
455 Result = false;
456 else
457 return false;
461 // Check whether inner loops are uniform. At this point, we only support
462 // simple outer loops scenarios with uniform nested loops.
463 if (!isUniformLoopNest(TheLoop /*loop nest*/,
464 TheLoop /*context outer loop*/)) {
465 reportVectorizationFailure("Outer loop contains divergent loops",
466 "loop control flow is not understood by vectorizer",
467 "CFGNotUnderstood", ORE, TheLoop);
468 if (DoExtraAnalysis)
469 Result = false;
470 else
471 return false;
474 // Check whether we are able to set up outer loop induction.
475 if (!setupOuterLoopInductions()) {
476 reportVectorizationFailure("Unsupported outer loop Phi(s)",
477 "Unsupported outer loop Phi(s)",
478 "UnsupportedPhi", ORE, TheLoop);
479 if (DoExtraAnalysis)
480 Result = false;
481 else
482 return false;
485 return Result;
488 void LoopVectorizationLegality::addInductionPhi(
489 PHINode *Phi, const InductionDescriptor &ID,
490 SmallPtrSetImpl<Value *> &AllowedExit) {
491 Inductions[Phi] = ID;
493 // In case this induction also comes with casts that we know we can ignore
494 // in the vectorized loop body, record them here. All casts could be recorded
495 // here for ignoring, but suffices to record only the first (as it is the
496 // only one that may bw used outside the cast sequence).
497 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
498 if (!Casts.empty())
499 InductionCastsToIgnore.insert(*Casts.begin());
501 Type *PhiTy = Phi->getType();
502 const DataLayout &DL = Phi->getModule()->getDataLayout();
504 // Get the widest type.
505 if (!PhiTy->isFloatingPointTy()) {
506 if (!WidestIndTy)
507 WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
508 else
509 WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
512 // Int inductions are special because we only allow one IV.
513 if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
514 ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
515 isa<Constant>(ID.getStartValue()) &&
516 cast<Constant>(ID.getStartValue())->isNullValue()) {
518 // Use the phi node with the widest type as induction. Use the last
519 // one if there are multiple (no good reason for doing this other
520 // than it is expedient). We've checked that it begins at zero and
521 // steps by one, so this is a canonical induction variable.
522 if (!PrimaryInduction || PhiTy == WidestIndTy)
523 PrimaryInduction = Phi;
526 // Both the PHI node itself, and the "post-increment" value feeding
527 // back into the PHI node may have external users.
528 // We can allow those uses, except if the SCEVs we have for them rely
529 // on predicates that only hold within the loop, since allowing the exit
530 // currently means re-using this SCEV outside the loop (see PR33706 for more
531 // details).
532 if (PSE.getUnionPredicate().isAlwaysTrue()) {
533 AllowedExit.insert(Phi);
534 AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
537 LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
540 bool LoopVectorizationLegality::setupOuterLoopInductions() {
541 BasicBlock *Header = TheLoop->getHeader();
543 // Returns true if a given Phi is a supported induction.
544 auto isSupportedPhi = [&](PHINode &Phi) -> bool {
545 InductionDescriptor ID;
546 if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
547 ID.getKind() == InductionDescriptor::IK_IntInduction) {
548 addInductionPhi(&Phi, ID, AllowedExit);
549 return true;
550 } else {
551 // Bail out for any Phi in the outer loop header that is not a supported
552 // induction.
553 LLVM_DEBUG(
554 dbgs()
555 << "LV: Found unsupported PHI for outer loop vectorization.\n");
556 return false;
560 if (llvm::all_of(Header->phis(), isSupportedPhi))
561 return true;
562 else
563 return false;
566 bool LoopVectorizationLegality::canVectorizeInstrs() {
567 BasicBlock *Header = TheLoop->getHeader();
569 // Look for the attribute signaling the absence of NaNs.
570 Function &F = *Header->getParent();
571 HasFunNoNaNAttr =
572 F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
574 // For each block in the loop.
575 for (BasicBlock *BB : TheLoop->blocks()) {
576 // Scan the instructions in the block and look for hazards.
577 for (Instruction &I : *BB) {
578 if (auto *Phi = dyn_cast<PHINode>(&I)) {
579 Type *PhiTy = Phi->getType();
580 // Check that this PHI type is allowed.
581 if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
582 !PhiTy->isPointerTy()) {
583 reportVectorizationFailure("Found a non-int non-pointer PHI",
584 "loop control flow is not understood by vectorizer",
585 "CFGNotUnderstood", ORE, TheLoop);
586 return false;
589 // If this PHINode is not in the header block, then we know that we
590 // can convert it to select during if-conversion. No need to check if
591 // the PHIs in this block are induction or reduction variables.
592 if (BB != Header) {
593 // Non-header phi nodes that have outside uses can be vectorized. Add
594 // them to the list of allowed exits.
595 // Unsafe cyclic dependencies with header phis are identified during
596 // legalization for reduction, induction and first order
597 // recurrences.
598 AllowedExit.insert(&I);
599 continue;
602 // We only allow if-converted PHIs with exactly two incoming values.
603 if (Phi->getNumIncomingValues() != 2) {
604 reportVectorizationFailure("Found an invalid PHI",
605 "loop control flow is not understood by vectorizer",
606 "CFGNotUnderstood", ORE, TheLoop, Phi);
607 return false;
610 RecurrenceDescriptor RedDes;
611 if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
612 DT)) {
613 if (RedDes.hasUnsafeAlgebra())
614 Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
615 AllowedExit.insert(RedDes.getLoopExitInstr());
616 Reductions[Phi] = RedDes;
617 continue;
620 // TODO: Instead of recording the AllowedExit, it would be good to record the
621 // complementary set: NotAllowedExit. These include (but may not be
622 // limited to):
623 // 1. Reduction phis as they represent the one-before-last value, which
624 // is not available when vectorized
625 // 2. Induction phis and increment when SCEV predicates cannot be used
626 // outside the loop - see addInductionPhi
627 // 3. Non-Phis with outside uses when SCEV predicates cannot be used
628 // outside the loop - see call to hasOutsideLoopUser in the non-phi
629 // handling below
630 // 4. FirstOrderRecurrence phis that can possibly be handled by
631 // extraction.
632 // By recording these, we can then reason about ways to vectorize each
633 // of these NotAllowedExit.
634 InductionDescriptor ID;
635 if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
636 addInductionPhi(Phi, ID, AllowedExit);
637 if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
638 Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
639 continue;
642 if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
643 SinkAfter, DT)) {
644 FirstOrderRecurrences.insert(Phi);
645 continue;
648 // As a last resort, coerce the PHI to a AddRec expression
649 // and re-try classifying it a an induction PHI.
650 if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
651 addInductionPhi(Phi, ID, AllowedExit);
652 continue;
655 reportVectorizationFailure("Found an unidentified PHI",
656 "value that could not be identified as "
657 "reduction is used outside the loop",
658 "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
659 return false;
660 } // end of PHI handling
662 // We handle calls that:
663 // * Are debug info intrinsics.
664 // * Have a mapping to an IR intrinsic.
665 // * Have a vector version available.
666 auto *CI = dyn_cast<CallInst>(&I);
667 if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
668 !isa<DbgInfoIntrinsic>(CI) &&
669 !(CI->getCalledFunction() && TLI &&
670 TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
671 // If the call is a recognized math libary call, it is likely that
672 // we can vectorize it given loosened floating-point constraints.
673 LibFunc Func;
674 bool IsMathLibCall =
675 TLI && CI->getCalledFunction() &&
676 CI->getType()->isFloatingPointTy() &&
677 TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
678 TLI->hasOptimizedCodeGen(Func);
680 if (IsMathLibCall) {
681 // TODO: Ideally, we should not use clang-specific language here,
682 // but it's hard to provide meaningful yet generic advice.
683 // Also, should this be guarded by allowExtraAnalysis() and/or be part
684 // of the returned info from isFunctionVectorizable()?
685 reportVectorizationFailure("Found a non-intrinsic callsite",
686 "library call cannot be vectorized. "
687 "Try compiling with -fno-math-errno, -ffast-math, "
688 "or similar flags",
689 "CantVectorizeLibcall", ORE, TheLoop, CI);
690 } else {
691 reportVectorizationFailure("Found a non-intrinsic callsite",
692 "call instruction cannot be vectorized",
693 "CantVectorizeLibcall", ORE, TheLoop, CI);
695 return false;
698 // Some intrinsics have scalar arguments and should be same in order for
699 // them to be vectorized (i.e. loop invariant).
700 if (CI) {
701 auto *SE = PSE.getSE();
702 Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
703 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
704 if (hasVectorInstrinsicScalarOpd(IntrinID, i)) {
705 if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
706 reportVectorizationFailure("Found unvectorizable intrinsic",
707 "intrinsic instruction cannot be vectorized",
708 "CantVectorizeIntrinsic", ORE, TheLoop, CI);
709 return false;
714 // Check that the instruction return type is vectorizable.
715 // Also, we can't vectorize extractelement instructions.
716 if ((!VectorType::isValidElementType(I.getType()) &&
717 !I.getType()->isVoidTy()) ||
718 isa<ExtractElementInst>(I)) {
719 reportVectorizationFailure("Found unvectorizable type",
720 "instruction return type cannot be vectorized",
721 "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
722 return false;
725 // Check that the stored type is vectorizable.
726 if (auto *ST = dyn_cast<StoreInst>(&I)) {
727 Type *T = ST->getValueOperand()->getType();
728 if (!VectorType::isValidElementType(T)) {
729 reportVectorizationFailure("Store instruction cannot be vectorized",
730 "store instruction cannot be vectorized",
731 "CantVectorizeStore", ORE, TheLoop, ST);
732 return false;
735 // For nontemporal stores, check that a nontemporal vector version is
736 // supported on the target.
737 if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
738 // Arbitrarily try a vector of 2 elements.
739 Type *VecTy = VectorType::get(T, /*NumElements=*/2);
740 assert(VecTy && "did not find vectorized version of stored type");
741 unsigned Alignment = getLoadStoreAlignment(ST);
742 assert(Alignment && "Alignment should be set");
743 if (!TTI->isLegalNTStore(VecTy, llvm::Align(Alignment))) {
744 reportVectorizationFailure(
745 "nontemporal store instruction cannot be vectorized",
746 "nontemporal store instruction cannot be vectorized",
747 "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
748 return false;
752 } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
753 if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
754 // For nontemporal loads, check that a nontemporal vector version is
755 // supported on the target (arbitrarily try a vector of 2 elements).
756 Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2);
757 assert(VecTy && "did not find vectorized version of load type");
758 unsigned Alignment = getLoadStoreAlignment(LD);
759 assert(Alignment && "Alignment should be set");
760 if (!TTI->isLegalNTLoad(VecTy, llvm::Align(Alignment))) {
761 reportVectorizationFailure(
762 "nontemporal load instruction cannot be vectorized",
763 "nontemporal load instruction cannot be vectorized",
764 "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
765 return false;
769 // FP instructions can allow unsafe algebra, thus vectorizable by
770 // non-IEEE-754 compliant SIMD units.
771 // This applies to floating-point math operations and calls, not memory
772 // operations, shuffles, or casts, as they don't change precision or
773 // semantics.
774 } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
775 !I.isFast()) {
776 LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
777 Hints->setPotentiallyUnsafe();
780 // Reduction instructions are allowed to have exit users.
781 // All other instructions must not have external users.
782 if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
783 // We can safely vectorize loops where instructions within the loop are
784 // used outside the loop only if the SCEV predicates within the loop is
785 // same as outside the loop. Allowing the exit means reusing the SCEV
786 // outside the loop.
787 if (PSE.getUnionPredicate().isAlwaysTrue()) {
788 AllowedExit.insert(&I);
789 continue;
791 reportVectorizationFailure("Value cannot be used outside the loop",
792 "value cannot be used outside the loop",
793 "ValueUsedOutsideLoop", ORE, TheLoop, &I);
794 return false;
796 } // next instr.
799 if (!PrimaryInduction) {
800 if (Inductions.empty()) {
801 reportVectorizationFailure("Did not find one integer induction var",
802 "loop induction variable could not be identified",
803 "NoInductionVariable", ORE, TheLoop);
804 return false;
805 } else if (!WidestIndTy) {
806 reportVectorizationFailure("Did not find one integer induction var",
807 "integer loop induction variable could not be identified",
808 "NoIntegerInductionVariable", ORE, TheLoop);
809 return false;
810 } else {
811 LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
815 // Now we know the widest induction type, check if our found induction
816 // is the same size. If it's not, unset it here and InnerLoopVectorizer
817 // will create another.
818 if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
819 PrimaryInduction = nullptr;
821 return true;
824 bool LoopVectorizationLegality::canVectorizeMemory() {
825 LAI = &(*GetLAA)(*TheLoop);
826 const OptimizationRemarkAnalysis *LAR = LAI->getReport();
827 if (LAR) {
828 ORE->emit([&]() {
829 return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
830 "loop not vectorized: ", *LAR);
833 if (!LAI->canVectorizeMemory())
834 return false;
836 if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
837 reportVectorizationFailure("Stores to a uniform address",
838 "write to a loop invariant address could not be vectorized",
839 "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
840 return false;
842 Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
843 PSE.addPredicate(LAI->getPSE().getUnionPredicate());
845 return true;
848 bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
849 Value *In0 = const_cast<Value *>(V);
850 PHINode *PN = dyn_cast_or_null<PHINode>(In0);
851 if (!PN)
852 return false;
854 return Inductions.count(PN);
857 bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
858 auto *Inst = dyn_cast<Instruction>(V);
859 return (Inst && InductionCastsToIgnore.count(Inst));
862 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
863 return isInductionPhi(V) || isCastedInductionVariable(V);
866 bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
867 return FirstOrderRecurrences.count(Phi);
870 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
871 return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
874 bool LoopVectorizationLegality::blockCanBePredicated(
875 BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, bool PreserveGuards) {
876 const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
878 for (Instruction &I : *BB) {
879 // Check that we don't have a constant expression that can trap as operand.
880 for (Value *Operand : I.operands()) {
881 if (auto *C = dyn_cast<Constant>(Operand))
882 if (C->canTrap())
883 return false;
885 // We might be able to hoist the load.
886 if (I.mayReadFromMemory()) {
887 auto *LI = dyn_cast<LoadInst>(&I);
888 if (!LI)
889 return false;
890 if (!SafePtrs.count(LI->getPointerOperand())) {
891 // !llvm.mem.parallel_loop_access implies if-conversion safety.
892 // Otherwise, record that the load needs (real or emulated) masking
893 // and let the cost model decide.
894 if (!IsAnnotatedParallel || PreserveGuards)
895 MaskedOp.insert(LI);
896 continue;
900 if (I.mayWriteToMemory()) {
901 auto *SI = dyn_cast<StoreInst>(&I);
902 if (!SI)
903 return false;
904 // Predicated store requires some form of masking:
905 // 1) masked store HW instruction,
906 // 2) emulation via load-blend-store (only if safe and legal to do so,
907 // be aware on the race conditions), or
908 // 3) element-by-element predicate check and scalar store.
909 MaskedOp.insert(SI);
910 continue;
912 if (I.mayThrow())
913 return false;
916 return true;
919 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
920 if (!EnableIfConversion) {
921 reportVectorizationFailure("If-conversion is disabled",
922 "if-conversion is disabled",
923 "IfConversionDisabled",
924 ORE, TheLoop);
925 return false;
928 assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
930 // A list of pointers which are known to be dereferenceable within scope of
931 // the loop body for each iteration of the loop which executes. That is,
932 // the memory pointed to can be dereferenced (with the access size implied by
933 // the value's type) unconditionally within the loop header without
934 // introducing a new fault.
935 SmallPtrSet<Value *, 8> SafePointes;
937 // Collect safe addresses.
938 for (BasicBlock *BB : TheLoop->blocks()) {
939 if (blockNeedsPredication(BB))
940 continue;
942 for (Instruction &I : *BB)
943 if (auto *Ptr = getLoadStorePointerOperand(&I))
944 SafePointes.insert(Ptr);
947 // Collect the blocks that need predication.
948 BasicBlock *Header = TheLoop->getHeader();
949 for (BasicBlock *BB : TheLoop->blocks()) {
950 // We don't support switch statements inside loops.
951 if (!isa<BranchInst>(BB->getTerminator())) {
952 reportVectorizationFailure("Loop contains a switch statement",
953 "loop contains a switch statement",
954 "LoopContainsSwitch", ORE, TheLoop,
955 BB->getTerminator());
956 return false;
959 // We must be able to predicate all blocks that need to be predicated.
960 if (blockNeedsPredication(BB)) {
961 if (!blockCanBePredicated(BB, SafePointes)) {
962 reportVectorizationFailure(
963 "Control flow cannot be substituted for a select",
964 "control flow cannot be substituted for a select",
965 "NoCFGForSelect", ORE, TheLoop,
966 BB->getTerminator());
967 return false;
969 } else if (BB != Header && !canIfConvertPHINodes(BB)) {
970 reportVectorizationFailure(
971 "Control flow cannot be substituted for a select",
972 "control flow cannot be substituted for a select",
973 "NoCFGForSelect", ORE, TheLoop,
974 BB->getTerminator());
975 return false;
979 // We can if-convert this loop.
980 return true;
983 // Helper function to canVectorizeLoopNestCFG.
984 bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
985 bool UseVPlanNativePath) {
986 assert((UseVPlanNativePath || Lp->empty()) &&
987 "VPlan-native path is not enabled.");
989 // TODO: ORE should be improved to show more accurate information when an
990 // outer loop can't be vectorized because a nested loop is not understood or
991 // legal. Something like: "outer_loop_location: loop not vectorized:
992 // (inner_loop_location) loop control flow is not understood by vectorizer".
994 // Store the result and return it at the end instead of exiting early, in case
995 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
996 bool Result = true;
997 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
999 // We must have a loop in canonical form. Loops with indirectbr in them cannot
1000 // be canonicalized.
1001 if (!Lp->getLoopPreheader()) {
1002 reportVectorizationFailure("Loop doesn't have a legal pre-header",
1003 "loop control flow is not understood by vectorizer",
1004 "CFGNotUnderstood", ORE, TheLoop);
1005 if (DoExtraAnalysis)
1006 Result = false;
1007 else
1008 return false;
1011 // We must have a single backedge.
1012 if (Lp->getNumBackEdges() != 1) {
1013 reportVectorizationFailure("The loop must have a single backedge",
1014 "loop control flow is not understood by vectorizer",
1015 "CFGNotUnderstood", ORE, TheLoop);
1016 if (DoExtraAnalysis)
1017 Result = false;
1018 else
1019 return false;
1022 // We must have a single exiting block.
1023 if (!Lp->getExitingBlock()) {
1024 reportVectorizationFailure("The loop must have an exiting block",
1025 "loop control flow is not understood by vectorizer",
1026 "CFGNotUnderstood", ORE, TheLoop);
1027 if (DoExtraAnalysis)
1028 Result = false;
1029 else
1030 return false;
1033 // We only handle bottom-tested loops, i.e. loop in which the condition is
1034 // checked at the end of each iteration. With that we can assume that all
1035 // instructions in the loop are executed the same number of times.
1036 if (Lp->getExitingBlock() != Lp->getLoopLatch()) {
1037 reportVectorizationFailure("The exiting block is not the loop latch",
1038 "loop control flow is not understood by vectorizer",
1039 "CFGNotUnderstood", ORE, TheLoop);
1040 if (DoExtraAnalysis)
1041 Result = false;
1042 else
1043 return false;
1046 return Result;
1049 bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
1050 Loop *Lp, bool UseVPlanNativePath) {
1051 // Store the result and return it at the end instead of exiting early, in case
1052 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1053 bool Result = true;
1054 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1055 if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
1056 if (DoExtraAnalysis)
1057 Result = false;
1058 else
1059 return false;
1062 // Recursively check whether the loop control flow of nested loops is
1063 // understood.
1064 for (Loop *SubLp : *Lp)
1065 if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
1066 if (DoExtraAnalysis)
1067 Result = false;
1068 else
1069 return false;
1072 return Result;
1075 bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
1076 // Store the result and return it at the end instead of exiting early, in case
1077 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1078 bool Result = true;
1080 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1081 // Check whether the loop-related control flow in the loop nest is expected by
1082 // vectorizer.
1083 if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
1084 if (DoExtraAnalysis)
1085 Result = false;
1086 else
1087 return false;
1090 // We need to have a loop header.
1091 LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
1092 << '\n');
1094 // Specific checks for outer loops. We skip the remaining legal checks at this
1095 // point because they don't support outer loops.
1096 if (!TheLoop->empty()) {
1097 assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
1099 if (!canVectorizeOuterLoop()) {
1100 reportVectorizationFailure("Unsupported outer loop",
1101 "unsupported outer loop",
1102 "UnsupportedOuterLoop",
1103 ORE, TheLoop);
1104 // TODO: Implement DoExtraAnalysis when subsequent legal checks support
1105 // outer loops.
1106 return false;
1109 LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
1110 return Result;
1113 assert(TheLoop->empty() && "Inner loop expected.");
1114 // Check if we can if-convert non-single-bb loops.
1115 unsigned NumBlocks = TheLoop->getNumBlocks();
1116 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
1117 LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
1118 if (DoExtraAnalysis)
1119 Result = false;
1120 else
1121 return false;
1124 // Check if we can vectorize the instructions and CFG in this loop.
1125 if (!canVectorizeInstrs()) {
1126 LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
1127 if (DoExtraAnalysis)
1128 Result = false;
1129 else
1130 return false;
1133 // Go over each instruction and look at memory deps.
1134 if (!canVectorizeMemory()) {
1135 LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
1136 if (DoExtraAnalysis)
1137 Result = false;
1138 else
1139 return false;
1142 LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
1143 << (LAI->getRuntimePointerChecking()->Need
1144 ? " (with a runtime bound check)"
1145 : "")
1146 << "!\n");
1148 unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
1149 if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
1150 SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
1152 if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
1153 reportVectorizationFailure("Too many SCEV checks needed",
1154 "Too many SCEV assumptions need to be made and checked at runtime",
1155 "TooManySCEVRunTimeChecks", ORE, TheLoop);
1156 if (DoExtraAnalysis)
1157 Result = false;
1158 else
1159 return false;
1162 // Okay! We've done all the tests. If any have failed, return false. Otherwise
1163 // we can vectorize, and at this point we don't have any other mem analysis
1164 // which may limit our maximum vectorization factor, so just return true with
1165 // no restrictions.
1166 return Result;
1169 bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
1171 LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
1173 if (!PrimaryInduction) {
1174 reportVectorizationFailure(
1175 "No primary induction, cannot fold tail by masking",
1176 "Missing a primary induction variable in the loop, which is "
1177 "needed in order to fold tail by masking as required.",
1178 "NoPrimaryInduction", ORE, TheLoop);
1179 return false;
1182 SmallPtrSet<const Value *, 8> ReductionLiveOuts;
1184 for (auto &Reduction : *getReductionVars())
1185 ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
1187 // TODO: handle non-reduction outside users when tail is folded by masking.
1188 for (auto *AE : AllowedExit) {
1189 // Check that all users of allowed exit values are inside the loop or
1190 // are the live-out of a reduction.
1191 if (ReductionLiveOuts.count(AE))
1192 continue;
1193 for (User *U : AE->users()) {
1194 Instruction *UI = cast<Instruction>(U);
1195 if (TheLoop->contains(UI))
1196 continue;
1197 reportVectorizationFailure(
1198 "Cannot fold tail by masking, loop has an outside user for",
1199 "Cannot fold tail by masking in the presence of live outs.",
1200 "LiveOutFoldingTailByMasking", ORE, TheLoop, UI);
1201 return false;
1205 // The list of pointers that we can safely read and write to remains empty.
1206 SmallPtrSet<Value *, 8> SafePointers;
1208 // Check and mark all blocks for predication, including those that ordinarily
1209 // do not need predication such as the header block.
1210 for (BasicBlock *BB : TheLoop->blocks()) {
1211 if (!blockCanBePredicated(BB, SafePointers, /* MaskAllLoads= */ true)) {
1212 reportVectorizationFailure(
1213 "Cannot fold tail by masking as required",
1214 "control flow cannot be substituted for a select",
1215 "NoCFGForSelect", ORE, TheLoop,
1216 BB->getTerminator());
1217 return false;
1221 LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
1222 return true;
1225 } // namespace llvm