include/llvm/Analysis/VectorUtils.h

   1 //===- llvm/Analysis/VectorUtils.h - Vector utilities -----------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file defines some vectorizer utilities.
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #ifndef LLVM_ANALYSIS_VECTORUTILS_H
  14 #define LLVM_ANALYSIS_VECTORUTILS_H
  15
  16 #include "llvm/ADT/MapVector.h"
  17 #include "llvm/Analysis/LoopAccessAnalysis.h"
  18 #include "llvm/IR/IRBuilder.h"
  19 #include "llvm/Support/CheckedArithmetic.h"
  20
  21 namespace llvm {
  22
  23 /// Describes the type of Parameters
  24 enum class VFParamKind {
  25   Vector,            // No semantic information.
  26   OMP_Linear,        // declare simd linear(i)
  27   OMP_LinearRef,     // declare simd linear(ref(i))
  28   OMP_LinearVal,     // declare simd linear(val(i))
  29   OMP_LinearUVal,    // declare simd linear(uval(i))
  30   OMP_LinearPos,     // declare simd linear(i:c) uniform(c)
  31   OMP_LinearValPos,  // declare simd linear(val(i:c)) uniform(c)
  32   OMP_LinearRefPos,  // declare simd linear(ref(i:c)) uniform(c)
  33   OMP_LinearUValPos, // declare simd linear(uval(i:c)) uniform(c
  34   OMP_Uniform,       // declare simd uniform(i)
  35   GlobalPredicate,   // Global logical predicate that acts on all lanes
  36                      // of the input and output mask concurrently. For
  37                      // example, it is implied by the `M` token in the
  38                      // Vector Function ABI mangled name.
  39   Unknown
  40 };
  41
  42 /// Describes the type of Instruction Set Architecture
  43 enum class VFISAKind {
  44   AdvancedSIMD, // AArch64 Advanced SIMD (NEON)
  45   SVE,          // AArch64 Scalable Vector Extension
  46   SSE,          // x86 SSE
  47   AVX,          // x86 AVX
  48   AVX2,         // x86 AVX2
  49   AVX512,       // x86 AVX512
  50   Unknown       // Unknown ISA
  51 };
  52
  53 /// Encapsulates information needed to describe a parameter.
  54 ///
  55 /// The description of the parameter is not linked directly to
  56 /// OpenMP or any other vector function description. This structure
  57 /// is extendible to handle other paradigms that describe vector
  58 /// functions and their parameters.
  59 struct VFParameter {
  60   unsigned ParamPos;         // Parameter Position in Scalar Function.
  61   VFParamKind ParamKind;     // Kind of Parameter.
  62   int LinearStepOrPos = 0;   // Step or Position of the Parameter.
  63   Align Alignment = Align(); // Optional aligment in bytes, defaulted to 1.
  64
  65   // Comparison operator.
  66   bool operator==(const VFParameter &Other) const {
  67     return std::tie(ParamPos, ParamKind, LinearStepOrPos, Alignment) ==
  68            std::tie(Other.ParamPos, Other.ParamKind, Other.LinearStepOrPos,
  69                     Other.Alignment);
  70   }
  71 };
  72
  73 /// Contains the information about the kind of vectorization
  74 /// available.
  75 ///
  76 /// This object in independent on the paradigm used to
  77 /// represent vector functions. in particular, it is not attached to
  78 /// any target-specific ABI.
  79 struct VFShape {
  80   unsigned VF;     // Vectorization factor.
  81   bool IsScalable; // True if the function is a scalable function.
  82   VFISAKind ISA;   // Instruction Set Architecture.
  83   SmallVector<VFParameter, 8> Parameters; // List of parameter informations.
  84   // Comparison operator.
  85   bool operator==(const VFShape &Other) const {
  86     return std::tie(VF, IsScalable, ISA, Parameters) ==
  87            std::tie(Other.VF, Other.IsScalable, Other.ISA, Other.Parameters);
  88   }
  89 };
  90
  91 /// Holds the VFShape for a specific scalar to vector function mapping.
  92 struct VFInfo {
  93   VFShape Shape;        // Classification of the vector function.
  94   StringRef ScalarName; // Scalar Function Name.
  95   StringRef VectorName; // Vector Function Name associated to this VFInfo.
  96
  97   // Comparison operator.
  98   bool operator==(const VFInfo &Other) const {
  99     return std::tie(Shape, ScalarName, VectorName) ==
 100            std::tie(Shape, Other.ScalarName, Other.VectorName);
 101   }
 102 };
 103
 104 namespace VFABI {
 105 /// Function to contruct a VFInfo out of a mangled names in the
 106 /// following format:
 107 ///
 108 /// <VFABI_name>{(<redirection>)}
 109 ///
 110 /// where <VFABI_name> is the name of the vector function, mangled according
 111 /// to the rules described in the Vector Function ABI of the target vector
 112 /// extentsion (or <isa> from now on). The <VFABI_name> is in the following
 113 /// format:
 114 ///
 115 /// _ZGV<isa><mask><vlen><parameters>_<scalarname>[(<redirection>)]
 116 ///
 117 /// This methods support demangling rules for the following <isa>:
 118 ///
 119 /// * AArch64: https://developer.arm.com/docs/101129/latest
 120 ///
 121 /// * x86 (libmvec): https://sourceware.org/glibc/wiki/libmvec and
 122 ///  https://sourceware.org/glibc/wiki/libmvec?action=AttachFile&do=view&target=VectorABI.txt
 123 ///
 124 ///
 125 ///
 126 /// \param MangledName -> input string in the format
 127 /// _ZGV<isa><mask><vlen><parameters>_<scalarname>[(<redirection>)].
 128 Optional<VFInfo> tryDemangleForVFABI(StringRef MangledName);
 129
 130 /// Retrieve the `VFParamKind` from a string token.
 131 VFParamKind getVFParamKindFromString(const StringRef Token);
 132 } // end namespace VFABI
 133
 134 template <typename T> class ArrayRef;
 135 class DemandedBits;
 136 class GetElementPtrInst;
 137 template <typename InstTy> class InterleaveGroup;
 138 class Loop;
 139 class ScalarEvolution;
 140 class TargetLibraryInfo;
 141 class TargetTransformInfo;
 142 class Type;
 143 class Value;
 144
 145 namespace Intrinsic {
 146 enum ID : unsigned;
 147 }
 148
 149 /// Identify if the intrinsic is trivially vectorizable.
 150 /// This method returns true if the intrinsic's argument types are all scalars
 151 /// for the scalar form of the intrinsic and all vectors (or scalars handled by
 152 /// hasVectorInstrinsicScalarOpd) for the vector form of the intrinsic.
 153 bool isTriviallyVectorizable(Intrinsic::ID ID);
 154
 155 /// Identifies if the vector form of the intrinsic has a scalar operand.
 156 bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, unsigned ScalarOpdIdx);
 157
 158 /// Returns intrinsic ID for call.
 159 /// For the input call instruction it finds mapping intrinsic and returns
 160 /// its intrinsic ID, in case it does not found it return not_intrinsic.
 161 Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI,
 162                                           const TargetLibraryInfo *TLI);
 163
 164 /// Find the operand of the GEP that should be checked for consecutive
 165 /// stores. This ignores trailing indices that have no effect on the final
 166 /// pointer.
 167 unsigned getGEPInductionOperand(const GetElementPtrInst *Gep);
 168
 169 /// If the argument is a GEP, then returns the operand identified by
 170 /// getGEPInductionOperand. However, if there is some other non-loop-invariant
 171 /// operand, it returns that instead.
 172 Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp);
 173
 174 /// If a value has only one user that is a CastInst, return it.
 175 Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty);
 176
 177 /// Get the stride of a pointer access in a loop. Looks for symbolic
 178 /// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
 179 Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp);
 180
 181 /// Given a vector and an element number, see if the scalar value is
 182 /// already around as a register, for example if it were inserted then extracted
 183 /// from the vector.
 184 Value *findScalarElement(Value *V, unsigned EltNo);
 185
 186 /// Get splat value if the input is a splat vector or return nullptr.
 187 /// The value may be extracted from a splat constants vector or from
 188 /// a sequence of instructions that broadcast a single value into a vector.
 189 const Value *getSplatValue(const Value *V);
 190
 191 /// Return true if the input value is known to be a vector with all identical
 192 /// elements (potentially including undefined elements).
 193 /// This may be more powerful than the related getSplatValue() because it is
 194 /// not limited by finding a scalar source value to a splatted vector.
 195 bool isSplatValue(const Value *V, unsigned Depth = 0);
 196
 197 /// Compute a map of integer instructions to their minimum legal type
 198 /// size.
 199 ///
 200 /// C semantics force sub-int-sized values (e.g. i8, i16) to be promoted to int
 201 /// type (e.g. i32) whenever arithmetic is performed on them.
 202 ///
 203 /// For targets with native i8 or i16 operations, usually InstCombine can shrink
 204 /// the arithmetic type down again. However InstCombine refuses to create
 205 /// illegal types, so for targets without i8 or i16 registers, the lengthening
 206 /// and shrinking remains.
 207 ///
 208 /// Most SIMD ISAs (e.g. NEON) however support vectors of i8 or i16 even when
 209 /// their scalar equivalents do not, so during vectorization it is important to
 210 /// remove these lengthens and truncates when deciding the profitability of
 211 /// vectorization.
 212 ///
 213 /// This function analyzes the given range of instructions and determines the
 214 /// minimum type size each can be converted to. It attempts to remove or
 215 /// minimize type size changes across each def-use chain, so for example in the
 216 /// following code:
 217 ///
 218 ///   %1 = load i8, i8*
 219 ///   %2 = add i8 %1, 2
 220 ///   %3 = load i16, i16*
 221 ///   %4 = zext i8 %2 to i32
 222 ///   %5 = zext i16 %3 to i32
 223 ///   %6 = add i32 %4, %5
 224 ///   %7 = trunc i32 %6 to i16
 225 ///
 226 /// Instruction %6 must be done at least in i16, so computeMinimumValueSizes
 227 /// will return: {%1: 16, %2: 16, %3: 16, %4: 16, %5: 16, %6: 16, %7: 16}.
 228 ///
 229 /// If the optional TargetTransformInfo is provided, this function tries harder
 230 /// to do less work by only looking at illegal types.
 231 MapVector<Instruction*, uint64_t>
 232 computeMinimumValueSizes(ArrayRef<BasicBlock*> Blocks,
 233                          DemandedBits &DB,
 234                          const TargetTransformInfo *TTI=nullptr);
 235
 236 /// Compute the union of two access-group lists.
 237 ///
 238 /// If the list contains just one access group, it is returned directly. If the
 239 /// list is empty, returns nullptr.
 240 MDNode *uniteAccessGroups(MDNode *AccGroups1, MDNode *AccGroups2);
 241
 242 /// Compute the access-group list of access groups that @p Inst1 and @p Inst2
 243 /// are both in. If either instruction does not access memory at all, it is
 244 /// considered to be in every list.
 245 ///
 246 /// If the list contains just one access group, it is returned directly. If the
 247 /// list is empty, returns nullptr.
 248 MDNode *intersectAccessGroups(const Instruction *Inst1,
 249                               const Instruction *Inst2);
 250
 251 /// Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath,
 252 /// MD_nontemporal, MD_access_group].
 253 /// For K in Kinds, we get the MDNode for K from each of the
 254 /// elements of VL, compute their "intersection" (i.e., the most generic
 255 /// metadata value that covers all of the individual values), and set I's
 256 /// metadata for M equal to the intersection value.
 257 ///
 258 /// This function always sets a (possibly null) value for each K in Kinds.
 259 Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
 260
 261 /// Create a mask that filters the members of an interleave group where there
 262 /// are gaps.
 263 ///
 264 /// For example, the mask for \p Group with interleave-factor 3
 265 /// and \p VF 4, that has only its first member present is:
 266 ///
 267 ///   <1,0,0,1,0,0,1,0,0,1,0,0>
 268 ///
 269 /// Note: The result is a mask of 0's and 1's, as opposed to the other
 270 /// create[*]Mask() utilities which create a shuffle mask (mask that
 271 /// consists of indices).
 272 Constant *createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
 273                                const InterleaveGroup<Instruction> &Group);
 274
 275 /// Create a mask with replicated elements.
 276 ///
 277 /// This function creates a shuffle mask for replicating each of the \p VF
 278 /// elements in a vector \p ReplicationFactor times. It can be used to
 279 /// transform a mask of \p VF elements into a mask of
 280 /// \p VF * \p ReplicationFactor elements used by a predicated
 281 /// interleaved-group of loads/stores whose Interleaved-factor ==
 282 /// \p ReplicationFactor.
 283 ///
 284 /// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is:
 285 ///
 286 ///   <0,0,0,1,1,1,2,2,2,3,3,3>
 287 Constant *createReplicatedMask(IRBuilder<> &Builder, unsigned ReplicationFactor,
 288                                unsigned VF);
 289
 290 /// Create an interleave shuffle mask.
 291 ///
 292 /// This function creates a shuffle mask for interleaving \p NumVecs vectors of
 293 /// vectorization factor \p VF into a single wide vector. The mask is of the
 294 /// form:
 295 ///
 296 ///   <0, VF, VF * 2, ..., VF * (NumVecs - 1), 1, VF + 1, VF * 2 + 1, ...>
 297 ///
 298 /// For example, the mask for VF = 4 and NumVecs = 2 is:
 299 ///
 300 ///   <0, 4, 1, 5, 2, 6, 3, 7>.
 301 Constant *createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
 302                                unsigned NumVecs);
 303
 304 /// Create a stride shuffle mask.
 305 ///
 306 /// This function creates a shuffle mask whose elements begin at \p Start and
 307 /// are incremented by \p Stride. The mask can be used to deinterleave an
 308 /// interleaved vector into separate vectors of vectorization factor \p VF. The
 309 /// mask is of the form:
 310 ///
 311 ///   <Start, Start + Stride, ..., Start + Stride * (VF - 1)>
 312 ///
 313 /// For example, the mask for Start = 0, Stride = 2, and VF = 4 is:
 314 ///
 315 ///   <0, 2, 4, 6>
 316 Constant *createStrideMask(IRBuilder<> &Builder, unsigned Start,
 317                            unsigned Stride, unsigned VF);
 318
 319 /// Create a sequential shuffle mask.
 320 ///
 321 /// This function creates shuffle mask whose elements are sequential and begin
 322 /// at \p Start.  The mask contains \p NumInts integers and is padded with \p
 323 /// NumUndefs undef values. The mask is of the form:
 324 ///
 325 ///   <Start, Start + 1, ... Start + NumInts - 1, undef_1, ... undef_NumUndefs>
 326 ///
 327 /// For example, the mask for Start = 0, NumInsts = 4, and NumUndefs = 4 is:
 328 ///
 329 ///   <0, 1, 2, 3, undef, undef, undef, undef>
 330 Constant *createSequentialMask(IRBuilder<> &Builder, unsigned Start,
 331                                unsigned NumInts, unsigned NumUndefs);
 332
 333 /// Concatenate a list of vectors.
 334 ///
 335 /// This function generates code that concatenate the vectors in \p Vecs into a
 336 /// single large vector. The number of vectors should be greater than one, and
 337 /// their element types should be the same. The number of elements in the
 338 /// vectors should also be the same; however, if the last vector has fewer
 339 /// elements, it will be padded with undefs.
 340 Value *concatenateVectors(IRBuilder<> &Builder, ArrayRef<Value *> Vecs);
 341
 342 /// Given a mask vector of the form <Y x i1>, Return true if all of the
 343 /// elements of this predicate mask are false or undef.  That is, return true
 344 /// if all lanes can be assumed inactive.
 345 bool maskIsAllZeroOrUndef(Value *Mask);
 346
 347 /// Given a mask vector of the form <Y x i1>, Return true if all of the
 348 /// elements of this predicate mask are true or undef.  That is, return true
 349 /// if all lanes can be assumed active.
 350 bool maskIsAllOneOrUndef(Value *Mask);
 351
 352 /// Given a mask vector of the form <Y x i1>, return an APInt (of bitwidth Y)
 353 /// for each lane which may be active.
 354 APInt possiblyDemandedEltsInMask(Value *Mask);
 355
 356 /// The group of interleaved loads/stores sharing the same stride and
 357 /// close to each other.
 358 ///
 359 /// Each member in this group has an index starting from 0, and the largest
 360 /// index should be less than interleaved factor, which is equal to the absolute
 361 /// value of the access's stride.
 362 ///
 363 /// E.g. An interleaved load group of factor 4:
 364 ///        for (unsigned i = 0; i < 1024; i+=4) {
 365 ///          a = A[i];                           // Member of index 0
 366 ///          b = A[i+1];                         // Member of index 1
 367 ///          d = A[i+3];                         // Member of index 3
 368 ///          ...
 369 ///        }
 370 ///
 371 ///      An interleaved store group of factor 4:
 372 ///        for (unsigned i = 0; i < 1024; i+=4) {
 373 ///          ...
 374 ///          A[i]   = a;                         // Member of index 0
 375 ///          A[i+1] = b;                         // Member of index 1
 376 ///          A[i+2] = c;                         // Member of index 2
 377 ///          A[i+3] = d;                         // Member of index 3
 378 ///        }
 379 ///
 380 /// Note: the interleaved load group could have gaps (missing members), but
 381 /// the interleaved store group doesn't allow gaps.
 382 template <typename InstTy> class InterleaveGroup {
 383 public:
 384   InterleaveGroup(uint32_t Factor, bool Reverse, uint32_t Align)
 385       : Factor(Factor), Reverse(Reverse), Align(Align), InsertPos(nullptr) {}
 386
 387   InterleaveGroup(InstTy *Instr, int32_t Stride, uint32_t Align)
 388       : Align(Align), InsertPos(Instr) {
 389     assert(Align && "The alignment should be non-zero");
 390
 391     Factor = std::abs(Stride);
 392     assert(Factor > 1 && "Invalid interleave factor");
 393
 394     Reverse = Stride < 0;
 395     Members[0] = Instr;
 396   }
 397
 398   bool isReverse() const { return Reverse; }
 399   uint32_t getFactor() const { return Factor; }
 400   uint32_t getAlignment() const { return Align; }
 401   uint32_t getNumMembers() const { return Members.size(); }
 402
 403   /// Try to insert a new member \p Instr with index \p Index and
 404   /// alignment \p NewAlign. The index is related to the leader and it could be
 405   /// negative if it is the new leader.
 406   ///
 407   /// \returns false if the instruction doesn't belong to the group.
 408   bool insertMember(InstTy *Instr, int32_t Index, uint32_t NewAlign) {
 409     assert(NewAlign && "The new member's alignment should be non-zero");
 410
 411     // Make sure the key fits in an int32_t.
 412     Optional<int32_t> MaybeKey = checkedAdd(Index, SmallestKey);
 413     if (!MaybeKey)
 414       return false;
 415     int32_t Key = *MaybeKey;
 416
 417     // Skip if there is already a member with the same index.
 418     if (Members.find(Key) != Members.end())
 419       return false;
 420
 421     if (Key > LargestKey) {
 422       // The largest index is always less than the interleave factor.
 423       if (Index >= static_cast<int32_t>(Factor))
 424         return false;
 425
 426       LargestKey = Key;
 427     } else if (Key < SmallestKey) {
 428
 429       // Make sure the largest index fits in an int32_t.
 430       Optional<int32_t> MaybeLargestIndex = checkedSub(LargestKey, Key);
 431       if (!MaybeLargestIndex)
 432         return false;
 433
 434       // The largest index is always less than the interleave factor.
 435       if (*MaybeLargestIndex >= static_cast<int64_t>(Factor))
 436         return false;
 437
 438       SmallestKey = Key;
 439     }
 440
 441     // It's always safe to select the minimum alignment.
 442     Align = std::min(Align, NewAlign);
 443     Members[Key] = Instr;
 444     return true;
 445   }
 446
 447   /// Get the member with the given index \p Index
 448   ///
 449   /// \returns nullptr if contains no such member.
 450   InstTy *getMember(uint32_t Index) const {
 451     int32_t Key = SmallestKey + Index;
 452     auto Member = Members.find(Key);
 453     if (Member == Members.end())
 454       return nullptr;
 455
 456     return Member->second;
 457   }
 458
 459   /// Get the index for the given member. Unlike the key in the member
 460   /// map, the index starts from 0.
 461   uint32_t getIndex(const InstTy *Instr) const {
 462     for (auto I : Members) {
 463       if (I.second == Instr)
 464         return I.first - SmallestKey;
 465     }
 466
 467     llvm_unreachable("InterleaveGroup contains no such member");
 468   }
 469
 470   InstTy *getInsertPos() const { return InsertPos; }
 471   void setInsertPos(InstTy *Inst) { InsertPos = Inst; }
 472
 473   /// Add metadata (e.g. alias info) from the instructions in this group to \p
 474   /// NewInst.
 475   ///
 476   /// FIXME: this function currently does not add noalias metadata a'la
 477   /// addNewMedata.  To do that we need to compute the intersection of the
 478   /// noalias info from all members.
 479   void addMetadata(InstTy *NewInst) const;
 480
 481   /// Returns true if this Group requires a scalar iteration to handle gaps.
 482   bool requiresScalarEpilogue() const {
 483     // If the last member of the Group exists, then a scalar epilog is not
 484     // needed for this group.
 485     if (getMember(getFactor() - 1))
 486       return false;
 487
 488     // We have a group with gaps. It therefore cannot be a group of stores,
 489     // and it can't be a reversed access, because such groups get invalidated.
 490     assert(!getMember(0)->mayWriteToMemory() &&
 491            "Group should have been invalidated");
 492     assert(!isReverse() && "Group should have been invalidated");
 493
 494     // This is a group of loads, with gaps, and without a last-member
 495     return true;
 496   }
 497
 498 private:
 499   uint32_t Factor; // Interleave Factor.
 500   bool Reverse;
 501   uint32_t Align;
 502   DenseMap<int32_t, InstTy *> Members;
 503   int32_t SmallestKey = 0;
 504   int32_t LargestKey = 0;
 505
 506   // To avoid breaking dependences, vectorized instructions of an interleave
 507   // group should be inserted at either the first load or the last store in
 508   // program order.
 509   //
 510   // E.g. %even = load i32             // Insert Position
 511   //      %add = add i32 %even         // Use of %even
 512   //      %odd = load i32
 513   //
 514   //      store i32 %even
 515   //      %odd = add i32               // Def of %odd
 516   //      store i32 %odd               // Insert Position
 517   InstTy *InsertPos;
 518 };
 519
 520 /// Drive the analysis of interleaved memory accesses in the loop.
 521 ///
 522 /// Use this class to analyze interleaved accesses only when we can vectorize
 523 /// a loop. Otherwise it's meaningless to do analysis as the vectorization
 524 /// on interleaved accesses is unsafe.
 525 ///
 526 /// The analysis collects interleave groups and records the relationships
 527 /// between the member and the group in a map.
 528 class InterleavedAccessInfo {
 529 public:
 530   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
 531                         DominatorTree *DT, LoopInfo *LI,
 532                         const LoopAccessInfo *LAI)
 533       : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
 534
 535   ~InterleavedAccessInfo() { reset(); }
 536
 537   /// Analyze the interleaved accesses and collect them in interleave
 538   /// groups. Substitute symbolic strides using \p Strides.
 539   /// Consider also predicated loads/stores in the analysis if
 540   /// \p EnableMaskedInterleavedGroup is true.
 541   void analyzeInterleaving(bool EnableMaskedInterleavedGroup);
 542
 543   /// Invalidate groups, e.g., in case all blocks in loop will be predicated
 544   /// contrary to original assumption. Although we currently prevent group
 545   /// formation for predicated accesses, we may be able to relax this limitation
 546   /// in the future once we handle more complicated blocks.
 547   void reset() {
 548     SmallPtrSet<InterleaveGroup<Instruction> *, 4> DelSet;
 549     // Avoid releasing a pointer twice.
 550     for (auto &I : InterleaveGroupMap)
 551       DelSet.insert(I.second);
 552     for (auto *Ptr : DelSet)
 553       delete Ptr;
 554     InterleaveGroupMap.clear();
 555     RequiresScalarEpilogue = false;
 556   }
 557
 558
 559   /// Check if \p Instr belongs to any interleave group.
 560   bool isInterleaved(Instruction *Instr) const {
 561     return InterleaveGroupMap.find(Instr) != InterleaveGroupMap.end();
 562   }
 563
 564   /// Get the interleave group that \p Instr belongs to.
 565   ///
 566   /// \returns nullptr if doesn't have such group.
 567   InterleaveGroup<Instruction> *
 568   getInterleaveGroup(const Instruction *Instr) const {
 569     if (InterleaveGroupMap.count(Instr))
 570       return InterleaveGroupMap.find(Instr)->second;
 571     return nullptr;
 572   }
 573
 574   iterator_range<SmallPtrSetIterator<llvm::InterleaveGroup<Instruction> *>>
 575   getInterleaveGroups() {
 576     return make_range(InterleaveGroups.begin(), InterleaveGroups.end());
 577   }
 578
 579   /// Returns true if an interleaved group that may access memory
 580   /// out-of-bounds requires a scalar epilogue iteration for correctness.
 581   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
 582
 583   /// Invalidate groups that require a scalar epilogue (due to gaps). This can
 584   /// happen when optimizing for size forbids a scalar epilogue, and the gap
 585   /// cannot be filtered by masking the load/store.
 586   void invalidateGroupsRequiringScalarEpilogue();
 587
 588 private:
 589   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
 590   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
 591   /// The interleaved access analysis can also add new predicates (for example
 592   /// by versioning strides of pointers).
 593   PredicatedScalarEvolution &PSE;
 594
 595   Loop *TheLoop;
 596   DominatorTree *DT;
 597   LoopInfo *LI;
 598   const LoopAccessInfo *LAI;
 599
 600   /// True if the loop may contain non-reversed interleaved groups with
 601   /// out-of-bounds accesses. We ensure we don't speculatively access memory
 602   /// out-of-bounds by executing at least one scalar epilogue iteration.
 603   bool RequiresScalarEpilogue = false;
 604
 605   /// Holds the relationships between the members and the interleave group.
 606   DenseMap<Instruction *, InterleaveGroup<Instruction> *> InterleaveGroupMap;
 607
 608   SmallPtrSet<InterleaveGroup<Instruction> *, 4> InterleaveGroups;
 609
 610   /// Holds dependences among the memory accesses in the loop. It maps a source
 611   /// access to a set of dependent sink accesses.
 612   DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
 613
 614   /// The descriptor for a strided memory access.
 615   struct StrideDescriptor {
 616     StrideDescriptor() = default;
 617     StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
 618                      unsigned Align)
 619         : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
 620
 621     // The access's stride. It is negative for a reverse access.
 622     int64_t Stride = 0;
 623
 624     // The scalar expression of this access.
 625     const SCEV *Scev = nullptr;
 626
 627     // The size of the memory object.
 628     uint64_t Size = 0;
 629
 630     // The alignment of this access.
 631     unsigned Align = 0;
 632   };
 633
 634   /// A type for holding instructions and their stride descriptors.
 635   using StrideEntry = std::pair<Instruction *, StrideDescriptor>;
 636
 637   /// Create a new interleave group with the given instruction \p Instr,
 638   /// stride \p Stride and alignment \p Align.
 639   ///
 640   /// \returns the newly created interleave group.
 641   InterleaveGroup<Instruction> *
 642   createInterleaveGroup(Instruction *Instr, int Stride, unsigned Align) {
 643     assert(!InterleaveGroupMap.count(Instr) &&
 644            "Already in an interleaved access group");
 645     InterleaveGroupMap[Instr] =
 646         new InterleaveGroup<Instruction>(Instr, Stride, Align);
 647     InterleaveGroups.insert(InterleaveGroupMap[Instr]);
 648     return InterleaveGroupMap[Instr];
 649   }
 650
 651   /// Release the group and remove all the relationships.
 652   void releaseGroup(InterleaveGroup<Instruction> *Group) {
 653     for (unsigned i = 0; i < Group->getFactor(); i++)
 654       if (Instruction *Member = Group->getMember(i))
 655         InterleaveGroupMap.erase(Member);
 656
 657     InterleaveGroups.erase(Group);
 658     delete Group;
 659   }
 660
 661   /// Collect all the accesses with a constant stride in program order.
 662   void collectConstStrideAccesses(
 663       MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
 664       const ValueToValueMap &Strides);
 665
 666   /// Returns true if \p Stride is allowed in an interleaved group.
 667   static bool isStrided(int Stride);
 668
 669   /// Returns true if \p BB is a predicated block.
 670   bool isPredicated(BasicBlock *BB) const {
 671     return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 672   }
 673
 674   /// Returns true if LoopAccessInfo can be used for dependence queries.
 675   bool areDependencesValid() const {
 676     return LAI && LAI->getDepChecker().getDependences();
 677   }
 678
 679   /// Returns true if memory accesses \p A and \p B can be reordered, if
 680   /// necessary, when constructing interleaved groups.
 681   ///
 682   /// \p A must precede \p B in program order. We return false if reordering is
 683   /// not necessary or is prevented because \p A and \p B may be dependent.
 684   bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
 685                                                  StrideEntry *B) const {
 686     // Code motion for interleaved accesses can potentially hoist strided loads
 687     // and sink strided stores. The code below checks the legality of the
 688     // following two conditions:
 689     //
 690     // 1. Potentially moving a strided load (B) before any store (A) that
 691     //    precedes B, or
 692     //
 693     // 2. Potentially moving a strided store (A) after any load or store (B)
 694     //    that A precedes.
 695     //
 696     // It's legal to reorder A and B if we know there isn't a dependence from A
 697     // to B. Note that this determination is conservative since some
 698     // dependences could potentially be reordered safely.
 699
 700     // A is potentially the source of a dependence.
 701     auto *Src = A->first;
 702     auto SrcDes = A->second;
 703
 704     // B is potentially the sink of a dependence.
 705     auto *Sink = B->first;
 706     auto SinkDes = B->second;
 707
 708     // Code motion for interleaved accesses can't violate WAR dependences.
 709     // Thus, reordering is legal if the source isn't a write.
 710     if (!Src->mayWriteToMemory())
 711       return true;
 712
 713     // At least one of the accesses must be strided.
 714     if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
 715       return true;
 716
 717     // If dependence information is not available from LoopAccessInfo,
 718     // conservatively assume the instructions can't be reordered.
 719     if (!areDependencesValid())
 720       return false;
 721
 722     // If we know there is a dependence from source to sink, assume the
 723     // instructions can't be reordered. Otherwise, reordering is legal.
 724     return Dependences.find(Src) == Dependences.end() ||
 725            !Dependences.lookup(Src).count(Sink);
 726   }
 727
 728   /// Collect the dependences from LoopAccessInfo.
 729   ///
 730   /// We process the dependences once during the interleaved access analysis to
 731   /// enable constant-time dependence queries.
 732   void collectDependences() {
 733     if (!areDependencesValid())
 734       return;
 735     auto *Deps = LAI->getDepChecker().getDependences();
 736     for (auto Dep : *Deps)
 737       Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
 738   }
 739 };
 740
 741 } // llvm namespace
 742
 743 #endif