//===- llvm/Analysis/VectorUtils.h - Vector utilities -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines some vectorizer utilities.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_ANALYSIS_VECTORUTILS_H
#define LLVM_ANALYSIS_VECTORUTILS_H

#include "llvm/ADT/MapVector.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/CheckedArithmetic.h"

namespace llvm {

/// Describes the type of Parameters
enum class VFParamKind {
  Vector,            // No semantic information.
  OMP_Linear,        // declare simd linear(i)
  OMP_LinearRef,     // declare simd linear(ref(i))
  OMP_LinearVal,     // declare simd linear(val(i))
  OMP_LinearUVal,    // declare simd linear(uval(i))
  OMP_LinearPos,     // declare simd linear(i:c) uniform(c)
  OMP_LinearValPos,  // declare simd linear(val(i:c)) uniform(c)
  OMP_LinearRefPos,  // declare simd linear(ref(i:c)) uniform(c)
  OMP_LinearUValPos, // declare simd linear(uval(i:c)) uniform(c)
  OMP_Uniform,       // declare simd uniform(i)
  GlobalPredicate,   // Global logical predicate that acts on all lanes
                     // of the input and output mask concurrently. For
                     // example, it is implied by the `M` token in the
                     // Vector Function ABI mangled name.
  Unknown
};

/// Describes the type of Instruction Set Architecture
enum class VFISAKind {
  AdvancedSIMD, // AArch64 Advanced SIMD (NEON)
  SVE,          // AArch64 Scalable Vector Extension
  SSE,          // x86 SSE
  AVX,          // x86 AVX
  AVX2,         // x86 AVX2
  AVX512,       // x86 AVX512
  Unknown       // Unknown ISA
};

/// Encapsulates information needed to describe a parameter.
///
/// The description of the parameter is not linked directly to
/// OpenMP or any other vector function description. This structure
/// is extendible to handle other paradigms that describe vector
/// functions and their parameters.
struct VFParameter {
  unsigned ParamPos;         // Parameter Position in Scalar Function.
  VFParamKind ParamKind;     // Kind of Parameter.
  int LinearStepOrPos = 0;   // Step or Position of the Parameter.
  Align Alignment = Align(); // Optional alignment in bytes, defaulted to 1.

  // Comparison operator.
  bool operator==(const VFParameter &Other) const {
    return std::tie(ParamPos, ParamKind, LinearStepOrPos, Alignment) ==
           std::tie(Other.ParamPos, Other.ParamKind, Other.LinearStepOrPos,
                    Other.Alignment);
  }
};

/// Contains the information about the kind of vectorization
/// available.
///
/// This object is independent of the paradigm used to
/// represent vector functions. In particular, it is not attached to
/// any target-specific ABI.
struct VFShape {
  unsigned VF;     // Vectorization factor.
  bool IsScalable; // True if the function is a scalable function.
  VFISAKind ISA;   // Instruction Set Architecture.
  SmallVector<VFParameter, 8> Parameters; // List of parameter information.
  // Comparison operator.
  bool operator==(const VFShape &Other) const {
    return std::tie(VF, IsScalable, ISA, Parameters) ==
           std::tie(Other.VF, Other.IsScalable, Other.ISA, Other.Parameters);
  }
};

/// Holds the VFShape for a specific scalar to vector function mapping.
struct VFInfo {
  VFShape Shape;        // Classification of the vector function.
  StringRef ScalarName; // Scalar Function Name.
  StringRef VectorName; // Vector Function Name associated to this VFInfo.

  // Comparison operator.
  bool operator==(const VFInfo &Other) const {
    return std::tie(Shape, ScalarName, VectorName) ==
           std::tie(Other.Shape, Other.ScalarName, Other.VectorName);
  }
};

namespace VFABI {
/// Function to construct a VFInfo out of a mangled name in the
/// following format:
///
/// <VFABI_name>{(<redirection>)}
///
/// where <VFABI_name> is the name of the vector function, mangled according
/// to the rules described in the Vector Function ABI of the target vector
/// extension (or <isa> from now on). The <VFABI_name> is in the following
/// format:
///
/// _ZGV<isa><mask><vlen><parameters>_<scalarname>[(<redirection>)]
///
/// This method supports demangling rules for the following <isa>:
///
/// * AArch64: https://developer.arm.com/docs/101129/latest
///
/// * x86 (libmvec): https://sourceware.org/glibc/wiki/libmvec and
///   https://sourceware.org/glibc/wiki/libmvec?action=AttachFile&do=view&target=VectorABI.txt
///
/// \param MangledName -> input string in the format
/// _ZGV<isa><mask><vlen><parameters>_<scalarname>[(<redirection>)].
Optional<VFInfo> tryDemangleForVFABI(StringRef MangledName);
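
// Illustrative usage sketch (not part of the interface): demangle a name and
// inspect the result. `MangledName` is assumed to hold a string in the format
// documented above.
//
//   if (Optional<VFInfo> Info = VFABI::tryDemangleForVFABI(MangledName)) {
//     StringRef ScalarName = Info->ScalarName; // original scalar symbol
//     unsigned VF = Info->Shape.VF;            // vectorization factor (<vlen>)
//   }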

/// Retrieve the `VFParamKind` from a string token.
VFParamKind getVFParamKindFromString(const StringRef Token);
} // end namespace VFABI

template <typename T> class ArrayRef;
class DemandedBits;
class GetElementPtrInst;
template <typename InstTy> class InterleaveGroup;
class Loop;
class ScalarEvolution;
class TargetLibraryInfo;
class TargetTransformInfo;
class Type;
class Value;

namespace Intrinsic {
enum ID : unsigned;
}

/// Identify if the intrinsic is trivially vectorizable.
/// This method returns true if the intrinsic's argument types are all scalars
/// for the scalar form of the intrinsic and all vectors (or scalars handled by
/// hasVectorInstrinsicScalarOpd) for the vector form of the intrinsic.
bool isTriviallyVectorizable(Intrinsic::ID ID);

/// Identifies if the vector form of the intrinsic has a scalar operand.
bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, unsigned ScalarOpdIdx);

/// Returns intrinsic ID for call.
/// For the input call instruction it finds the mapping intrinsic and returns
/// its intrinsic ID; if it does not find one, it returns not_intrinsic.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI,
                                          const TargetLibraryInfo *TLI);
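
// Illustrative usage sketch (assumes `CI` and `TLI` are available in the
// caller's context):
//
//   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
//   if (ID != Intrinsic::not_intrinsic && isTriviallyVectorizable(ID)) {
//     // The call can be widened by emitting the vector form of the intrinsic.
//   }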

/// Find the operand of the GEP that should be checked for consecutive
/// stores. This ignores trailing indices that have no effect on the final
/// pointer.
unsigned getGEPInductionOperand(const GetElementPtrInst *Gep);

/// If the argument is a GEP, then returns the operand identified by
/// getGEPInductionOperand. However, if there is some other non-loop-invariant
/// operand, it returns that instead.
Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp);

/// If a value has only one user that is a CastInst, return it.
Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty);

/// Get the stride of a pointer access in a loop. Looks for symbolic
/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp);

/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
Value *findScalarElement(Value *V, unsigned EltNo);

/// Get splat value if the input is a splat vector or return nullptr.
/// The value may be extracted from a splat constant vector or from
/// a sequence of instructions that broadcast a single value into a vector.
const Value *getSplatValue(const Value *V);

/// Return true if the input value is known to be a vector with all identical
/// elements (potentially including undefined elements).
/// This may be more powerful than the related getSplatValue() because it is
/// not limited by finding a scalar source value to a splatted vector.
bool isSplatValue(const Value *V, unsigned Depth = 0);
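
// Illustrative example (IR shown in a comment): the canonical
// insertelement + shufflevector broadcast below is recognized as a splat, so
// isSplatValue(%splat) returns true and getSplatValue(%splat) can return %x.
//
//   %ins   = insertelement <4 x i32> undef, i32 %x, i32 0
//   %splat = shufflevector <4 x i32> %ins, <4 x i32> undef,
//                          <4 x i32> zeroinitializer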

/// Compute a map of integer instructions to their minimum legal type
/// size.
///
/// C semantics force sub-int-sized values (e.g. i8, i16) to be promoted to int
/// type (e.g. i32) whenever arithmetic is performed on them.
///
/// For targets with native i8 or i16 operations, usually InstCombine can shrink
/// the arithmetic type down again. However InstCombine refuses to create
/// illegal types, so for targets without i8 or i16 registers, the lengthening
/// and shrinking remains.
///
/// Most SIMD ISAs (e.g. NEON) however support vectors of i8 or i16 even when
/// their scalar equivalents do not, so during vectorization it is important to
/// remove these lengthens and truncates when deciding the profitability of
/// vectorization.
///
/// This function analyzes the given range of instructions and determines the
/// minimum type size each can be converted to. It attempts to remove or
/// minimize type size changes across each def-use chain, so for example in the
/// following code:
///
///   %1 = load i8, i8*
///   %2 = add i8 %1, 2
///   %3 = load i16, i16*
///   %4 = zext i8 %2 to i32
///   %5 = zext i16 %3 to i32
///   %6 = add i32 %4, %5
///   %7 = trunc i32 %6 to i16
///
/// Instruction %6 must be done at least in i16, so computeMinimumValueSizes
/// will return: {%1: 16, %2: 16, %3: 16, %4: 16, %5: 16, %6: 16, %7: 16}.
///
/// If the optional TargetTransformInfo is provided, this function tries harder
/// to do less work by only looking at illegal types.
MapVector<Instruction*, uint64_t>
computeMinimumValueSizes(ArrayRef<BasicBlock*> Blocks,
                         DemandedBits &DB,
                         const TargetTransformInfo *TTI=nullptr);

/// Compute the union of two access-group lists.
///
/// If the list contains just one access group, it is returned directly. If the
/// list is empty, returns nullptr.
MDNode *uniteAccessGroups(MDNode *AccGroups1, MDNode *AccGroups2);

/// Compute the access-group list of access groups that @p Inst1 and @p Inst2
/// are both in. If either instruction does not access memory at all, it is
/// considered to be in every list.
///
/// If the list contains just one access group, it is returned directly. If the
/// list is empty, returns nullptr.
MDNode *intersectAccessGroups(const Instruction *Inst1,
                              const Instruction *Inst2);

/// Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath,
/// MD_nontemporal, MD_access_group].
/// For K in Kinds, we get the MDNode for K from each of the
/// elements of VL, compute their "intersection" (i.e., the most generic
/// metadata value that covers all of the individual values), and set I's
/// metadata for K equal to the intersection value.
///
/// This function always sets a (possibly null) value for each K in Kinds.
Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);

/// Create a mask that filters the members of an interleave group where there
/// are gaps.
///
/// For example, the mask for \p Group with interleave-factor 3
/// and \p VF 4, that has only its first member present is:
///
///   <1,0,0,1,0,0,1,0,0,1,0,0>
///
/// Note: The result is a mask of 0's and 1's, as opposed to the other
/// create[*]Mask() utilities which create a shuffle mask (mask that
/// consists of indices).
Constant *createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
                               const InterleaveGroup<Instruction> &Group);

/// Create a mask with replicated elements.
///
/// This function creates a shuffle mask for replicating each of the \p VF
/// elements in a vector \p ReplicationFactor times. It can be used to
/// transform a mask of \p VF elements into a mask of
/// \p VF * \p ReplicationFactor elements used by a predicated
/// interleaved-group of loads/stores whose Interleaved-factor ==
/// \p ReplicationFactor.
///
/// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is:
///
///   <0,0,0,1,1,1,2,2,2,3,3,3>
Constant *createReplicatedMask(IRBuilder<> &Builder, unsigned ReplicationFactor,
                               unsigned VF);

/// Create an interleave shuffle mask.
///
/// This function creates a shuffle mask for interleaving \p NumVecs vectors of
/// vectorization factor \p VF into a single wide vector. The mask is of the
/// form:
///
///   <0, VF, VF * 2, ..., VF * (NumVecs - 1), 1, VF + 1, VF * 2 + 1, ...>
///
/// For example, the mask for VF = 4 and NumVecs = 2 is:
///
///   <0, 4, 1, 5, 2, 6, 3, 7>.
Constant *createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
                               unsigned NumVecs);
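
// Illustrative usage sketch (assumes `Builder`, and two <4 x i32> values `V0`
// and `V1`, are available in the caller's context): interleave the two vectors
// lane-by-lane into a single <8 x i32> value.
//
//   Constant *Mask = createInterleaveMask(Builder, /*VF=*/4, /*NumVecs=*/2);
//   Value *Interleaved = Builder.CreateShuffleVector(V0, V1, Mask);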

/// Create a stride shuffle mask.
///
/// This function creates a shuffle mask whose elements begin at \p Start and
/// are incremented by \p Stride. The mask can be used to deinterleave an
/// interleaved vector into separate vectors of vectorization factor \p VF. The
/// mask is of the form:
///
///   <Start, Start + Stride, ..., Start + Stride * (VF - 1)>
///
/// For example, the mask for Start = 0, Stride = 2, and VF = 4 is:
///
///   <0, 2, 4, 6>
Constant *createStrideMask(IRBuilder<> &Builder, unsigned Start,
                           unsigned Stride, unsigned VF);
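
// Illustrative usage sketch (assumes `Builder` and a <8 x i32> value `Wide`
// holding two interleaved sequences): extract the even lanes.
//
//   Constant *EvenMask = createStrideMask(Builder, /*Start=*/0, /*Stride=*/2,
//                                         /*VF=*/4);
//   Value *Even = Builder.CreateShuffleVector(
//       Wide, UndefValue::get(Wide->getType()), EvenMask);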

/// Create a sequential shuffle mask.
///
/// This function creates a shuffle mask whose elements are sequential and begin
/// at \p Start. The mask contains \p NumInts integers and is padded with \p
/// NumUndefs undef values. The mask is of the form:
///
///   <Start, Start + 1, ... Start + NumInts - 1, undef_1, ... undef_NumUndefs>
///
/// For example, the mask for Start = 0, NumInts = 4, and NumUndefs = 4 is:
///
///   <0, 1, 2, 3, undef, undef, undef, undef>
Constant *createSequentialMask(IRBuilder<> &Builder, unsigned Start,
                               unsigned NumInts, unsigned NumUndefs);

/// Concatenate a list of vectors.
///
/// This function generates code that concatenates the vectors in \p Vecs into a
/// single large vector. The number of vectors should be greater than one, and
/// their element types should be the same. The number of elements in the
/// vectors should also be the same; however, if the last vector has fewer
/// elements, it will be padded with undefs.
Value *concatenateVectors(IRBuilder<> &Builder, ArrayRef<Value *> Vecs);

/// Given a mask vector of the form <Y x i1>, return true if all of the
/// elements of this predicate mask are false or undef. That is, return true
/// if all lanes can be assumed inactive.
bool maskIsAllZeroOrUndef(Value *Mask);

/// Given a mask vector of the form <Y x i1>, return true if all of the
/// elements of this predicate mask are true or undef. That is, return true
/// if all lanes can be assumed active.
bool maskIsAllOneOrUndef(Value *Mask);

/// Given a mask vector of the form <Y x i1>, return an APInt (of bitwidth Y)
/// for each lane which may be active.
APInt possiblyDemandedEltsInMask(Value *Mask);

/// The group of interleaved loads/stores sharing the same stride and
/// close to each other.
///
/// Each member in this group has an index starting from 0, and the largest
/// index should be less than the interleave factor, which is equal to the
/// absolute value of the access's stride.
///
/// E.g. An interleaved load group of factor 4:
///        for (unsigned i = 0; i < 1024; i+=4) {
///          a = A[i];                           // Member of index 0
///          b = A[i+1];                         // Member of index 1
///          d = A[i+3];                         // Member of index 3
///          ...
///        }
///
///      An interleaved store group of factor 4:
///        for (unsigned i = 0; i < 1024; i+=4) {
///          ...
///          A[i]   = a;                         // Member of index 0
///          A[i+1] = b;                         // Member of index 1
///          A[i+2] = c;                         // Member of index 2
///          A[i+3] = d;                         // Member of index 3
///        }
///
/// Note: the interleaved load group could have gaps (missing members), but
/// the interleaved store group doesn't allow gaps.
template <typename InstTy> class InterleaveGroup {
public:
  InterleaveGroup(uint32_t Factor, bool Reverse, Align Alignment)
      : Factor(Factor), Reverse(Reverse), Alignment(Alignment),
        InsertPos(nullptr) {}

  InterleaveGroup(InstTy *Instr, int32_t Stride, Align Alignment)
      : Alignment(Alignment), InsertPos(Instr) {
    Factor = std::abs(Stride);
    assert(Factor > 1 && "Invalid interleave factor");

    Reverse = Stride < 0;
    Members[0] = Instr;
  }

  bool isReverse() const { return Reverse; }
  uint32_t getFactor() const { return Factor; }
  uint32_t getAlignment() const { return Alignment.value(); }
  uint32_t getNumMembers() const { return Members.size(); }

  /// Try to insert a new member \p Instr with index \p Index and
  /// alignment \p NewAlign. The index is relative to the leader and it could
  /// be negative if it is the new leader.
  ///
  /// \returns false if the instruction doesn't belong to the group.
  bool insertMember(InstTy *Instr, int32_t Index, Align NewAlign) {
    // Make sure the key fits in an int32_t.
    Optional<int32_t> MaybeKey = checkedAdd(Index, SmallestKey);
    if (!MaybeKey)
      return false;
    int32_t Key = *MaybeKey;

    // Skip if there is already a member with the same index.
    if (Members.find(Key) != Members.end())
      return false;

    if (Key > LargestKey) {
      // The largest index is always less than the interleave factor.
      if (Index >= static_cast<int32_t>(Factor))
        return false;

      LargestKey = Key;
    } else if (Key < SmallestKey) {
      // Make sure the largest index fits in an int32_t.
      Optional<int32_t> MaybeLargestIndex = checkedSub(LargestKey, Key);
      if (!MaybeLargestIndex)
        return false;

      // The largest index is always less than the interleave factor.
      if (*MaybeLargestIndex >= static_cast<int64_t>(Factor))
        return false;

      SmallestKey = Key;
    }

    // It's always safe to select the minimum alignment.
    Alignment = std::min(Alignment, NewAlign);
    Members[Key] = Instr;
    return true;
  }

  /// Get the member with the given index \p Index
  ///
  /// \returns nullptr if the group contains no such member.
  InstTy *getMember(uint32_t Index) const {
    int32_t Key = SmallestKey + Index;
    auto Member = Members.find(Key);
    if (Member == Members.end())
      return nullptr;

    return Member->second;
  }

  /// Get the index for the given member. Unlike the key in the member
  /// map, the index starts from 0.
  uint32_t getIndex(const InstTy *Instr) const {
    for (auto I : Members) {
      if (I.second == Instr)
        return I.first - SmallestKey;
    }

    llvm_unreachable("InterleaveGroup contains no such member");
  }

  InstTy *getInsertPos() const { return InsertPos; }
  void setInsertPos(InstTy *Inst) { InsertPos = Inst; }

  /// Add metadata (e.g. alias info) from the instructions in this group to \p
  /// NewInst.
  ///
  /// FIXME: this function currently does not add noalias metadata a'la
  /// addNewMedata. To do that we need to compute the intersection of the
  /// noalias info from all members.
  void addMetadata(InstTy *NewInst) const;

  /// Returns true if this Group requires a scalar iteration to handle gaps.
  bool requiresScalarEpilogue() const {
    // If the last member of the Group exists, then a scalar epilog is not
    // needed for this group.
    if (getMember(getFactor() - 1))
      return false;

    // We have a group with gaps. It therefore cannot be a group of stores,
    // and it can't be a reversed access, because such groups get invalidated.
    assert(!getMember(0)->mayWriteToMemory() &&
           "Group should have been invalidated");
    assert(!isReverse() && "Group should have been invalidated");

    // This is a group of loads, with gaps, and without a last-member
    return true;
  }

private:
  uint32_t Factor; // Interleave Factor.
  bool Reverse;
  Align Alignment;
  DenseMap<int32_t, InstTy *> Members;
  int32_t SmallestKey = 0;
  int32_t LargestKey = 0;

  // To avoid breaking dependences, vectorized instructions of an interleave
  // group should be inserted at either the first load or the last store in
  // program order.
  //
  // E.g. %even = load i32             // Insert Position
  //      %add = add i32 %even         // Use of %even
  //      %odd = load i32
  //
  //      store i32 %even
  //      %odd = add i32               // Def of %odd
  //      store i32 %odd               // Insert Position
  InstTy *InsertPos;
};
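
// Illustrative usage sketch (assumes `Load0` and `Load1` are adjacent strided
// loads belonging to the same interleave group, with `Load0` as the leader):
//
//   InterleaveGroup<Instruction> Group(Load0, /*Stride=*/2, Align(4));
//   if (Group.insertMember(Load1, /*Index=*/1, Align(4)))
//     assert(Group.getMember(1) == Load1 && "member registered at index 1");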

/// Drive the analysis of interleaved memory accesses in the loop.
///
/// Use this class to analyze interleaved accesses only when we can vectorize
/// a loop. Otherwise it's meaningless to do analysis as the vectorization
/// on interleaved accesses is unsafe.
///
/// The analysis collects interleave groups and records the relationships
/// between the member and the group in a map.
class InterleavedAccessInfo {
public:
  InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
                        DominatorTree *DT, LoopInfo *LI,
                        const LoopAccessInfo *LAI)
      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}

  ~InterleavedAccessInfo() { reset(); }

  /// Analyze the interleaved accesses and collect them in interleave
  /// groups. Substitute symbolic strides using \p Strides.
  /// Consider also predicated loads/stores in the analysis if
  /// \p EnableMaskedInterleavedGroup is true.
  void analyzeInterleaving(bool EnableMaskedInterleavedGroup);

  /// Invalidate groups, e.g., in case all blocks in the loop will be
  /// predicated, contrary to the original assumption. Although we currently
  /// prevent group formation for predicated accesses, we may be able to relax
  /// this limitation in the future once we handle more complicated blocks.
  void reset() {
    SmallPtrSet<InterleaveGroup<Instruction> *, 4> DelSet;
    // Avoid releasing a pointer twice.
    for (auto &I : InterleaveGroupMap)
      DelSet.insert(I.second);
    for (auto *Ptr : DelSet)
      delete Ptr;
    InterleaveGroupMap.clear();
    RequiresScalarEpilogue = false;
  }

  /// Check if \p Instr belongs to any interleave group.
  bool isInterleaved(Instruction *Instr) const {
    return InterleaveGroupMap.find(Instr) != InterleaveGroupMap.end();
  }

  /// Get the interleave group that \p Instr belongs to.
  ///
  /// \returns nullptr if \p Instr does not belong to any group.
  InterleaveGroup<Instruction> *
  getInterleaveGroup(const Instruction *Instr) const {
    if (InterleaveGroupMap.count(Instr))
      return InterleaveGroupMap.find(Instr)->second;
    return nullptr;
  }

  iterator_range<SmallPtrSetIterator<llvm::InterleaveGroup<Instruction> *>>
  getInterleaveGroups() {
    return make_range(InterleaveGroups.begin(), InterleaveGroups.end());
  }

  /// Returns true if an interleaved group that may access memory
  /// out-of-bounds requires a scalar epilogue iteration for correctness.
  bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }

  /// Invalidate groups that require a scalar epilogue (due to gaps). This can
  /// happen when optimizing for size forbids a scalar epilogue, and the gap
  /// cannot be filtered by masking the load/store.
  void invalidateGroupsRequiringScalarEpilogue();
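
  // Illustrative usage sketch (assumes `PSE`, `L`, `DT`, `LI` and `LAI` are
  // available in the caller's context, as in the loop vectorizer;
  // `SomeLoadOrStore` is a hypothetical memory instruction in the loop):
  //
  //   InterleavedAccessInfo IAI(PSE, L, DT, LI, LAI);
  //   IAI.analyzeInterleaving(/*EnableMaskedInterleavedGroup=*/false);
  //   if (auto *Group = IAI.getInterleaveGroup(SomeLoadOrStore)) {
  //     unsigned Factor = Group->getFactor(); // interleave factor of the group
  //   }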

private:
  /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
  /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
  /// The interleaved access analysis can also add new predicates (for example
  /// by versioning strides of pointers).
  PredicatedScalarEvolution &PSE;

  Loop *TheLoop;
  DominatorTree *DT;
  LoopInfo *LI;
  const LoopAccessInfo *LAI;

  /// True if the loop may contain non-reversed interleaved groups with
  /// out-of-bounds accesses. We ensure we don't speculatively access memory
  /// out-of-bounds by executing at least one scalar epilogue iteration.
  bool RequiresScalarEpilogue = false;

  /// Holds the relationships between the members and the interleave group.
  DenseMap<Instruction *, InterleaveGroup<Instruction> *> InterleaveGroupMap;

  SmallPtrSet<InterleaveGroup<Instruction> *, 4> InterleaveGroups;

  /// Holds dependences among the memory accesses in the loop. It maps a source
  /// access to a set of dependent sink accesses.
  DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;

  /// The descriptor for a strided memory access.
  struct StrideDescriptor {
    StrideDescriptor() = default;
    StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
                     Align Alignment)
        : Stride(Stride), Scev(Scev), Size(Size), Alignment(Alignment) {}

    // The access's stride. It is negative for a reverse access.
    int64_t Stride = 0;

    // The scalar expression of this access.
    const SCEV *Scev = nullptr;

    // The size of the memory object.
    uint64_t Size = 0;

    // The alignment of this access.
    Align Alignment;
  };

  /// A type for holding instructions and their stride descriptors.
  using StrideEntry = std::pair<Instruction *, StrideDescriptor>;

  /// Create a new interleave group with the given instruction \p Instr,
  /// stride \p Stride and alignment \p Alignment.
  ///
  /// \returns the newly created interleave group.
  InterleaveGroup<Instruction> *
  createInterleaveGroup(Instruction *Instr, int Stride, Align Alignment) {
    assert(!InterleaveGroupMap.count(Instr) &&
           "Already in an interleaved access group");
    InterleaveGroupMap[Instr] =
        new InterleaveGroup<Instruction>(Instr, Stride, Alignment);
    InterleaveGroups.insert(InterleaveGroupMap[Instr]);
    return InterleaveGroupMap[Instr];
  }

  /// Release the group and remove all the relationships.
  void releaseGroup(InterleaveGroup<Instruction> *Group) {
    for (unsigned i = 0; i < Group->getFactor(); i++)
      if (Instruction *Member = Group->getMember(i))
        InterleaveGroupMap.erase(Member);

    InterleaveGroups.erase(Group);
    delete Group;
  }

  /// Collect all the accesses with a constant stride in program order.
  void collectConstStrideAccesses(
      MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
      const ValueToValueMap &Strides);

  /// Returns true if \p Stride is allowed in an interleaved group.
  static bool isStrided(int Stride);

  /// Returns true if \p BB is a predicated block.
  bool isPredicated(BasicBlock *BB) const {
    return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
  }

  /// Returns true if LoopAccessInfo can be used for dependence queries.
  bool areDependencesValid() const {
    return LAI && LAI->getDepChecker().getDependences();
  }

  /// Returns true if memory accesses \p A and \p B can be reordered, if
  /// necessary, when constructing interleaved groups.
  ///
  /// \p A must precede \p B in program order. We return false if reordering is
  /// not necessary or is prevented because \p A and \p B may be dependent.
  bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
                                                 StrideEntry *B) const {
    // Code motion for interleaved accesses can potentially hoist strided loads
    // and sink strided stores. The code below checks the legality of the
    // following two conditions:
    //
    // 1. Potentially moving a strided load (B) before any store (A) that
    //    precedes B, or
    //
    // 2. Potentially moving a strided store (A) after any load or store (B)
    //    that A precedes.
    //
    // It's legal to reorder A and B if we know there isn't a dependence from A
    // to B. Note that this determination is conservative since some
    // dependences could potentially be reordered safely.

    // A is potentially the source of a dependence.
    auto *Src = A->first;
    auto SrcDes = A->second;

    // B is potentially the sink of a dependence.
    auto *Sink = B->first;
    auto SinkDes = B->second;

    // Code motion for interleaved accesses can't violate WAR dependences.
    // Thus, reordering is legal if the source isn't a write.
    if (!Src->mayWriteToMemory())
      return true;

    // At least one of the accesses must be strided.
    if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
      return true;

    // If dependence information is not available from LoopAccessInfo,
    // conservatively assume the instructions can't be reordered.
    if (!areDependencesValid())
      return false;

    // If we know there is a dependence from source to sink, assume the
    // instructions can't be reordered. Otherwise, reordering is legal.
    return Dependences.find(Src) == Dependences.end() ||
           !Dependences.lookup(Src).count(Sink);
  }

  /// Collect the dependences from LoopAccessInfo.
  ///
  /// We process the dependences once during the interleaved access analysis to
  /// enable constant-time dependence queries.
  void collectDependences() {
    if (!areDependencesValid())
      return;
    auto *Deps = LAI->getDepChecker().getDependences();
    for (auto Dep : *Deps)
      Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
  }
};

} // llvm namespace

#endif