lib/Target/X86/X86InterleavedAccess.cpp

   1 //===- X86InterleavedAccess.cpp -------------------------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// This file contains the X86 implementation of the interleaved accesses
  12 /// optimization generating X86-specific instructions/intrinsics for
  13 /// interleaved access groups.
  14 //
  15 //===----------------------------------------------------------------------===//
  16
  17 #include "X86ISelLowering.h"
  18 #include "X86Subtarget.h"
  19 #include "llvm/ADT/ArrayRef.h"
  20 #include "llvm/ADT/SmallVector.h"
  21 #include "llvm/Analysis/VectorUtils.h"
  22 #include "llvm/IR/Constants.h"
  23 #include "llvm/IR/DataLayout.h"
  24 #include "llvm/IR/DerivedTypes.h"
  25 #include "llvm/IR/IRBuilder.h"
  26 #include "llvm/IR/Instruction.h"
  27 #include "llvm/IR/Instructions.h"
  28 #include "llvm/IR/Module.h"
  29 #include "llvm/IR/Type.h"
  30 #include "llvm/IR/Value.h"
  31 #include "llvm/Support/Casting.h"
  32 #include "llvm/Support/MachineValueType.h"
  33 #include <algorithm>
  34 #include <cassert>
  35 #include <cmath>
  36 #include <cstdint>
  37
  38 using namespace llvm;
  39
  40 namespace {
  41
  42 /// This class holds necessary information to represent an interleaved
  43 /// access group and supports utilities to lower the group into
  44 /// X86-specific instructions/intrinsics.
  45 ///  E.g. A group of interleaving access loads (Factor = 2; accessing every
  46 ///       other element)
  47 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
  48 ///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
  49 ///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
  50 class X86InterleavedAccessGroup {
  51   /// Reference to the wide-load instruction of an interleaved access
  52   /// group.
  53   Instruction *const Inst;
  54
  55   /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  56   ArrayRef<ShuffleVectorInst *> Shuffles;
  57
  58   /// Reference to the starting index of each user-shuffle.
  59   ArrayRef<unsigned> Indices;
  60
  61   /// Reference to the interleaving stride in terms of elements.
  62   const unsigned Factor;
  63
  64   /// Reference to the underlying target.
  65   const X86Subtarget &Subtarget;
  66
  67   const DataLayout &DL;
  68
  69   IRBuilder<> &Builder;
  70
  71   /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  72   /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  73   void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
  74                  SmallVectorImpl<Instruction *> &DecomposedVectors);
  75
  76   /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
  77   /// returns the transposed-vectors in \p TransposedVectors.
  78   /// E.g.
  79   /// InputVectors:
  80   ///   In-V0 = p1, p2, p3, p4
  81   ///   In-V1 = q1, q2, q3, q4
  82   ///   In-V2 = r1, r2, r3, r4
  83   ///   In-V3 = s1, s2, s3, s4
  84   /// OutputVectors:
  85   ///   Out-V0 = p1, q1, r1, s1
  86   ///   Out-V1 = p2, q2, r2, s2
  87   ///   Out-V2 = p3, q3, r3, s3
  88   ///   Out-V3 = P4, q4, r4, s4
  89   void transpose_4x4(ArrayRef<Instruction *> InputVectors,
  90                      SmallVectorImpl<Value *> &TransposedMatrix);
  91   void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
  92                              SmallVectorImpl<Value *> &TransposedMatrix,
  93                              unsigned NumSubVecElems);
  94   void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
  95                                 SmallVectorImpl<Value *> &TransposedMatrix);
  96   void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
  97                              SmallVectorImpl<Value *> &TransposedMatrix,
  98                              unsigned NumSubVecElems);
  99   void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
 100                                SmallVectorImpl<Value *> &TransposedMatrix,
 101                                unsigned NumSubVecElems);
 102
 103 public:
 104   /// In order to form an interleaved access group X86InterleavedAccessGroup
 105   /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
 106   /// \p Shuffs, reference to the first indices of each interleaved-vector
 107   /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
 108   /// X86-specific instructions/intrinsics it also requires the underlying
 109   /// target information \p STarget.
 110   explicit X86InterleavedAccessGroup(Instruction *I,
 111                                      ArrayRef<ShuffleVectorInst *> Shuffs,
 112                                      ArrayRef<unsigned> Ind, const unsigned F,
 113                                      const X86Subtarget &STarget,
 114                                      IRBuilder<> &B)
 115       : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
 116         DL(Inst->getModule()->getDataLayout()), Builder(B) {}
 117
 118   /// Returns true if this interleaved access group can be lowered into
 119   /// x86-specific instructions/intrinsics, false otherwise.
 120   bool isSupported() const;
 121
 122   /// Lowers this interleaved access group into X86-specific
 123   /// instructions/intrinsics.
 124   bool lowerIntoOptimizedSequence();
 125 };
 126
 127 } // end anonymous namespace
 128
 129 bool X86InterleavedAccessGroup::isSupported() const {
 130   VectorType *ShuffleVecTy = Shuffles[0]->getType();
 131   Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
 132   unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
 133   unsigned WideInstSize;
 134
 135   // Currently, lowering is supported for the following vectors:
 136   // Stride 4:
 137   //    1. Store and load of 4-element vectors of 64 bits on AVX.
 138   //    2. Store of 16/32-element vectors of 8 bits on AVX.
 139   // Stride 3:
 140   //    1. Load of 16/32-element vectors of 8 bits on AVX.
 141   if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
 142     return false;
 143
 144   if (isa<LoadInst>(Inst)) {
 145     WideInstSize = DL.getTypeSizeInBits(Inst->getType());
 146     if (cast<LoadInst>(Inst)->getPointerAddressSpace())
 147       return false;
 148   } else
 149     WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
 150
 151   // We support shuffle represents stride 4 for byte type with size of
 152   // WideInstSize.
 153   if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
 154      return true;
 155
 156   if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
 157       (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
 158        WideInstSize == 2048))
 159     return true;
 160
 161   if (ShuffleElemSize == 8 && Factor == 3 &&
 162       (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
 163     return true;
 164
 165   return false;
 166 }
 167
 168 void X86InterleavedAccessGroup::decompose(
 169     Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
 170     SmallVectorImpl<Instruction *> &DecomposedVectors) {
 171   assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
 172          "Expected Load or Shuffle");
 173
 174   Type *VecWidth = VecInst->getType();
 175   (void)VecWidth;
 176   assert(VecWidth->isVectorTy() &&
 177          DL.getTypeSizeInBits(VecWidth) >=
 178              DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
 179          "Invalid Inst-size!!!");
 180
 181   if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
 182     Value *Op0 = SVI->getOperand(0);
 183     Value *Op1 = SVI->getOperand(1);
 184
 185     // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
 186     for (unsigned i = 0; i < NumSubVectors; ++i)
 187       DecomposedVectors.push_back(
 188           cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
 189               Op0, Op1,
 190               createSequentialMask(Builder, Indices[i],
 191                                    SubVecTy->getVectorNumElements(), 0))));
 192     return;
 193   }
 194
 195   // Decompose the load instruction.
 196   LoadInst *LI = cast<LoadInst>(VecInst);
 197   Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
 198   Value *VecBasePtr;
 199   unsigned int NumLoads = NumSubVectors;
 200   // In the case of stride 3 with a vector of 32 elements load the information
 201   // in the following way:
 202   // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
 203   unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
 204   if (VecLength == 768 || VecLength == 1536) {
 205     Type *VecTran =
 206         VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
 207     VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
 208     NumLoads = NumSubVectors * (VecLength / 384);
 209   } else
 210     VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
 211   // Generate N loads of T type.
 212   for (unsigned i = 0; i < NumLoads; i++) {
 213     // TODO: Support inbounds GEP.
 214     Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
 215     Instruction *NewLoad =
 216         Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
 217     DecomposedVectors.push_back(NewLoad);
 218   }
 219 }
 220
 221 // Changing the scale of the vector type by reducing the number of elements and
 222 // doubling the scalar size.
 223 static MVT scaleVectorType(MVT VT) {
 224   unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
 225   return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
 226                           VT.getVectorNumElements() / 2);
 227 }
 228
 229 static uint32_t Concat[] = {
 230   0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
 231   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 232   32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
 233   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 };
 234
 235 // genShuffleBland - Creates shuffle according to two vectors.This function is
 236 // only works on instructions with lane inside 256 registers. According to
 237 // the mask 'Mask' creates a new Mask 'Out' by the offset of the mask. The
 238 // offset amount depends on the two integer, 'LowOffset' and 'HighOffset'.
 239 // Where the 'LowOffset' refers to the first vector and the highOffset refers to
 240 // the second vector.
 241 // |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
 242 // |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
 243 // |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
 244 // For the sequence to work as a mirror to the load.
 245 // We must consider the elements order as above.
 246 // In this function we are combining two types of shuffles.
 247 // The first one is vpshufed and the second is a type of "blend" shuffle.
 248 // By computing the shuffle on a sequence of 16 elements(one lane) and add the
 249 // correct offset. We are creating a vpsuffed + blend sequence between two
 250 // shuffles.
 251 static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
 252   SmallVectorImpl<uint32_t> &Out, int LowOffset,
 253   int HighOffset) {
 254   assert(VT.getSizeInBits() >= 256 &&
 255     "This function doesn't accept width smaller then 256");
 256   unsigned NumOfElm = VT.getVectorNumElements();
 257   for (unsigned i = 0; i < Mask.size(); i++)
 258     Out.push_back(Mask[i] + LowOffset);
 259   for (unsigned i = 0; i < Mask.size(); i++)
 260     Out.push_back(Mask[i] + HighOffset + NumOfElm);
 261 }
 262
 263 // reorderSubVector returns the data to is the original state. And de-facto is
 264 // the opposite of  the function concatSubVector.
 265
 266 // For VecElems = 16
 267 // Invec[0] -  |0|      TransposedMatrix[0] - |0|
 268 // Invec[1] -  |1|  =>  TransposedMatrix[1] - |1|
 269 // Invec[2] -  |2|      TransposedMatrix[2] - |2|
 270
 271 // For VecElems = 32
 272 // Invec[0] -  |0|3|      TransposedMatrix[0] - |0|1|
 273 // Invec[1] -  |1|4|  =>  TransposedMatrix[1] - |2|3|
 274 // Invec[2] -  |2|5|      TransposedMatrix[2] - |4|5|
 275
 276 // For VecElems = 64
 277 // Invec[0] -  |0|3|6|9 |     TransposedMatrix[0] - |0|1|2 |3 |
 278 // Invec[1] -  |1|4|7|10| =>  TransposedMatrix[1] - |4|5|6 |7 |
 279 // Invec[2] -  |2|5|8|11|     TransposedMatrix[2] - |8|9|10|11|
 280
 281 static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
 282   ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf,
 283   unsigned VecElems, unsigned Stride,
 284   IRBuilder<> Builder) {
 285
 286   if (VecElems == 16) {
 287     for (unsigned i = 0; i < Stride; i++)
 288       TransposedMatrix[i] = Builder.CreateShuffleVector(
 289         Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
 290     return;
 291   }
 292
 293   SmallVector<uint32_t, 32> OptimizeShuf;
 294   Value *Temp[8];
 295
 296   for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
 297     genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
 298       (i + 1) / Stride * 16);
 299     Temp[i / 2] = Builder.CreateShuffleVector(
 300       Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
 301     OptimizeShuf.clear();
 302   }
 303
 304   if (VecElems == 32) {
 305     std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
 306     return;
 307   }
 308   else
 309     for (unsigned i = 0; i < Stride; i++)
 310       TransposedMatrix[i] =
 311       Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
 312 }
 313
 314 void X86InterleavedAccessGroup::interleave8bitStride4VF8(
 315     ArrayRef<Instruction *> Matrix,
 316     SmallVectorImpl<Value *> &TransposedMatrix) {
 317   // Assuming we start from the following vectors:
 318   // Matrix[0]= c0 c1 c2 c3 c4 ... c7
 319   // Matrix[1]= m0 m1 m2 m3 m4 ... m7
 320   // Matrix[2]= y0 y1 y2 y3 y4 ... y7
 321   // Matrix[3]= k0 k1 k2 k3 k4 ... k7
 322
 323   MVT VT = MVT::v8i16;
 324   TransposedMatrix.resize(2);
 325   SmallVector<uint32_t, 16> MaskLow;
 326   SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
 327   SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;
 328
 329   for (unsigned i = 0; i < 8; ++i) {
 330     MaskLow.push_back(i);
 331     MaskLow.push_back(i + 8);
 332   }
 333
 334   createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
 335   createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
 336   scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
 337   scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
 338   // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
 339   // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
 340   Value *IntrVec1Low =
 341       Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
 342   Value *IntrVec2Low =
 343       Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
 344
 345   // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
 346   // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7
 347
 348   TransposedMatrix[0] =
 349       Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
 350   TransposedMatrix[1] =
 351       Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
 352 }
 353
 354 void X86InterleavedAccessGroup::interleave8bitStride4(
 355     ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
 356     unsigned NumOfElm) {
 357   // Example: Assuming we start from the following vectors:
 358   // Matrix[0]= c0 c1 c2 c3 c4 ... c31
 359   // Matrix[1]= m0 m1 m2 m3 m4 ... m31
 360   // Matrix[2]= y0 y1 y2 y3 y4 ... y31
 361   // Matrix[3]= k0 k1 k2 k3 k4 ... k31
 362
 363   MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
 364   MVT HalfVT = scaleVectorType(VT);
 365
 366   TransposedMatrix.resize(4);
 367   SmallVector<uint32_t, 32> MaskHigh;
 368   SmallVector<uint32_t, 32> MaskLow;
 369   SmallVector<uint32_t, 32> LowHighMask[2];
 370   SmallVector<uint32_t, 32> MaskHighTemp;
 371   SmallVector<uint32_t, 32> MaskLowTemp;
 372
 373   // MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
 374   // shuffle pattern.
 375
 376   createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);
 377   createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);
 378
 379   // MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
 380   // shuffle pattern.
 381
 382   createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp, true, false);
 383   createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp, false, false);
 384   scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]);
 385   scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]);
 386
 387   // IntrVec1Low  = c0  m0  c1  m1 ... c7  m7  | c16 m16 c17 m17 ... c23 m23
 388   // IntrVec1High = c8  m8  c9  m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
 389   // IntrVec2Low  = y0  k0  y1  k1 ... y7  k7  | y16 k16 y17 k17 ... y23 k23
 390   // IntrVec2High = y8  k8  y9  k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
 391   Value *IntrVec[4];
 392
 393   IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
 394   IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
 395   IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
 396   IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);
 397
 398   // cmyk4  cmyk5  cmyk6   cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
 399   // cmyk12 cmyk13 cmyk14  cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
 400   // cmyk0  cmyk1  cmyk2   cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
 401   // cmyk8  cmyk9  cmyk10  cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27
 402
 403   Value *VecOut[4];
 404   for (int i = 0; i < 4; i++)
 405     VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
 406                                             LowHighMask[i % 2]);
 407
 408   // cmyk0  cmyk1  cmyk2  cmyk3   | cmyk4  cmyk5  cmyk6  cmyk7
 409   // cmyk8  cmyk9  cmyk10 cmyk11  | cmyk12 cmyk13 cmyk14 cmyk15
 410   // cmyk16 cmyk17 cmyk18 cmyk19  | cmyk20 cmyk21 cmyk22 cmyk23
 411   // cmyk24 cmyk25 cmyk26 cmyk27  | cmyk28 cmyk29 cmyk30 cmyk31
 412
 413   if (VT == MVT::v16i8) {
 414     std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
 415     return;
 416   }
 417
 418   reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
 419                    NumOfElm, 4, Builder);
 420 }
 421
 422 //  createShuffleStride returns shuffle mask of size N.
 423 //  The shuffle pattern is as following :
 424 //  {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
 425 //  (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
 426 //  (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
 427 //  Where Lane is the # of lanes in a register:
 428 //  VectorSize = 128 => Lane = 1
 429 //  VectorSize = 256 => Lane = 2
 430 //  For example shuffle pattern for VF 16 register size 256 -> lanes = 2
 431 //  {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
 432 static void createShuffleStride(MVT VT, int Stride,
 433                                 SmallVectorImpl<uint32_t> &Mask) {
 434   int VectorSize = VT.getSizeInBits();
 435   int VF = VT.getVectorNumElements();
 436   int LaneCount = std::max(VectorSize / 128, 1);
 437   for (int Lane = 0; Lane < LaneCount; Lane++)
 438     for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
 439       Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
 440 }
 441
 442 //  setGroupSize sets 'SizeInfo' to the size(number of elements) of group
 443 //  inside mask a shuffleMask. A mask contains exactly 3 groups, where
 444 //  each group is a monotonically increasing sequence with stride 3.
 445 //  For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
 446 static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
 447   int VectorSize = VT.getSizeInBits();
 448   int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
 449   for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
 450     int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
 451     SizeInfo.push_back(GroupSize);
 452     FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
 453   }
 454 }
 455
 456 //  DecodePALIGNRMask returns the shuffle mask of vpalign instruction.
 457 //  vpalign works according to lanes
 458 //  Where Lane is the # of lanes in a register:
 459 //  VectorWide = 128 => Lane = 1
 460 //  VectorWide = 256 => Lane = 2
 461 //  For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
 462 //  For Lane = 2 shuffle pattern is:
 463 //  {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
 464 //  Imm variable sets the offset amount. The result of the
 465 //  function is stored inside ShuffleMask vector and it built as described in
 466 //  the begin of the description. AlignDirection is a boolean that indecat the
 467 //  direction of the alignment. (false - align to the "right" side while true -
 468 //  align to the "left" side)
 469 static void DecodePALIGNRMask(MVT VT, unsigned Imm,
 470                               SmallVectorImpl<uint32_t> &ShuffleMask,
 471                               bool AlignDirection = true, bool Unary = false) {
 472   unsigned NumElts = VT.getVectorNumElements();
 473   unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
 474   unsigned NumLaneElts = NumElts / NumLanes;
 475
 476   Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
 477   unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
 478
 479   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
 480     for (unsigned i = 0; i != NumLaneElts; ++i) {
 481       unsigned Base = i + Offset;
 482       // if i+offset is out of this lane then we actually need the other source
 483       // If Unary the other source is the first source.
 484       if (Base >= NumLaneElts)
 485         Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
 486       ShuffleMask.push_back(Base + l);
 487     }
 488   }
 489 }
 490
 491 // concatSubVector - The function rebuilds the data to a correct expected
 492 // order. An assumption(The shape of the matrix) was taken for the
 493 // deinterleaved to work with lane's instructions like 'vpalign' or 'vphuf'.
 494 // This function ensures that the data is built in correct way for the lane
 495 // instructions. Each lane inside the vector is a 128-bit length.
 496 //
 497 // The 'InVec' argument contains the data in increasing order. In InVec[0] You
 498 // can find the first 128 bit data. The number of different lanes inside a
 499 // vector depends on the 'VecElems'.In general, the formula is
 500 // VecElems * type / 128. The size of the array 'InVec' depends and equal to
 501 // 'VecElems'.
 502
 503 // For VecElems = 16
 504 // Invec[0] - |0|      Vec[0] - |0|
 505 // Invec[1] - |1|  =>  Vec[1] - |1|
 506 // Invec[2] - |2|      Vec[2] - |2|
 507
 508 // For VecElems = 32
 509 // Invec[0] - |0|1|      Vec[0] - |0|3|
 510 // Invec[1] - |2|3|  =>  Vec[1] - |1|4|
 511 // Invec[2] - |4|5|      Vec[2] - |2|5|
 512
 513 // For VecElems = 64
 514 // Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
 515 // Invec[1] - |4|5|6 |7 |  =>  Vec[1] - |1|4|7|10|
 516 // Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|
 517
 518 static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
 519                             unsigned VecElems, IRBuilder<> Builder) {
 520   if (VecElems == 16) {
 521     for (int i = 0; i < 3; i++)
 522       Vec[i] = InVec[i];
 523     return;
 524   }
 525
 526   for (unsigned j = 0; j < VecElems / 32; j++)
 527     for (int i = 0; i < 3; i++)
 528       Vec[i + j * 3] = Builder.CreateShuffleVector(
 529           InVec[j * 6 + i], InVec[j * 6 + i + 3], makeArrayRef(Concat, 32));
 530
 531   if (VecElems == 32)
 532     return;
 533
 534   for (int i = 0; i < 3; i++)
 535     Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
 536 }
 537
 538 void X86InterleavedAccessGroup::deinterleave8bitStride3(
 539     ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
 540     unsigned VecElems) {
 541   // Example: Assuming we start from the following vectors:
 542   // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
 543   // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
 544   // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7
 545
 546   TransposedMatrix.resize(3);
 547   SmallVector<uint32_t, 32> VPShuf;
 548   SmallVector<uint32_t, 32> VPAlign[2];
 549   SmallVector<uint32_t, 32> VPAlign2;
 550   SmallVector<uint32_t, 32> VPAlign3;
 551   SmallVector<uint32_t, 3> GroupSize;
 552   Value *Vec[6], *TempVector[3];
 553
 554   MVT VT = MVT::getVT(Shuffles[0]->getType());
 555
 556   createShuffleStride(VT, 3, VPShuf);
 557   setGroupSize(VT, GroupSize);
 558
 559   for (int i = 0; i < 2; i++)
 560     DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);
 561
 562   DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
 563   DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);
 564
 565   concatSubVector(Vec, InVec, VecElems, Builder);
 566   // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
 567   // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
 568   // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
 569
 570   for (int i = 0; i < 3; i++)
 571     Vec[i] = Builder.CreateShuffleVector(
 572         Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);
 573
 574   // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
 575   // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
 576   // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7
 577
 578   for (int i = 0; i < 3; i++)
 579     TempVector[i] =
 580         Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);
 581
 582   // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
 583   // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
 584   // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
 585
 586   for (int i = 0; i < 3; i++)
 587     Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
 588                                          VPAlign[1]);
 589
 590   // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
 591   // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
 592   // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
 593
 594   Value *TempVec = Builder.CreateShuffleVector(
 595       Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
 596   TransposedMatrix[0] = Builder.CreateShuffleVector(
 597       Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
 598   TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
 599   TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
 600 }
 601
 602 // group2Shuffle reorder the shuffle stride back into continuous order.
 603 // For example For VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
 604 // MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
 605 static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
 606                           SmallVectorImpl<uint32_t> &Output) {
 607   int IndexGroup[3] = {0, 0, 0};
 608   int Index = 0;
 609   int VectorWidth = VT.getSizeInBits();
 610   int VF = VT.getVectorNumElements();
 611   // Find the index of the different groups.
 612   int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
 613   for (int i = 0; i < 3; i++) {
 614     IndexGroup[(Index * 3) % (VF / Lane)] = Index;
 615     Index += Mask[i];
 616   }
 617   // According to the index compute the convert mask.
 618   for (int i = 0; i < VF / Lane; i++) {
 619     Output.push_back(IndexGroup[i % 3]);
 620     IndexGroup[i % 3]++;
 621   }
 622 }
 623
 624 void X86InterleavedAccessGroup::interleave8bitStride3(
 625     ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
 626     unsigned VecElems) {
 627   // Example: Assuming we start from the following vectors:
 628   // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
 629   // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
 630   // Matrix[2]= c0 c1 c2 c3 c3 a7 b7 c7
 631
 632   TransposedMatrix.resize(3);
 633   SmallVector<uint32_t, 3> GroupSize;
 634   SmallVector<uint32_t, 32> VPShuf;
 635   SmallVector<uint32_t, 32> VPAlign[3];
 636   SmallVector<uint32_t, 32> VPAlign2;
 637   SmallVector<uint32_t, 32> VPAlign3;
 638
 639   Value *Vec[3], *TempVector[3];
 640   MVT VT = MVT::getVectorVT(MVT::i8, VecElems);
 641
 642   setGroupSize(VT, GroupSize);
 643
 644   for (int i = 0; i < 3; i++)
 645     DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);
 646
 647   DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
 648   DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);
 649
 650   // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
 651   // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
 652   // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
 653
 654   Vec[0] = Builder.CreateShuffleVector(
 655       InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
 656   Vec[1] = Builder.CreateShuffleVector(
 657       InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
 658   Vec[2] = InVec[2];
 659
 660   // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
 661   // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
 662   // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7
 663
 664   for (int i = 0; i < 3; i++)
 665     TempVector[i] =
 666         Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);
 667
 668   // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
 669   // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
 670   // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
 671
 672   for (int i = 0; i < 3; i++)
 673     Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
 674                                          VPAlign[2]);
 675
 676   // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
 677   // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
 678   // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7
 679
 680   unsigned NumOfElm = VT.getVectorNumElements();
 681   group2Shuffle(VT, GroupSize, VPShuf);
 682   reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm,3, Builder);
 683 }
 684
 685 void X86InterleavedAccessGroup::transpose_4x4(
 686     ArrayRef<Instruction *> Matrix,
 687     SmallVectorImpl<Value *> &TransposedMatrix) {
 688   assert(Matrix.size() == 4 && "Invalid matrix size");
 689   TransposedMatrix.resize(4);
 690
 691   // dst = src1[0,1],src2[0,1]
 692   uint32_t IntMask1[] = {0, 1, 4, 5};
 693   ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
 694   Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
 695   Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
 696
 697   // dst = src1[2,3],src2[2,3]
 698   uint32_t IntMask2[] = {2, 3, 6, 7};
 699   Mask = makeArrayRef(IntMask2, 4);
 700   Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
 701   Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
 702
 703   // dst = src1[0],src2[0],src1[2],src2[2]
 704   uint32_t IntMask3[] = {0, 4, 2, 6};
 705   Mask = makeArrayRef(IntMask3, 4);
 706   TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
 707   TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
 708
 709   // dst = src1[1],src2[1],src1[3],src2[3]
 710   uint32_t IntMask4[] = {1, 5, 3, 7};
 711   Mask = makeArrayRef(IntMask4, 4);
 712   TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
 713   TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
 714 }
 715
 716 // Lowers this interleaved access group into X86-specific
 717 // instructions/intrinsics.
 718 bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
 719   SmallVector<Instruction *, 4> DecomposedVectors;
 720   SmallVector<Value *, 4> TransposedVectors;
 721   VectorType *ShuffleTy = Shuffles[0]->getType();
 722
 723   if (isa<LoadInst>(Inst)) {
 724     // Try to generate target-sized register(/instruction).
 725     decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
 726
 727     Type *ShuffleEltTy = Inst->getType();
 728     unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
 729     // Perform matrix-transposition in order to compute interleaved
 730     // results by generating some sort of (optimized) target-specific
 731     // instructions.
 732
 733     switch (NumSubVecElems) {
 734     default:
 735       return false;
 736     case 4:
 737       transpose_4x4(DecomposedVectors, TransposedVectors);
 738       break;
 739     case 8:
 740     case 16:
 741     case 32:
 742     case 64:
 743       deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
 744                               NumSubVecElems);
 745       break;
 746     }
 747
 748     // Now replace the unoptimized-interleaved-vectors with the
 749     // transposed-interleaved vectors.
 750     for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
 751       Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
 752
 753     return true;
 754   }
 755
 756   Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
 757   unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;
 758
 759   // Lower the interleaved stores:
 760   //   1. Decompose the interleaved wide shuffle into individual shuffle
 761   //   vectors.
 762   decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
 763             DecomposedVectors);
 764
 765   //   2. Transpose the interleaved-vectors into vectors of contiguous
 766   //      elements.
 767   switch (NumSubVecElems) {
 768   case 4:
 769     transpose_4x4(DecomposedVectors, TransposedVectors);
 770     break;
 771   case 8:
 772     interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
 773     break;
 774   case 16:
 775   case 32:
 776   case 64:
 777     if (Factor == 4)
 778       interleave8bitStride4(DecomposedVectors, TransposedVectors,
 779                             NumSubVecElems);
 780     if (Factor == 3)
 781       interleave8bitStride3(DecomposedVectors, TransposedVectors,
 782                             NumSubVecElems);
 783     break;
 784   default:
 785     return false;
 786   }
 787
 788   //   3. Concatenate the contiguous-vectors back into a wide vector.
 789   Value *WideVec = concatenateVectors(Builder, TransposedVectors);
 790
 791   //   4. Generate a store instruction for wide-vec.
 792   StoreInst *SI = cast<StoreInst>(Inst);
 793   Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
 794                              SI->getAlignment());
 795
 796   return true;
 797 }
 798
 799 // Lower interleaved load(s) into target specific instructions/
 800 // intrinsics. Lowering sequence varies depending on the vector-types, factor,
 801 // number of shuffles and ISA.
 802 // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
 803 bool X86TargetLowering::lowerInterleavedLoad(
 804     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
 805     ArrayRef<unsigned> Indices, unsigned Factor) const {
 806   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
 807          "Invalid interleave factor");
 808   assert(!Shuffles.empty() && "Empty shufflevector input");
 809   assert(Shuffles.size() == Indices.size() &&
 810          "Unmatched number of shufflevectors and indices");
 811
 812   // Create an interleaved access group.
 813   IRBuilder<> Builder(LI);
 814   X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
 815                                 Builder);
 816
 817   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
 818 }
 819
 820 bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
 821                                               ShuffleVectorInst *SVI,
 822                                               unsigned Factor) const {
 823   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
 824          "Invalid interleave factor");
 825
 826   assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
 827          "Invalid interleaved store");
 828
 829   // Holds the indices of SVI that correspond to the starting index of each
 830   // interleaved shuffle.
 831   SmallVector<unsigned, 4> Indices;
 832   auto Mask = SVI->getShuffleMask();
 833   for (unsigned i = 0; i < Factor; i++)
 834     Indices.push_back(Mask[i]);
 835
 836   ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);
 837
 838   // Create an interleaved access group.
 839   IRBuilder<> Builder(SI);
 840   X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
 841                                 Builder);
 842
 843   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
 844 }