//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MachineValueType.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

using namespace llvm;

namespace {

/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
/// other element)
///   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///   %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
///   %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
  /// Reference to the wide-load instruction of an interleaved access
  /// group.
  Instruction *const Inst;

  /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// Reference to the interleaving stride in terms of elements.
  const unsigned Factor;

  /// Reference to the underlying target.
  const X86Subtarget &Subtarget;

  const DataLayout &DL;

  IRBuilder<> &Builder;

  /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);
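
  // A minimal sketch of what decompose() yields for the Factor = 2 load
  // example above (illustrative only; value names are hypothetical):
  //   %v0 = load <4 x i32>, <4 x i32>* %ptr0   ; elements 0..3
  //   %v1 = load <4 x i32>, <4 x i32>* %ptr1   ; elements 4..7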

  /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = p4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group, X86InterleavedAccessGroup
  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  /// \p Shuffs, a reference to the first indices of each interleaved-vector
  /// \p 'Ind', and the interleaving stride factor \p F. In order to generate
  /// X86-specific instructions/intrinsics, it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getModule()->getDataLayout()), Builder(B) {}

  /// Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};

} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Currently, lowering is supported for the following vectors:
  // Stride 4:
  //   1. Store and load of 4-element vectors of 64 bits on AVX.
  //   2. Store of 16/32-element vectors of 8 bits on AVX.
  // Stride 3:
  //   1. Load of 16/32-element vectors of 8 bits on AVX.
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst)) {
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
    if (cast<LoadInst>(Inst)->getPointerAddressSpace())
      return false;
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // The checks below match the supported cases listed above against the
  // shuffle element size, the total width WideInstSize, and the stride
  // factor.
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
       WideInstSize == 2048))
    return true;

  if (ShuffleElemSize == 8 && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
    return true;

  return false;
}

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecWidth = VecInst->getType();
  (void)VecWidth;
  assert(VecWidth->isVectorTy() &&
         DL.getTypeSizeInBits(VecWidth) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Builder, Indices[i],
                                   SubVecTy->getVectorNumElements(), 0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBaseTy, *VecBasePtrTy;
  Value *VecBasePtr;
  unsigned int NumLoads = NumSubVectors;
  // In the case of stride 3 with a vector of 32 elements, load the data in
  // the following way:
  // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
  unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
  if (VecLength == 768 || VecLength == 1536) {
    VecBaseTy = VectorType::get(Type::getInt8Ty(LI->getContext()), 16);
    VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
    NumLoads = NumSubVectors * (VecLength / 384);
  } else {
    VecBaseTy = SubVecTy;
    VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
  }
  // Generate N loads of T type.
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr =
        Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlignment());
    DecomposedVectors.push_back(NewLoad);
  }
}

// Changing the scale of the vector type by reducing the number of elements and
// doubling the scalar size.
static MVT scaleVectorType(MVT VT) {
  unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
  return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
                          VT.getVectorNumElements() / 2);
}
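
// For instance (illustrative): scaleVectorType(v32i8) yields v16i16 -- half
// as many elements, each twice as wide, so the total bit width is preserved.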

static uint32_t Concat[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
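
// Note: Concat is the identity mask 0..63; shuffling two N-element vectors
// with the first 2N entries of it simply concatenates them.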

// genShuffleBland - Creates a shuffle mask that blends two vectors. This
// function only works on instructions whose lanes lie inside 256-bit
// registers. From the mask 'Mask' it creates a new mask 'Out' by applying an
// offset to each element. The offset amount depends on the two integers
// 'LowOffset' and 'HighOffset', where 'LowOffset' refers to the first vector
// and 'HighOffset' refers to the second vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to work as a mirror of the load, we must consider the
// element order as above.
// In this function we combine two kinds of shuffles: the first one is a
// vpshufd and the second is a "blend"-style shuffle. By computing the
// shuffle on a sequence of 16 elements (one lane) and adding the correct
// offset, we create a vpshufd + blend sequence between two shuffles.
static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
                            SmallVectorImpl<uint32_t> &Out, int LowOffset,
                            int HighOffset) {
  assert(VT.getSizeInBits() >= 256 &&
         "This function doesn't accept width smaller than 256");
  unsigned NumOfElm = VT.getVectorNumElements();
  for (unsigned i = 0; i < Mask.size(); i++)
    Out.push_back(Mask[i] + LowOffset);
  for (unsigned i = 0; i < Mask.size(); i++)
    Out.push_back(Mask[i] + HighOffset + NumOfElm);
}
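
// A worked example (values hypothetical): with Mask = {0, 3, 6, 1},
// LowOffset = 0, HighOffset = 16 and VT = v32i8 (NumOfElm = 32), Out becomes
// {0, 3, 6, 1, 48, 51, 54, 49} -- the first half indexes the first vector,
// the second half indexes the second vector shifted by HighOffset + NumOfElm.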

// reorderSubVector returns the data to its original state, and is de facto
// the inverse of the function concatSubVector.
//
// For VecElems = 16
// Invec[0] - |0|          TransposedMatrix[0] - |0|
// Invec[1] - |1|    =>    TransposedMatrix[1] - |1|
// Invec[2] - |2|          TransposedMatrix[2] - |2|
//
// For VecElems = 32
// Invec[0] - |0|3|        TransposedMatrix[0] - |0|1|
// Invec[1] - |1|4|   =>   TransposedMatrix[1] - |2|3|
// Invec[2] - |2|5|        TransposedMatrix[2] - |4|5|
//
// For VecElems = 64
// Invec[0] - |0|3|6|9 |      TransposedMatrix[0] - |0|1|2 |3 |
// Invec[1] - |1|4|7|10|  =>  TransposedMatrix[1] - |4|5|6 |7 |
// Invec[2] - |2|5|8|11|      TransposedMatrix[2] - |8|9|10|11|
static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> Builder) {

  if (VecElems == 16) {
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(
          Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
    return;
  }

  SmallVector<uint32_t, 32> OptimizeShuf;
  Value *Temp[8];

  for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
    genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
                    (i + 1) / Stride * 16);
    Temp[i / 2] = Builder.CreateShuffleVector(
        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
    OptimizeShuf.clear();
  }

  if (VecElems == 32) {
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
    return;
  }
  for (unsigned i = 0; i < Stride; i++)
    TransposedMatrix[i] =
        Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}

void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
  // Matrix[3]= k0 k1 k2 k3 k4 ... k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<uint32_t, 16> MaskLow;
  SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;

  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
  scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
  scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}

void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c31
  // Matrix[1]= m0 m1 m2 m3 m4 ... m31
  // Matrix[2]= y0 y1 y2 y3 y4 ... y31
  // Matrix[3]= k0 k1 k2 k3 k4 ... k31

  MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<uint32_t, 32> MaskHigh;
  SmallVector<uint32_t, 32> MaskLow;
  SmallVector<uint32_t, 32> LowHighMask[2];
  SmallVector<uint32_t, 32> MaskHighTemp;
  SmallVector<uint32_t, 32> MaskLowTemp;

  // MaskLow and MaskHigh are built to match the vpunpcklbw and vpunpckhbw
  // X86 shuffle patterns.
  createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);
  createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);

  // MaskLowTemp and MaskHighTemp are built to match the vpunpckldw and
  // vpunpckhdw X86 shuffle patterns, then scaled into LowHighMask.
  createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp, true, false);
  createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp, false, false);
  scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]);
  scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]);

  // IntrVec1Low  = c0 m0 c1 m1 ... c7 m7   | c16 m16 c17 m17 ... c23 m23
  // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
  // IntrVec2Low  = y0 k0 y1 k1 ... y7 k7   | y16 k16 y17 k17 ... y23 k23
  // IntrVec2High = y8 k8 y9 k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
  Value *IntrVec[4];

  IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // cmyk4  cmyk5  cmyk6  cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk12 cmyk13 cmyk14 cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

  Value *VecOut[4];
  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            LowHighMask[i % 2]);

  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk4  cmyk5  cmyk6  cmyk7
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15
  // cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk24 cmyk25 cmyk26 cmyk27 | cmyk28 cmyk29 cmyk30 cmyk31

  if (VT == MVT::v16i8) {
    std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
    return;
  }

  reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
                   NumOfElm, 4, Builder);
}

// createShuffleStride returns a shuffle mask of size N.
// The shuffle pattern is as follows:
// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
//  (VF/Lane), (VF/Lane)+Stride%(VF/Lane), ...,
//  (VF/Lane)+(VF*Stride/Lane)%(VF/Lane)}
// where Lane is the # of lanes in a register:
// VectorSize = 128 => Lane = 1
// VectorSize = 256 => Lane = 2
// For example, the shuffle pattern for VF 16 with register size 256
// (lanes = 2) is:
// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<uint32_t> &Mask) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}
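
// A quick sanity trace (illustrative): for VT = v16i8 (one 128-bit lane) and
// Stride = 3, LaneSize = 16 and Mask[i] = (i * 3) % 16, giving
// {0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13}.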

// setGroupSize sets 'SizeInfo' to the size (number of elements) of each
// group inside a shuffle mask. A mask contains exactly 3 groups, where
// each group is a monotonically increasing sequence with stride 3.
// For example, shuffle mask {0,3,6,1,4,7,2,5} => {3,3,2}
static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
    SizeInfo.push_back(GroupSize);
    FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
  }
}
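
// Tracing the example above (VF = 8): GroupSize = ceil(8/3) = 3 and
// FirstGroupElement = 9 % 8 = 1; then GroupSize = ceil((8-1)/3) = 3 and
// FirstGroupElement = 10 % 8 = 2; finally GroupSize = ceil((8-2)/3) = 2,
// hence {3,3,2}.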

// DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
// vpalignr works on a per-lane basis, where Lane is the # of lanes in a
// register:
// VectorWide = 128 => Lane = 1
// VectorWide = 256 => Lane = 2
// For Lane = 1 the shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
// For Lane = 2 the shuffle pattern is:
// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
// The 'Imm' variable sets the offset amount. The result of the function is
// stored in the 'ShuffleMask' vector, built as described above.
// AlignDirection is a boolean that indicates the direction of the alignment
// (false - align to the "right" side while true - align to the "left" side).
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<uint32_t> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // If i+Offset is out of this lane then we actually need the other
      // source. If Unary, the other source is the first source.
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}
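
// For example (illustrative): VT = v16i8, Imm = 5, AlignDirection = true and
// Unary = false give Offset = 5 and a single 16-element lane, so ShuffleMask
// = {5, 6, ..., 15, 16, ..., 20}; indices >= 16 select from the second
// source, matching how vpalignr concatenates and shifts.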

// concatSubVector - The function rebuilds the data to the correct expected
// order. An assumption about the shape of the matrix was made so that the
// deinterleave works with per-lane instructions like 'vpalignr' or
// 'vpshufd'. This function ensures that the data is built the correct way
// for those lane instructions. Each lane inside the vector is 128 bits long.
//
// The 'InVec' argument contains the data in increasing order: InVec[0] holds
// the first 128 bits of data. The number of different lanes inside a vector
// depends on 'VecElems'; in general, the formula is VecElems * type / 128.
// The size of the array 'InVec' equals 'VecElems'.
//
// For VecElems = 16
// Invec[0] - |0|           Vec[0] - |0|
// Invec[1] - |1|    =>     Vec[1] - |1|
// Invec[2] - |2|           Vec[2] - |2|
//
// For VecElems = 32
// Invec[0] - |0|1|         Vec[0] - |0|3|
// Invec[1] - |2|3|   =>    Vec[1] - |1|4|
// Invec[2] - |4|5|         Vec[2] - |2|5|
//
// For VecElems = 64
// Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
// Invec[1] - |4|5|6 |7 |  =>  Vec[1] - |1|4|7|10|
// Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|
static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> Builder) {
  if (VecElems == 16) {
    for (int i = 0; i < 3; i++)
      Vec[i] = InVec[i];
    return;
  }

  for (unsigned j = 0; j < VecElems / 32; j++)
    for (int i = 0; i < 3; i++)
      Vec[i + j * 3] = Builder.CreateShuffleVector(
          InVec[j * 6 + i], InVec[j * 6 + i + 3], makeArrayRef(Concat, 32));

  if (VecElems == 32)
    return;

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
}

void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<uint32_t, 32> VPShuf;
  SmallVector<uint32_t, 32> VPAlign[2];
  SmallVector<uint32_t, 32> VPAlign2;
  SmallVector<uint32_t, 32> VPAlign3;
  SmallVector<uint32_t, 3> GroupSize;
  Value *Vec[6], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  concatSubVector(Vec, InVec, VecElems, Builder);
  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(
        Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);

  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  Value *TempVec = Builder.CreateShuffleVector(
      Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(
      Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}

// group2Shuffle reorders the shuffle stride back into contiguous order.
// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13}
// => MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
                          SmallVectorImpl<uint32_t> &Output) {
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  int VectorWidth = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  // Find the index of the different groups.
  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
  for (int i = 0; i < 3; i++) {
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += Mask[i];
  }
  // According to the index, compute the convert mask.
  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
}
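
// In practice the function receives the group sizes computed by setGroupSize
// (e.g. {6,5,5} for VF16 in a single lane). Tracing that case: the first
// loop sets IndexGroup[0] = 0, IndexGroup[(6*3) % 16 = 2] = 6 and
// IndexGroup[(11*3) % 16 = 1] = 11, so the second loop emits
// {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}, as in the example above.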

void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  TransposedMatrix.resize(3);
  SmallVector<uint32_t, 3> GroupSize;
  SmallVector<uint32_t, 32> VPShuf;
  SmallVector<uint32_t, 32> VPAlign[3];
  SmallVector<uint32_t, 32> VPAlign2;
  SmallVector<uint32_t, 32> VPAlign3;

  Value *Vec[3], *TempVector[3];
  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  Vec[0] = Builder.CreateShuffleVector(
      InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(
      InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
  Vec[2] = InVec[2];

  // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);

  // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
  // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
  // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7

  unsigned NumOfElm = VT.getVectorNumElements();
  group2Shuffle(VT, GroupSize, VPShuf);
  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}

void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  assert(Matrix.size() == 4 && "Invalid matrix size");
  TransposedMatrix.resize(4);

  // dst = src1[0,1],src2[0,1]
  uint32_t IntMask1[] = {0, 1, 4, 5};
  ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[2,3],src2[2,3]
  uint32_t IntMask2[] = {2, 3, 6, 7};
  Mask = makeArrayRef(IntMask2, 4);
  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[0],src2[0],src1[2],src2[2]
  uint32_t IntMask3[] = {0, 4, 2, 6};
  Mask = makeArrayRef(IntMask3, 4);
  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);

  // dst = src1[1],src2[1],src1[3],src2[3]
  uint32_t IntMask4[] = {1, 5, 3, 7};
  Mask = makeArrayRef(IntMask4, 4);
  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
}

// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  VectorType *ShuffleTy = Shuffles[0]->getType();

  if (isa<LoadInst>(Inst)) {
    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    Type *ShuffleEltTy = Inst->getType();
    unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.

    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
      transpose_4x4(DecomposedVectors, TransposedVectors);
      break;
    case 8:
    case 16:
    case 32:
    case 64:
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);
      break;
    }

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
  unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //      vectors.
  decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
  case 64:
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
                             SI->getAlignment());

  return true;
}

// Lower interleaved load(s) into target specific instructions/
// intrinsics. Lowering sequence varies depending on the vector-types, factor,
// number of shuffles and ISA.
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  // Create an interleaved access group.
  IRBuilder<> Builder(LI);
  X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  // Holds the indices of SVI that correspond to the starting index of each
  // interleaved shuffle.
  SmallVector<unsigned, 4> Indices;
  auto Mask = SVI->getShuffleMask();
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(Mask[i]);

  ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);

  // Create an interleaved access group.
  IRBuilder<> Builder(SI);
  X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}