//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
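
// True when the type at TypeIdx fits in MaxSize bits and its scalar element
// size is a whole multiple of 32 bits, i.e. it maps cleanly onto 32-bit
// registers.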
static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}
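
// Mutation that grows the vector type at TypeIdx by a single element, keeping
// the element type.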
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}
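
// Mutation that reduces the element count so the vector splits into pieces of
// at most ~64 bits each, keeping the element type.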
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// registers.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
            (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}
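
// The legalization rules for every generic opcode the target supports are
// declared in this constructor, keyed on opcode and type indices.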
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), });

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };
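
  // Predicate used below: a load must be broken up if it is a vector extload,
  // exceeds the per-address-space limit above, needs an unsupported dwordx3
  // access, or is under-aligned for the subtarget.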
  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});

    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            });

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
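
// Produce a 32-bit register holding the aperture (the high half of the 64-bit
// flat address) for the given LDS or private address space, either from the
// aperture hardware registers or by loading it from the queue pointer.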
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}
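
// G_FRINT on f64 is lowered with the usual add-and-subtract-of-2^52 trick:
// adding a copysigned 2^52 and subtracting it again rounds to an integer in
// the current rounding mode, while inputs whose magnitude is already at or
// above ~2^52 are passed through unchanged.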
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}
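
// Pull the biased exponent field out of the high 32 bits of an f64 value with
// the ubfe intrinsic and subtract the exponent bias of 1023.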
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}
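
// [SU]ITOFP from a 64-bit integer is expanded as two 32-bit conversions: the
// high half is converted and scaled by 2^32 with ldexp, then the unsigned
// conversion of the low half is added in.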
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
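
// Hardware sin/cos operate on input scaled by 1/(2*pi), so multiply the input
// accordingly (taking a fract on subtargets with a reduced trig range) before
// emitting the amdgcn.sin / amdgcn.cos intrinsic.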
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
      .addUse(TrigVal)
      .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  // TODO: Always legal with future ftz flag.
  if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
    return true;

  MachineFunction &MF = B.getMF();

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}
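
// Copy a preloaded argument register into DstReg. If the ABI packs the value
// into part of the register, shift and mask it out first.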
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}
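// Top-level FDIV legalization: try the fast, unsafe expansion; report that
// nothing changed if it does not apply.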
bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  B.setInstr(MI);

  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  return false;
}
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT ResTy = MRI.getType(Res);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  bool Unsafe =
    MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

  if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals())
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}
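// Expand the amdgcn.fdiv.fast intrinsic: when |RHS| exceeds roughly 2^96
// (0x6f800000) the denominator is pre-scaled by 2^-32 (0x2f800000) before the
// reciprocal so it cannot overflow, and the quotient is multiplied by the same
// scale factor afterwards.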
bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}
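// The implicit argument pointer is a separate preloaded SGPR argument for
// callable functions; in kernels it is computed as the kernarg segment
// pointer plus the offset of the first implicit argument.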
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}
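// amdgcn.is.shared / amdgcn.is.private: a flat pointer is in the given
// segment if its high 32 bits equal that segment's aperture base.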
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}
/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}
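// Legalize the data operand of a raw buffer store: widen sub-32-bit scalars
// with an anyext and, for format stores of 16-bit vectors, repack the data on
// subtargets with unpacked D16 memory instructions.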
bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }
  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}
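// Custom legalization for target intrinsics that cannot be described by the
// declarative legality rules.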
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);