lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

   1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 /// \file
   9 /// This file implements the targeting of the Machinelegalizer class for
  10 /// AMDGPU.
  11 /// \todo This should be generated by TableGen.
  12 //===----------------------------------------------------------------------===//
  13
  14 #if defined(_MSC_VER) || defined(__MINGW32__)
  15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
  16 // from the Visual C++ cmath / math.h headers:
  17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
  18 #define _USE_MATH_DEFINES
  19 #endif
  20
  21 #include "AMDGPU.h"
  22 #include "AMDGPULegalizerInfo.h"
  23 #include "AMDGPUTargetMachine.h"
  24 #include "SIMachineFunctionInfo.h"
  25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
  26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
  27 #include "llvm/CodeGen/TargetOpcodes.h"
  28 #include "llvm/CodeGen/ValueTypes.h"
  29 #include "llvm/IR/DerivedTypes.h"
  30 #include "llvm/IR/DiagnosticInfo.h"
  31 #include "llvm/IR/Type.h"
  32 #include "llvm/Support/Debug.h"
  33
  34 #define DEBUG_TYPE "amdgpu-legalinfo"
  35
  36 using namespace llvm;
  37 using namespace LegalizeActions;
  38 using namespace LegalizeMutations;
  39 using namespace LegalityPredicates;
  40
  41
  42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
  43                                       unsigned MaxSize = 1024) {
  44   return [=](const LegalityQuery &Query) {
  45     const LLT Ty = Query.Types[TypeIdx];
  46     const LLT EltTy = Ty.getScalarType();
  47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  48   };
  49 }
  50
  51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  52   return [=](const LegalityQuery &Query) {
  53     return Query.Types[TypeIdx].getSizeInBits() == Size;
  54   };
  55 }
  56
  57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  58   return [=](const LegalityQuery &Query) {
  59     const LLT Ty = Query.Types[TypeIdx];
  60     return Ty.isVector() &&
  61            Ty.getNumElements() % 2 != 0 &&
  62            Ty.getElementType().getSizeInBits() < 32 &&
  63            Ty.getSizeInBits() % 32 != 0;
  64   };
  65 }
  66
  67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  68   return [=](const LegalityQuery &Query) {
  69     const LLT Ty = Query.Types[TypeIdx];
  70     const LLT EltTy = Ty.getScalarType();
  71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  72   };
  73 }
  74
  75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  76   return [=](const LegalityQuery &Query) {
  77     const LLT Ty = Query.Types[TypeIdx];
  78     const LLT EltTy = Ty.getElementType();
  79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  80   };
  81 }
  82
  83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  84   return [=](const LegalityQuery &Query) {
  85     const LLT Ty = Query.Types[TypeIdx];
  86     const LLT EltTy = Ty.getElementType();
  87     unsigned Size = Ty.getSizeInBits();
  88     unsigned Pieces = (Size + 63) / 64;
  89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
  90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  91   };
  92 }
  93
  94 // Increase the number of vector elements to reach the next multiple of 32-bit
  95 // type.
  96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  97   return [=](const LegalityQuery &Query) {
  98     const LLT Ty = Query.Types[TypeIdx];
  99
 100     const LLT EltTy = Ty.getElementType();
 101     const int Size = Ty.getSizeInBits();
 102     const int EltSize = EltTy.getSizeInBits();
 103     const int NextMul32 = (Size + 31) / 32;
 104
 105     assert(EltSize < 32);
 106
 107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
 108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
 109   };
 110 }
 111
 112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
 113   return [=](const LegalityQuery &Query) {
 114     const LLT QueryTy = Query.Types[TypeIdx];
 115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
 116   };
 117 }
 118
 119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
 120   return [=](const LegalityQuery &Query) {
 121     const LLT QueryTy = Query.Types[TypeIdx];
 122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
 123   };
 124 }
 125
 126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
 127   return [=](const LegalityQuery &Query) {
 128     const LLT QueryTy = Query.Types[TypeIdx];
 129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
 130   };
 131 }
 132
 133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
 134 // v2s16.
 135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
 136   return [=](const LegalityQuery &Query) {
 137     const LLT Ty = Query.Types[TypeIdx];
 138     if (Ty.isVector()) {
 139       const int EltSize = Ty.getElementType().getSizeInBits();
 140       return EltSize == 32 || EltSize == 64 ||
 141             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
 142              EltSize == 128 || EltSize == 256;
 143     }
 144
 145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
 146   };
 147 }
 148
 149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
 150   return [=](const LegalityQuery &Query) {
 151     return Query.Types[TypeIdx].getElementType() == Type;
 152   };
 153 }
 154
 155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
 156   return [=](const LegalityQuery &Query) {
 157     const LLT Ty = Query.Types[TypeIdx];
 158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
 159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
 160   };
 161 }
 162
 163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
 164                                          const GCNTargetMachine &TM)
 165   :  ST(ST_) {
 166   using namespace TargetOpcode;
 167
 168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
 169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
 170   };
 171
 172   const LLT S1 = LLT::scalar(1);
 173   const LLT S8 = LLT::scalar(8);
 174   const LLT S16 = LLT::scalar(16);
 175   const LLT S32 = LLT::scalar(32);
 176   const LLT S64 = LLT::scalar(64);
 177   const LLT S96 = LLT::scalar(96);
 178   const LLT S128 = LLT::scalar(128);
 179   const LLT S256 = LLT::scalar(256);
 180   const LLT S1024 = LLT::scalar(1024);
 181
 182   const LLT V2S16 = LLT::vector(2, 16);
 183   const LLT V4S16 = LLT::vector(4, 16);
 184
 185   const LLT V2S32 = LLT::vector(2, 32);
 186   const LLT V3S32 = LLT::vector(3, 32);
 187   const LLT V4S32 = LLT::vector(4, 32);
 188   const LLT V5S32 = LLT::vector(5, 32);
 189   const LLT V6S32 = LLT::vector(6, 32);
 190   const LLT V7S32 = LLT::vector(7, 32);
 191   const LLT V8S32 = LLT::vector(8, 32);
 192   const LLT V9S32 = LLT::vector(9, 32);
 193   const LLT V10S32 = LLT::vector(10, 32);
 194   const LLT V11S32 = LLT::vector(11, 32);
 195   const LLT V12S32 = LLT::vector(12, 32);
 196   const LLT V13S32 = LLT::vector(13, 32);
 197   const LLT V14S32 = LLT::vector(14, 32);
 198   const LLT V15S32 = LLT::vector(15, 32);
 199   const LLT V16S32 = LLT::vector(16, 32);
 200   const LLT V32S32 = LLT::vector(32, 32);
 201
 202   const LLT V2S64 = LLT::vector(2, 64);
 203   const LLT V3S64 = LLT::vector(3, 64);
 204   const LLT V4S64 = LLT::vector(4, 64);
 205   const LLT V5S64 = LLT::vector(5, 64);
 206   const LLT V6S64 = LLT::vector(6, 64);
 207   const LLT V7S64 = LLT::vector(7, 64);
 208   const LLT V8S64 = LLT::vector(8, 64);
 209   const LLT V16S64 = LLT::vector(16, 64);
 210
 211   std::initializer_list<LLT> AllS32Vectors =
 212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
 213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
 214   std::initializer_list<LLT> AllS64Vectors =
 215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
 216
 217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
 218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
 219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
 220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
 221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
 222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
 223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
 224
 225   const LLT CodePtr = FlatPtr;
 226
 227   const std::initializer_list<LLT> AddrSpaces64 = {
 228     GlobalPtr, ConstantPtr, FlatPtr
 229   };
 230
 231   const std::initializer_list<LLT> AddrSpaces32 = {
 232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
 233   };
 234
 235   const std::initializer_list<LLT> FPTypesBase = {
 236     S32, S64
 237   };
 238
 239   const std::initializer_list<LLT> FPTypes16 = {
 240     S32, S64, S16
 241   };
 242
 243   const std::initializer_list<LLT> FPTypesPK16 = {
 244     S32, S64, S16, V2S16
 245   };
 246
 247   setAction({G_BRCOND, S1}, Legal);
 248
 249   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
 250   // elements for v3s16
 251   getActionDefinitionsBuilder(G_PHI)
 252     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
 253     .legalFor(AllS32Vectors)
 254     .legalFor(AllS64Vectors)
 255     .legalFor(AddrSpaces64)
 256     .legalFor(AddrSpaces32)
 257     .clampScalar(0, S32, S256)
 258     .widenScalarToNextPow2(0, 32)
 259     .clampMaxNumElements(0, S32, 16)
 260     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 261     .legalIf(isPointer(0));
 262
 263   if (ST.has16BitInsts()) {
 264     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
 265       .legalFor({S32, S16})
 266       .clampScalar(0, S16, S32)
 267       .scalarize(0);
 268   } else {
 269     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
 270       .legalFor({S32})
 271       .clampScalar(0, S32, S32)
 272       .scalarize(0);
 273   }
 274
 275   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
 276     .legalFor({S32})
 277     .clampScalar(0, S32, S32)
 278     .scalarize(0);
 279
 280   // Report legal for any types we can handle anywhere. For the cases only legal
 281   // on the SALU, RegBankSelect will be able to re-legalize.
 282   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
 283     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
 284     .clampScalar(0, S32, S64)
 285     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 286     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
 287     .widenScalarToNextPow2(0)
 288     .scalarize(0);
 289
 290   getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
 291                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
 292     .legalFor({{S32, S1}})
 293     .clampScalar(0, S32, S32)
 294     .scalarize(0); // TODO: Implement.
 295
 296   getActionDefinitionsBuilder(G_BITCAST)
 297     // Don't worry about the size constraint.
 298     .legalIf(all(isRegisterType(0), isRegisterType(1)))
 299     // FIXME: Testing hack
 300     .legalForCartesianProduct({S16, LLT::vector(2, 8), });
 301
 302   getActionDefinitionsBuilder(G_FCONSTANT)
 303     .legalFor({S32, S64, S16})
 304     .clampScalar(0, S16, S64);
 305
 306   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
 307     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
 308                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
 309     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 310     .clampScalarOrElt(0, S32, S1024)
 311     .legalIf(isMultiple32(0))
 312     .widenScalarToNextPow2(0, 32)
 313     .clampMaxNumElements(0, S32, 16);
 314
 315
 316   // FIXME: i1 operands to intrinsics should always be legal, but other i1
 317   // values may not be legal.  We need to figure out how to distinguish
 318   // between these two scenarios.
 319   getActionDefinitionsBuilder(G_CONSTANT)
 320     .legalFor({S1, S32, S64, S16, GlobalPtr,
 321                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
 322     .clampScalar(0, S32, S64)
 323     .widenScalarToNextPow2(0)
 324     .legalIf(isPointer(0));
 325
 326   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
 327   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
 328     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
 329
 330
 331   auto &FPOpActions = getActionDefinitionsBuilder(
 332     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
 333     .legalFor({S32, S64});
 334   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
 335     .customFor({S32, S64});
 336
 337   if (ST.has16BitInsts()) {
 338     if (ST.hasVOP3PInsts())
 339       FPOpActions.legalFor({S16, V2S16});
 340     else
 341       FPOpActions.legalFor({S16});
 342
 343     TrigActions.customFor({S16});
 344   }
 345
 346   auto &MinNumMaxNum = getActionDefinitionsBuilder({
 347       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
 348
 349   if (ST.hasVOP3PInsts()) {
 350     MinNumMaxNum.customFor(FPTypesPK16)
 351       .clampMaxNumElements(0, S16, 2)
 352       .clampScalar(0, S16, S64)
 353       .scalarize(0);
 354   } else if (ST.has16BitInsts()) {
 355     MinNumMaxNum.customFor(FPTypes16)
 356       .clampScalar(0, S16, S64)
 357       .scalarize(0);
 358   } else {
 359     MinNumMaxNum.customFor(FPTypesBase)
 360       .clampScalar(0, S32, S64)
 361       .scalarize(0);
 362   }
 363
 364   if (ST.hasVOP3PInsts())
 365     FPOpActions.clampMaxNumElements(0, S16, 2);
 366
 367   FPOpActions
 368     .scalarize(0)
 369     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
 370
 371   TrigActions
 372     .scalarize(0)
 373     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
 374
 375   getActionDefinitionsBuilder({G_FNEG, G_FABS})
 376     .legalFor(FPTypesPK16)
 377     .clampMaxNumElements(0, S16, 2)
 378     .scalarize(0)
 379     .clampScalar(0, S16, S64);
 380
 381   // TODO: Implement
 382   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
 383
 384   if (ST.has16BitInsts()) {
 385     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
 386       .legalFor({S32, S64, S16})
 387       .scalarize(0)
 388       .clampScalar(0, S16, S64);
 389   } else {
 390     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
 391       .legalFor({S32, S64})
 392       .scalarize(0)
 393       .clampScalar(0, S32, S64);
 394   }
 395
 396   getActionDefinitionsBuilder(G_FPTRUNC)
 397     .legalFor({{S32, S64}, {S16, S32}})
 398     .scalarize(0);
 399
 400   getActionDefinitionsBuilder(G_FPEXT)
 401     .legalFor({{S64, S32}, {S32, S16}})
 402     .lowerFor({{S64, S16}}) // FIXME: Implement
 403     .scalarize(0);
 404
 405   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
 406   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
 407
 408   getActionDefinitionsBuilder(G_FSUB)
 409       // Use actual fsub instruction
 410       .legalFor({S32})
 411       // Must use fadd + fneg
 412       .lowerFor({S64, S16, V2S16})
 413       .scalarize(0)
 414       .clampScalar(0, S32, S64);
 415
 416   // Whether this is legal depends on the floating point mode for the function.
 417   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
 418   if (ST.hasMadF16())
 419     FMad.customFor({S32, S16});
 420   else
 421     FMad.customFor({S32});
 422   FMad.scalarize(0)
 423       .lower();
 424
 425   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
 426     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
 427                {S32, S1}, {S64, S1}, {S16, S1},
 428                {S96, S32},
 429                // FIXME: Hack
 430                {S64, LLT::scalar(33)},
 431                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
 432     .scalarize(0);
 433
 434   // TODO: Split s1->s64 during regbankselect for VALU.
 435   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
 436     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
 437     .lowerFor({{S32, S64}})
 438     .customFor({{S64, S64}});
 439   if (ST.has16BitInsts())
 440     IToFP.legalFor({{S16, S16}});
 441   IToFP.clampScalar(1, S32, S64)
 442        .scalarize(0);
 443
 444   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
 445     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
 446   if (ST.has16BitInsts())
 447     FPToI.legalFor({{S16, S16}});
 448   else
 449     FPToI.minScalar(1, S32);
 450
 451   FPToI.minScalar(0, S32)
 452        .scalarize(0);
 453
 454   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
 455     .legalFor({S32, S64})
 456     .scalarize(0);
 457
 458   if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
 459     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
 460       .legalFor({S32, S64})
 461       .clampScalar(0, S32, S64)
 462       .scalarize(0);
 463   } else {
 464     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
 465       .legalFor({S32})
 466       .customFor({S64})
 467       .clampScalar(0, S32, S64)
 468       .scalarize(0);
 469   }
 470
 471   getActionDefinitionsBuilder(G_GEP)
 472     .legalForCartesianProduct(AddrSpaces64, {S64})
 473     .legalForCartesianProduct(AddrSpaces32, {S32})
 474     .scalarize(0);
 475
 476   getActionDefinitionsBuilder(G_PTR_MASK)
 477     .scalarize(0)
 478     .alwaysLegal();
 479
 480   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
 481
 482   auto &CmpBuilder =
 483     getActionDefinitionsBuilder(G_ICMP)
 484     .legalForCartesianProduct(
 485       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
 486     .legalFor({{S1, S32}, {S1, S64}});
 487   if (ST.has16BitInsts()) {
 488     CmpBuilder.legalFor({{S1, S16}});
 489   }
 490
 491   CmpBuilder
 492     .widenScalarToNextPow2(1)
 493     .clampScalar(1, S32, S64)
 494     .scalarize(0)
 495     .legalIf(all(typeIs(0, S1), isPointer(1)));
 496
 497   getActionDefinitionsBuilder(G_FCMP)
 498     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
 499     .widenScalarToNextPow2(1)
 500     .clampScalar(1, S32, S64)
 501     .scalarize(0);
 502
 503   // FIXME: fexp, flog2, flog10 needs to be custom lowered.
 504   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
 505                                G_FLOG, G_FLOG2, G_FLOG10})
 506     .legalFor({S32})
 507     .scalarize(0);
 508
 509   // The 64-bit versions produce 32-bit results, but only on the SALU.
 510   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
 511                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
 512                                G_CTPOP})
 513     .legalFor({{S32, S32}, {S32, S64}})
 514     .clampScalar(0, S32, S32)
 515     .clampScalar(1, S32, S64)
 516     .scalarize(0)
 517     .widenScalarToNextPow2(0, 32)
 518     .widenScalarToNextPow2(1, 32);
 519
 520   // TODO: Expand for > s32
 521   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
 522     .legalFor({S32})
 523     .clampScalar(0, S32, S32)
 524     .scalarize(0);
 525
 526   if (ST.has16BitInsts()) {
 527     if (ST.hasVOP3PInsts()) {
 528       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
 529         .legalFor({S32, S16, V2S16})
 530         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 531         .clampMaxNumElements(0, S16, 2)
 532         .clampScalar(0, S16, S32)
 533         .widenScalarToNextPow2(0)
 534         .scalarize(0);
 535     } else {
 536       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
 537         .legalFor({S32, S16})
 538         .widenScalarToNextPow2(0)
 539         .clampScalar(0, S16, S32)
 540         .scalarize(0);
 541     }
 542   } else {
 543     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
 544       .legalFor({S32})
 545       .clampScalar(0, S32, S32)
 546       .widenScalarToNextPow2(0)
 547       .scalarize(0);
 548   }
 549
 550   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
 551     return [=](const LegalityQuery &Query) {
 552       return Query.Types[TypeIdx0].getSizeInBits() <
 553              Query.Types[TypeIdx1].getSizeInBits();
 554     };
 555   };
 556
 557   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
 558     return [=](const LegalityQuery &Query) {
 559       return Query.Types[TypeIdx0].getSizeInBits() >
 560              Query.Types[TypeIdx1].getSizeInBits();
 561     };
 562   };
 563
 564   getActionDefinitionsBuilder(G_INTTOPTR)
 565     // List the common cases
 566     .legalForCartesianProduct(AddrSpaces64, {S64})
 567     .legalForCartesianProduct(AddrSpaces32, {S32})
 568     .scalarize(0)
 569     // Accept any address space as long as the size matches
 570     .legalIf(sameSize(0, 1))
 571     .widenScalarIf(smallerThan(1, 0),
 572       [](const LegalityQuery &Query) {
 573         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
 574       })
 575     .narrowScalarIf(greaterThan(1, 0),
 576       [](const LegalityQuery &Query) {
 577         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
 578       });
 579
 580   getActionDefinitionsBuilder(G_PTRTOINT)
 581     // List the common cases
 582     .legalForCartesianProduct(AddrSpaces64, {S64})
 583     .legalForCartesianProduct(AddrSpaces32, {S32})
 584     .scalarize(0)
 585     // Accept any address space as long as the size matches
 586     .legalIf(sameSize(0, 1))
 587     .widenScalarIf(smallerThan(0, 1),
 588       [](const LegalityQuery &Query) {
 589         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
 590       })
 591     .narrowScalarIf(
 592       greaterThan(0, 1),
 593       [](const LegalityQuery &Query) {
 594         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
 595       });
 596
 597   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
 598     .scalarize(0)
 599     .custom();
 600
 601   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
 602   // handle some operations by just promoting the register during
 603   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
 604   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
 605     switch (AS) {
 606     // FIXME: Private element size.
 607     case AMDGPUAS::PRIVATE_ADDRESS:
 608       return 32;
 609     // FIXME: Check subtarget
 610     case AMDGPUAS::LOCAL_ADDRESS:
 611       return ST.useDS128() ? 128 : 64;
 612
 613     // Treat constant and global as identical. SMRD loads are sometimes usable
 614     // for global loads (ideally constant address space should be eliminated)
 615     // depending on the context. Legality cannot be context dependent, but
 616     // RegBankSelect can split the load as necessary depending on the pointer
 617     // register bank/uniformity and if the memory is invariant or not written in
 618     // a kernel.
 619     case AMDGPUAS::CONSTANT_ADDRESS:
 620     case AMDGPUAS::GLOBAL_ADDRESS:
 621       return 512;
 622     default:
 623       return 128;
 624     }
 625   };
 626
 627   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
 628     const LLT DstTy = Query.Types[0];
 629
 630     // Split vector extloads.
 631     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
 632     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
 633       return true;
 634
 635     const LLT PtrTy = Query.Types[1];
 636     unsigned AS = PtrTy.getAddressSpace();
 637     if (MemSize > maxSizeForAddrSpace(AS))
 638       return true;
 639
 640     // Catch weird sized loads that don't evenly divide into the access sizes
 641     // TODO: May be able to widen depending on alignment etc.
 642     unsigned NumRegs = MemSize / 32;
 643     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
 644       return true;
 645
 646     unsigned Align = Query.MMODescrs[0].AlignInBits;
 647     if (Align < MemSize) {
 648       const SITargetLowering *TLI = ST.getTargetLowering();
 649       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
 650     }
 651
 652     return false;
 653   };
 654
 655   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
 656   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
 657   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
 658
 659   // TODO: Refine based on subtargets which support unaligned access or 128-bit
 660   // LDS
 661   // TODO: Unsupported flat for SI.
 662
 663   for (unsigned Op : {G_LOAD, G_STORE}) {
 664     const bool IsStore = Op == G_STORE;
 665
 666     auto &Actions = getActionDefinitionsBuilder(Op);
 667     // Whitelist the common cases.
 668     // TODO: Pointer loads
 669     // TODO: Wide constant loads
 670     // TODO: Only CI+ has 3x loads
 671     // TODO: Loads to s16 on gfx9
 672     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
 673                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
 674                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
 675                                       {S96, GlobalPtr, 96, GlobalAlign32},
 676                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
 677                                       {S128, GlobalPtr, 128, GlobalAlign32},
 678                                       {S64, GlobalPtr, 64, GlobalAlign32},
 679                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
 680                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
 681                                       {S32, GlobalPtr, 8, GlobalAlign8},
 682                                       {S32, GlobalPtr, 16, GlobalAlign16},
 683
 684                                       {S32, LocalPtr, 32, 32},
 685                                       {S64, LocalPtr, 64, 32},
 686                                       {V2S32, LocalPtr, 64, 32},
 687                                       {S32, LocalPtr, 8, 8},
 688                                       {S32, LocalPtr, 16, 16},
 689                                       {V2S16, LocalPtr, 32, 32},
 690
 691                                       {S32, PrivatePtr, 32, 32},
 692                                       {S32, PrivatePtr, 8, 8},
 693                                       {S32, PrivatePtr, 16, 16},
 694                                       {V2S16, PrivatePtr, 32, 32},
 695
 696                                       {S32, FlatPtr, 32, GlobalAlign32},
 697                                       {S32, FlatPtr, 16, GlobalAlign16},
 698                                       {S32, FlatPtr, 8, GlobalAlign8},
 699                                       {V2S16, FlatPtr, 32, GlobalAlign32},
 700
 701                                       {S32, ConstantPtr, 32, GlobalAlign32},
 702                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
 703                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
 704                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
 705                                       {S64, ConstantPtr, 64, GlobalAlign32},
 706                                       {S128, ConstantPtr, 128, GlobalAlign32},
 707                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
 708     Actions
 709         .customIf(typeIs(1, Constant32Ptr))
 710         .narrowScalarIf(
 711             [=](const LegalityQuery &Query) -> bool {
 712               return !Query.Types[0].isVector() && needToSplitLoad(Query);
 713             },
 714             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
 715               const LLT DstTy = Query.Types[0];
 716               const LLT PtrTy = Query.Types[1];
 717
 718               const unsigned DstSize = DstTy.getSizeInBits();
 719               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
 720
 721               // Split extloads.
 722               if (DstSize > MemSize)
 723                 return std::make_pair(0, LLT::scalar(MemSize));
 724
 725               if (DstSize > 32 && (DstSize % 32 != 0)) {
 726                 // FIXME: Need a way to specify non-extload of larger size if
 727                 // suitably aligned.
 728                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
 729               }
 730
 731               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
 732               if (MemSize > MaxSize)
 733                 return std::make_pair(0, LLT::scalar(MaxSize));
 734
 735               unsigned Align = Query.MMODescrs[0].AlignInBits;
 736               return std::make_pair(0, LLT::scalar(Align));
 737             })
 738         .fewerElementsIf(
 739             [=](const LegalityQuery &Query) -> bool {
 740               return Query.Types[0].isVector() && needToSplitLoad(Query);
 741             },
 742             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
 743               const LLT DstTy = Query.Types[0];
 744               const LLT PtrTy = Query.Types[1];
 745
 746               LLT EltTy = DstTy.getElementType();
 747               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
 748
 749               // Split if it's too large for the address space.
 750               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
 751                 unsigned NumElts = DstTy.getNumElements();
 752                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
 753
 754                 // FIXME: Refine when odd breakdowns handled
 755                 // The scalars will need to be re-legalized.
 756                 if (NumPieces == 1 || NumPieces >= NumElts ||
 757                     NumElts % NumPieces != 0)
 758                   return std::make_pair(0, EltTy);
 759
 760                 return std::make_pair(0,
 761                                       LLT::vector(NumElts / NumPieces, EltTy));
 762               }
 763
 764               // Need to split because of alignment.
 765               unsigned Align = Query.MMODescrs[0].AlignInBits;
 766               unsigned EltSize = EltTy.getSizeInBits();
 767               if (EltSize > Align &&
 768                   (EltSize / Align < DstTy.getNumElements())) {
 769                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
 770               }
 771
 772               // May need relegalization for the scalars.
 773               return std::make_pair(0, EltTy);
 774             })
 775         .minScalar(0, S32);
 776
 777     if (IsStore)
 778       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
 779
 780     // TODO: Need a bitcast lower option?
 781     Actions
 782         .legalIf([=](const LegalityQuery &Query) {
 783           const LLT Ty0 = Query.Types[0];
 784           unsigned Size = Ty0.getSizeInBits();
 785           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
 786           unsigned Align = Query.MMODescrs[0].AlignInBits;
 787
 788           // No extending vector loads.
 789           if (Size > MemSize && Ty0.isVector())
 790             return false;
 791
 792           // FIXME: Widening store from alignment not valid.
 793           if (MemSize < Size)
 794             MemSize = std::max(MemSize, Align);
 795
 796           switch (MemSize) {
 797           case 8:
 798           case 16:
 799             return Size == 32;
 800           case 32:
 801           case 64:
 802           case 128:
 803             return true;
 804           case 96:
 805             return ST.hasDwordx3LoadStores();
 806           case 256:
 807           case 512:
 808             return true;
 809           default:
 810             return false;
 811           }
 812         })
 813         .widenScalarToNextPow2(0)
 814         // TODO: v3s32->v4s32 with alignment
 815         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
 816   }
 817
 818   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
 819                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
 820                                                   {S32, GlobalPtr, 16, 2 * 8},
 821                                                   {S32, LocalPtr, 8, 8},
 822                                                   {S32, LocalPtr, 16, 16},
 823                                                   {S32, PrivatePtr, 8, 8},
 824                                                   {S32, PrivatePtr, 16, 16},
 825                                                   {S32, ConstantPtr, 8, 8},
 826                                                   {S32, ConstantPtr, 16, 2 * 8}});
 827   if (ST.hasFlatAddressSpace()) {
 828     ExtLoads.legalForTypesWithMemDesc(
 829         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
 830   }
 831
 832   ExtLoads.clampScalar(0, S32, S32)
 833           .widenScalarToNextPow2(0)
 834           .unsupportedIfMemSizeNotPow2()
 835           .lower();
 836
 837   auto &Atomics = getActionDefinitionsBuilder(
 838     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
 839      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
 840      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
 841      G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
 842     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
 843                {S64, GlobalPtr}, {S64, LocalPtr}});
 844   if (ST.hasFlatAddressSpace()) {
 845     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
 846   }
 847
 848   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
 849     .legalFor({{S32, LocalPtr}});
 850
 851   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
 852     .lower();
 853
 854   // TODO: Pointer types, any 32-bit or 64-bit vector
 855   getActionDefinitionsBuilder(G_SELECT)
 856     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
 857           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
 858           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
 859     .clampScalar(0, S16, S64)
 860     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 861     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
 862     .scalarize(1)
 863     .clampMaxNumElements(0, S32, 2)
 864     .clampMaxNumElements(0, LocalPtr, 2)
 865     .clampMaxNumElements(0, PrivatePtr, 2)
 866     .scalarize(0)
 867     .widenScalarToNextPow2(0)
 868     .legalIf(all(isPointer(0), typeIs(1, S1)));
 869
 870   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
 871   // be more flexible with the shift amount type.
 872   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
 873     .legalFor({{S32, S32}, {S64, S32}});
 874   if (ST.has16BitInsts()) {
 875     if (ST.hasVOP3PInsts()) {
 876       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
 877             .clampMaxNumElements(0, S16, 2);
 878     } else
 879       Shifts.legalFor({{S16, S32}, {S16, S16}});
 880
 881     Shifts.clampScalar(1, S16, S32);
 882     Shifts.clampScalar(0, S16, S64);
 883     Shifts.widenScalarToNextPow2(0, 16);
 884   } else {
 885     // Make sure we legalize the shift amount type first, as the general
 886     // expansion for the shifted type will produce much worse code if it hasn't
 887     // been truncated already.
 888     Shifts.clampScalar(1, S32, S32);
 889     Shifts.clampScalar(0, S32, S64);
 890     Shifts.widenScalarToNextPow2(0, 32);
 891   }
 892   Shifts.scalarize(0);
 893
 894   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
 895     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
 896     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
 897     unsigned IdxTypeIdx = 2;
 898
 899     getActionDefinitionsBuilder(Op)
 900       .customIf([=](const LegalityQuery &Query) {
 901           const LLT EltTy = Query.Types[EltTypeIdx];
 902           const LLT VecTy = Query.Types[VecTypeIdx];
 903           const LLT IdxTy = Query.Types[IdxTypeIdx];
 904           return (EltTy.getSizeInBits() == 16 ||
 905                   EltTy.getSizeInBits() % 32 == 0) &&
 906                  VecTy.getSizeInBits() % 32 == 0 &&
 907                  VecTy.getSizeInBits() <= 1024 &&
 908                  IdxTy.getSizeInBits() == 32;
 909         })
 910       .clampScalar(EltTypeIdx, S32, S64)
 911       .clampScalar(VecTypeIdx, S32, S64)
 912       .clampScalar(IdxTypeIdx, S32, S32);
 913   }
 914
 915   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
 916     .unsupportedIf([=](const LegalityQuery &Query) {
 917         const LLT &EltTy = Query.Types[1].getElementType();
 918         return Query.Types[0] != EltTy;
 919       });
 920
 921   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
 922     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
 923     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
 924
 925     // FIXME: Doesn't handle extract of illegal sizes.
 926     getActionDefinitionsBuilder(Op)
 927       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
 928       // FIXME: Multiples of 16 should not be legal.
 929       .legalIf([=](const LegalityQuery &Query) {
 930           const LLT BigTy = Query.Types[BigTyIdx];
 931           const LLT LitTy = Query.Types[LitTyIdx];
 932           return (BigTy.getSizeInBits() % 32 == 0) &&
 933                  (LitTy.getSizeInBits() % 16 == 0);
 934         })
 935       .widenScalarIf(
 936         [=](const LegalityQuery &Query) {
 937           const LLT BigTy = Query.Types[BigTyIdx];
 938           return (BigTy.getScalarSizeInBits() < 16);
 939         },
 940         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
 941       .widenScalarIf(
 942         [=](const LegalityQuery &Query) {
 943           const LLT LitTy = Query.Types[LitTyIdx];
 944           return (LitTy.getScalarSizeInBits() < 16);
 945         },
 946         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
 947       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
 948       .widenScalarToNextPow2(BigTyIdx, 32);
 949
 950   }
 951
 952   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
 953     .legalForCartesianProduct(AllS32Vectors, {S32})
 954     .legalForCartesianProduct(AllS64Vectors, {S64})
 955     .clampNumElements(0, V16S32, V32S32)
 956     .clampNumElements(0, V2S64, V16S64)
 957     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
 958
 959   if (ST.hasScalarPackInsts())
 960     BuildVector.legalFor({V2S16, S32});
 961
 962   BuildVector
 963     .minScalarSameAs(1, 0)
 964     .legalIf(isRegisterType(0))
 965     .minScalarOrElt(0, S32);
 966
 967   if (ST.hasScalarPackInsts()) {
 968     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
 969       .legalFor({V2S16, S32})
 970       .lower();
 971   } else {
 972     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
 973       .lower();
 974   }
 975
 976   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
 977     .legalIf(isRegisterType(0));
 978
 979   // TODO: Don't fully scalarize v2s16 pieces
 980   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
 981
 982   // Merge/Unmerge
 983   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
 984     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
 985     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
 986
 987     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
 988       const LLT &Ty = Query.Types[TypeIdx];
 989       if (Ty.isVector()) {
 990         const LLT &EltTy = Ty.getElementType();
 991         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
 992           return true;
 993         if (!isPowerOf2_32(EltTy.getSizeInBits()))
 994           return true;
 995       }
 996       return false;
 997     };
 998
 999     auto &Builder = getActionDefinitionsBuilder(Op)
1000       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1001       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1002       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1003       // valid.
1004       .clampScalar(LitTyIdx, S16, S256)
1005       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1006       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1007       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1008                            elementTypeIs(1, S16)),
1009                        changeTo(1, V2S16))
1010       // Break up vectors with weird elements into scalars
1011       .fewerElementsIf(
1012         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1013         scalarize(0))
1014       .fewerElementsIf(
1015         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1016         scalarize(1))
1017       .clampScalar(BigTyIdx, S32, S1024)
1018       .lowerFor({{S16, V2S16}});
1019
1020     if (Op == G_MERGE_VALUES) {
1021       Builder.widenScalarIf(
1022         // TODO: Use 16-bit shifts if legal for 8-bit values?
1023         [=](const LegalityQuery &Query) {
1024           const LLT Ty = Query.Types[LitTyIdx];
1025           return Ty.getSizeInBits() < 32;
1026         },
1027         changeTo(LitTyIdx, S32));
1028     }
1029
1030     Builder.widenScalarIf(
1031       [=](const LegalityQuery &Query) {
1032         const LLT Ty = Query.Types[BigTyIdx];
1033         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1034           Ty.getSizeInBits() % 16 != 0;
1035       },
1036       [=](const LegalityQuery &Query) {
1037         // Pick the next power of 2, or a multiple of 64 over 128.
1038         // Whichever is smaller.
1039         const LLT &Ty = Query.Types[BigTyIdx];
1040         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1041         if (NewSizeInBits >= 256) {
1042           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1043           if (RoundedTo < NewSizeInBits)
1044             NewSizeInBits = RoundedTo;
1045         }
1046         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1047       })
1048       .legalIf([=](const LegalityQuery &Query) {
1049           const LLT &BigTy = Query.Types[BigTyIdx];
1050           const LLT &LitTy = Query.Types[LitTyIdx];
1051
1052           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1053             return false;
1054           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1055             return false;
1056
1057           return BigTy.getSizeInBits() % 16 == 0 &&
1058                  LitTy.getSizeInBits() % 16 == 0 &&
1059                  BigTy.getSizeInBits() <= 1024;
1060         })
1061       // Any vectors left are the wrong size. Scalarize them.
1062       .scalarize(0)
1063       .scalarize(1);
1064   }
1065
1066   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1067
1068   computeTables();
1069   verify(*ST.getInstrInfo());
1070 }
1071
1072 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1073                                          MachineRegisterInfo &MRI,
1074                                          MachineIRBuilder &B,
1075                                          GISelChangeObserver &Observer) const {
1076   switch (MI.getOpcode()) {
1077   case TargetOpcode::G_ADDRSPACE_CAST:
1078     return legalizeAddrSpaceCast(MI, MRI, B);
1079   case TargetOpcode::G_FRINT:
1080     return legalizeFrint(MI, MRI, B);
1081   case TargetOpcode::G_FCEIL:
1082     return legalizeFceil(MI, MRI, B);
1083   case TargetOpcode::G_INTRINSIC_TRUNC:
1084     return legalizeIntrinsicTrunc(MI, MRI, B);
1085   case TargetOpcode::G_SITOFP:
1086     return legalizeITOFP(MI, MRI, B, true);
1087   case TargetOpcode::G_UITOFP:
1088     return legalizeITOFP(MI, MRI, B, false);
1089   case TargetOpcode::G_FMINNUM:
1090   case TargetOpcode::G_FMAXNUM:
1091   case TargetOpcode::G_FMINNUM_IEEE:
1092   case TargetOpcode::G_FMAXNUM_IEEE:
1093     return legalizeMinNumMaxNum(MI, MRI, B);
1094   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1095     return legalizeExtractVectorElt(MI, MRI, B);
1096   case TargetOpcode::G_INSERT_VECTOR_ELT:
1097     return legalizeInsertVectorElt(MI, MRI, B);
1098   case TargetOpcode::G_FSIN:
1099   case TargetOpcode::G_FCOS:
1100     return legalizeSinCos(MI, MRI, B);
1101   case TargetOpcode::G_GLOBAL_VALUE:
1102     return legalizeGlobalValue(MI, MRI, B);
1103   case TargetOpcode::G_LOAD:
1104     return legalizeLoad(MI, MRI, B, Observer);
1105   case TargetOpcode::G_FMAD:
1106     return legalizeFMad(MI, MRI, B);
1107   default:
1108     return false;
1109   }
1110
1111   llvm_unreachable("expected switch to return");
1112 }
1113
1114 Register AMDGPULegalizerInfo::getSegmentAperture(
1115   unsigned AS,
1116   MachineRegisterInfo &MRI,
1117   MachineIRBuilder &B) const {
1118   MachineFunction &MF = B.getMF();
1119   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1120   const LLT S32 = LLT::scalar(32);
1121
1122   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1123
1124   if (ST.hasApertureRegs()) {
1125     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1126     // getreg.
1127     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1128         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1129         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1130     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1131         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1132         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1133     unsigned Encoding =
1134         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1135         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1136         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1137
1138     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1139     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1140
1141     B.buildInstr(AMDGPU::S_GETREG_B32)
1142       .addDef(GetReg)
1143       .addImm(Encoding);
1144     MRI.setType(GetReg, S32);
1145
1146     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1147     B.buildInstr(TargetOpcode::G_SHL)
1148       .addDef(ApertureReg)
1149       .addUse(GetReg)
1150       .addUse(ShiftAmt.getReg(0));
1151
1152     return ApertureReg;
1153   }
1154
1155   Register QueuePtr = MRI.createGenericVirtualRegister(
1156     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1157
1158   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1159   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1160     return Register();
1161
1162   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1163   // private_segment_aperture_base_hi.
1164   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1165
1166   // FIXME: Don't use undef
1167   Value *V = UndefValue::get(PointerType::get(
1168                                Type::getInt8Ty(MF.getFunction().getContext()),
1169                                AMDGPUAS::CONSTANT_ADDRESS));
1170
1171   MachinePointerInfo PtrInfo(V, StructOffset);
1172   MachineMemOperand *MMO = MF.getMachineMemOperand(
1173     PtrInfo,
1174     MachineMemOperand::MOLoad |
1175     MachineMemOperand::MODereferenceable |
1176     MachineMemOperand::MOInvariant,
1177     4,
1178     MinAlign(64, StructOffset));
1179
1180   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1181   Register LoadAddr;
1182
1183   B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1184   B.buildLoad(LoadResult, LoadAddr, *MMO);
1185   return LoadResult;
1186 }
1187
1188 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1189   MachineInstr &MI, MachineRegisterInfo &MRI,
1190   MachineIRBuilder &B) const {
1191   MachineFunction &MF = B.getMF();
1192
1193   B.setInstr(MI);
1194
1195   const LLT S32 = LLT::scalar(32);
1196   Register Dst = MI.getOperand(0).getReg();
1197   Register Src = MI.getOperand(1).getReg();
1198
1199   LLT DstTy = MRI.getType(Dst);
1200   LLT SrcTy = MRI.getType(Src);
1201   unsigned DestAS = DstTy.getAddressSpace();
1202   unsigned SrcAS = SrcTy.getAddressSpace();
1203
1204   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1205   // vector element.
1206   assert(!DstTy.isVector());
1207
1208   const AMDGPUTargetMachine &TM
1209     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1210
1211   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1212   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1213     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1214     return true;
1215   }
1216
1217   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1218     // Truncate.
1219     B.buildExtract(Dst, Src, 0);
1220     MI.eraseFromParent();
1221     return true;
1222   }
1223
1224   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1225     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1226     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1227
1228     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1229     // another. Merge operands are required to be the same type, but creating an
1230     // extra ptrtoint would be kind of pointless.
1231     auto HighAddr = B.buildConstant(
1232       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1233     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1234     MI.eraseFromParent();
1235     return true;
1236   }
1237
1238   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1239     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1240            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1241     unsigned NullVal = TM.getNullPointerValue(DestAS);
1242
1243     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1244     auto FlatNull = B.buildConstant(SrcTy, 0);
1245
1246     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1247
1248     // Extract low 32-bits of the pointer.
1249     B.buildExtract(PtrLo32, Src, 0);
1250
1251     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1252     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1253     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1254
1255     MI.eraseFromParent();
1256     return true;
1257   }
1258
1259   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1260     return false;
1261
1262   if (!ST.hasFlatAddressSpace())
1263     return false;
1264
1265   auto SegmentNull =
1266       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1267   auto FlatNull =
1268       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1269
1270   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1271   if (!ApertureReg.isValid())
1272     return false;
1273
1274   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1275   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1276
1277   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1278
1279   // Coerce the type of the low half of the result so we can use merge_values.
1280   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1281   B.buildInstr(TargetOpcode::G_PTRTOINT)
1282     .addDef(SrcAsInt)
1283     .addUse(Src);
1284
1285   // TODO: Should we allow mismatched types but matching sizes in merges to
1286   // avoid the ptrtoint?
1287   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1288   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1289
1290   MI.eraseFromParent();
1291   return true;
1292 }
1293
1294 bool AMDGPULegalizerInfo::legalizeFrint(
1295   MachineInstr &MI, MachineRegisterInfo &MRI,
1296   MachineIRBuilder &B) const {
1297   B.setInstr(MI);
1298
1299   Register Src = MI.getOperand(1).getReg();
1300   LLT Ty = MRI.getType(Src);
1301   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1302
1303   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1304   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1305
1306   auto C1 = B.buildFConstant(Ty, C1Val);
1307   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1308
1309   // TODO: Should this propagate fast-math-flags?
1310   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1311   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1312
1313   auto C2 = B.buildFConstant(Ty, C2Val);
1314   auto Fabs = B.buildFAbs(Ty, Src);
1315
1316   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1317   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1318   return true;
1319 }
1320
1321 bool AMDGPULegalizerInfo::legalizeFceil(
1322   MachineInstr &MI, MachineRegisterInfo &MRI,
1323   MachineIRBuilder &B) const {
1324   B.setInstr(MI);
1325
1326   const LLT S1 = LLT::scalar(1);
1327   const LLT S64 = LLT::scalar(64);
1328
1329   Register Src = MI.getOperand(1).getReg();
1330   assert(MRI.getType(Src) == S64);
1331
1332   // result = trunc(src)
1333   // if (src > 0.0 && src != result)
1334   //   result += 1.0
1335
1336   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1337
1338   const auto Zero = B.buildFConstant(S64, 0.0);
1339   const auto One = B.buildFConstant(S64, 1.0);
1340   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1341   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1342   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1343   auto Add = B.buildSelect(S64, And, One, Zero);
1344
1345   // TODO: Should this propagate fast-math-flags?
1346   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1347   return true;
1348 }
1349
1350 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1351                                               MachineIRBuilder &B) {
1352   const unsigned FractBits = 52;
1353   const unsigned ExpBits = 11;
1354   LLT S32 = LLT::scalar(32);
1355
1356   auto Const0 = B.buildConstant(S32, FractBits - 32);
1357   auto Const1 = B.buildConstant(S32, ExpBits);
1358
1359   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1360     .addUse(Const0.getReg(0))
1361     .addUse(Const1.getReg(0));
1362
1363   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1364 }
1365
1366 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1367   MachineInstr &MI, MachineRegisterInfo &MRI,
1368   MachineIRBuilder &B) const {
1369   B.setInstr(MI);
1370
1371   const LLT S1 = LLT::scalar(1);
1372   const LLT S32 = LLT::scalar(32);
1373   const LLT S64 = LLT::scalar(64);
1374
1375   Register Src = MI.getOperand(1).getReg();
1376   assert(MRI.getType(Src) == S64);
1377
1378   // TODO: Should this use extract since the low half is unused?
1379   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1380   Register Hi = Unmerge.getReg(1);
1381
1382   // Extract the upper half, since this is where we will find the sign and
1383   // exponent.
1384   auto Exp = extractF64Exponent(Hi, B);
1385
1386   const unsigned FractBits = 52;
1387
1388   // Extract the sign bit.
1389   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1390   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1391
1392   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1393
1394   const auto Zero32 = B.buildConstant(S32, 0);
1395
1396   // Extend back to 64-bits.
1397   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1398
1399   auto Shr = B.buildAShr(S64, FractMask, Exp);
1400   auto Not = B.buildNot(S64, Shr);
1401   auto Tmp0 = B.buildAnd(S64, Src, Not);
1402   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1403
1404   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1405   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1406
1407   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1408   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1409   return true;
1410 }
1411
1412 bool AMDGPULegalizerInfo::legalizeITOFP(
1413   MachineInstr &MI, MachineRegisterInfo &MRI,
1414   MachineIRBuilder &B, bool Signed) const {
1415   B.setInstr(MI);
1416
1417   Register Dst = MI.getOperand(0).getReg();
1418   Register Src = MI.getOperand(1).getReg();
1419
1420   const LLT S64 = LLT::scalar(64);
1421   const LLT S32 = LLT::scalar(32);
1422
1423   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1424
1425   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1426
1427   auto CvtHi = Signed ?
1428     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1429     B.buildUITOFP(S64, Unmerge.getReg(1));
1430
1431   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1432
1433   auto ThirtyTwo = B.buildConstant(S32, 32);
1434   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1435     .addUse(CvtHi.getReg(0))
1436     .addUse(ThirtyTwo.getReg(0));
1437
1438   // TODO: Should this propagate fast-math-flags?
1439   B.buildFAdd(Dst, LdExp, CvtLo);
1440   MI.eraseFromParent();
1441   return true;
1442 }
1443
1444 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1445   MachineInstr &MI, MachineRegisterInfo &MRI,
1446   MachineIRBuilder &B) const {
1447   MachineFunction &MF = B.getMF();
1448   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1449
1450   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1451                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1452
1453   // With ieee_mode disabled, the instructions have the correct behavior
1454   // already for G_FMINNUM/G_FMAXNUM
1455   if (!MFI->getMode().IEEE)
1456     return !IsIEEEOp;
1457
1458   if (IsIEEEOp)
1459     return true;
1460
1461   MachineIRBuilder HelperBuilder(MI);
1462   GISelObserverWrapper DummyObserver;
1463   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1464   HelperBuilder.setInstr(MI);
1465   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1466 }
1467
1468 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1469   MachineInstr &MI, MachineRegisterInfo &MRI,
1470   MachineIRBuilder &B) const {
1471   // TODO: Should move some of this into LegalizerHelper.
1472
1473   // TODO: Promote dynamic indexing of s16 to s32
1474   // TODO: Dynamic s64 indexing is only legal for SGPR.
1475   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1476   if (!IdxVal) // Dynamic case will be selected to register indexing.
1477     return true;
1478
1479   Register Dst = MI.getOperand(0).getReg();
1480   Register Vec = MI.getOperand(1).getReg();
1481
1482   LLT VecTy = MRI.getType(Vec);
1483   LLT EltTy = VecTy.getElementType();
1484   assert(EltTy == MRI.getType(Dst));
1485
1486   B.setInstr(MI);
1487
1488   if (IdxVal.getValue() < VecTy.getNumElements())
1489     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1490   else
1491     B.buildUndef(Dst);
1492
1493   MI.eraseFromParent();
1494   return true;
1495 }
1496
1497 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1498   MachineInstr &MI, MachineRegisterInfo &MRI,
1499   MachineIRBuilder &B) const {
1500   // TODO: Should move some of this into LegalizerHelper.
1501
1502   // TODO: Promote dynamic indexing of s16 to s32
1503   // TODO: Dynamic s64 indexing is only legal for SGPR.
1504   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1505   if (!IdxVal) // Dynamic case will be selected to register indexing.
1506     return true;
1507
1508   Register Dst = MI.getOperand(0).getReg();
1509   Register Vec = MI.getOperand(1).getReg();
1510   Register Ins = MI.getOperand(2).getReg();
1511
1512   LLT VecTy = MRI.getType(Vec);
1513   LLT EltTy = VecTy.getElementType();
1514   assert(EltTy == MRI.getType(Ins));
1515
1516   B.setInstr(MI);
1517
1518   if (IdxVal.getValue() < VecTy.getNumElements())
1519     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1520   else
1521     B.buildUndef(Dst);
1522
1523   MI.eraseFromParent();
1524   return true;
1525 }
1526
1527 bool AMDGPULegalizerInfo::legalizeSinCos(
1528   MachineInstr &MI, MachineRegisterInfo &MRI,
1529   MachineIRBuilder &B) const {
1530   B.setInstr(MI);
1531
1532   Register DstReg = MI.getOperand(0).getReg();
1533   Register SrcReg = MI.getOperand(1).getReg();
1534   LLT Ty = MRI.getType(DstReg);
1535   unsigned Flags = MI.getFlags();
1536
1537   Register TrigVal;
1538   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1539   if (ST.hasTrigReducedRange()) {
1540     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1541     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1542       .addUse(MulVal.getReg(0))
1543       .setMIFlags(Flags).getReg(0);
1544   } else
1545     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1546
1547   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1548     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1549   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1550     .addUse(TrigVal)
1551     .setMIFlags(Flags);
1552   MI.eraseFromParent();
1553   return true;
1554 }
1555
1556 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1557   Register DstReg, LLT PtrTy,
1558   MachineIRBuilder &B, const GlobalValue *GV,
1559   unsigned Offset, unsigned GAFlags) const {
1560   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1561   // to the following code sequence:
1562   //
1563   // For constant address space:
1564   //   s_getpc_b64 s[0:1]
1565   //   s_add_u32 s0, s0, $symbol
1566   //   s_addc_u32 s1, s1, 0
1567   //
1568   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1569   //   a fixup or relocation is emitted to replace $symbol with a literal
1570   //   constant, which is a pc-relative offset from the encoding of the $symbol
1571   //   operand to the global variable.
1572   //
1573   // For global address space:
1574   //   s_getpc_b64 s[0:1]
1575   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1576   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1577   //
1578   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1579   //   fixups or relocations are emitted to replace $symbol@*@lo and
1580   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1581   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1582   //   operand to the global variable.
1583   //
1584   // What we want here is an offset from the value returned by s_getpc
1585   // (which is the address of the s_add_u32 instruction) to the global
1586   // variable, but since the encoding of $symbol starts 4 bytes after the start
1587   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1588   // small. This requires us to add 4 to the global variable offset in order to
1589   // compute the correct address.
1590
1591   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1592
1593   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1594     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1595
1596   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1597     .addDef(PCReg);
1598
1599   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1600   if (GAFlags == SIInstrInfo::MO_NONE)
1601     MIB.addImm(0);
1602   else
1603     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1604
1605   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1606
1607   if (PtrTy.getSizeInBits() == 32)
1608     B.buildExtract(DstReg, PCReg, 0);
1609   return true;
1610  }
1611
1612 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1613   MachineInstr &MI, MachineRegisterInfo &MRI,
1614   MachineIRBuilder &B) const {
1615   Register DstReg = MI.getOperand(0).getReg();
1616   LLT Ty = MRI.getType(DstReg);
1617   unsigned AS = Ty.getAddressSpace();
1618
1619   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1620   MachineFunction &MF = B.getMF();
1621   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1622   B.setInstr(MI);
1623
1624   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1625     if (!MFI->isEntryFunction()) {
1626       const Function &Fn = MF.getFunction();
1627       DiagnosticInfoUnsupported BadLDSDecl(
1628         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1629       Fn.getContext().diagnose(BadLDSDecl);
1630     }
1631
1632     // TODO: We could emit code to handle the initialization somewhere.
1633     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1634       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1635       MI.eraseFromParent();
1636       return true;
1637     }
1638
1639     const Function &Fn = MF.getFunction();
1640     DiagnosticInfoUnsupported BadInit(
1641       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1642     Fn.getContext().diagnose(BadInit);
1643     return true;
1644   }
1645
1646   const SITargetLowering *TLI = ST.getTargetLowering();
1647
1648   if (TLI->shouldEmitFixup(GV)) {
1649     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1650     MI.eraseFromParent();
1651     return true;
1652   }
1653
1654   if (TLI->shouldEmitPCReloc(GV)) {
1655     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1656     MI.eraseFromParent();
1657     return true;
1658   }
1659
1660   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1661   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1662
1663   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1664     MachinePointerInfo::getGOT(MF),
1665     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1666     MachineMemOperand::MOInvariant,
1667     8 /*Size*/, 8 /*Align*/);
1668
1669   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1670
1671   if (Ty.getSizeInBits() == 32) {
1672     // Truncate if this is a 32-bit constant adrdess.
1673     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1674     B.buildExtract(DstReg, Load, 0);
1675   } else
1676     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1677
1678   MI.eraseFromParent();
1679   return true;
1680 }
1681
1682 bool AMDGPULegalizerInfo::legalizeLoad(
1683   MachineInstr &MI, MachineRegisterInfo &MRI,
1684   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1685   B.setInstr(MI);
1686   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1687   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1688   Observer.changingInstr(MI);
1689   MI.getOperand(1).setReg(Cast.getReg(0));
1690   Observer.changedInstr(MI);
1691   return true;
1692 }
1693
1694 bool AMDGPULegalizerInfo::legalizeFMad(
1695   MachineInstr &MI, MachineRegisterInfo &MRI,
1696   MachineIRBuilder &B) const {
1697   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1698   assert(Ty.isScalar());
1699
1700   // TODO: Always legal with future ftz flag.
1701   if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1702     return true;
1703   if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1704     return true;
1705
1706   MachineFunction &MF = B.getMF();
1707
1708   MachineIRBuilder HelperBuilder(MI);
1709   GISelObserverWrapper DummyObserver;
1710   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1711   HelperBuilder.setMBB(*MI.getParent());
1712   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1713 }
1714
1715 // Return the use branch instruction, otherwise null if the usage is invalid.
1716 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1717                                        MachineRegisterInfo &MRI) {
1718   Register CondDef = MI.getOperand(0).getReg();
1719   if (!MRI.hasOneNonDBGUse(CondDef))
1720     return nullptr;
1721
1722   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1723   return UseMI.getParent() == MI.getParent() &&
1724     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1725 }
1726
1727 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1728                                                 Register Reg, LLT Ty) const {
1729   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1730   if (LiveIn)
1731     return LiveIn;
1732
1733   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1734   MRI.addLiveIn(Reg, NewReg);
1735   return NewReg;
1736 }
1737
1738 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1739                                          const ArgDescriptor *Arg) const {
1740   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1741     return false; // TODO: Handle these
1742
1743   assert(Arg->getRegister().isPhysical());
1744
1745   MachineRegisterInfo &MRI = *B.getMRI();
1746
1747   LLT Ty = MRI.getType(DstReg);
1748   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1749
1750   if (Arg->isMasked()) {
1751     // TODO: Should we try to emit this once in the entry block?
1752     const LLT S32 = LLT::scalar(32);
1753     const unsigned Mask = Arg->getMask();
1754     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1755
1756     Register AndMaskSrc = LiveIn;
1757
1758     if (Shift != 0) {
1759       auto ShiftAmt = B.buildConstant(S32, Shift);
1760       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1761     }
1762
1763     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1764   } else
1765     B.buildCopy(DstReg, LiveIn);
1766
1767   // Insert the argument copy if it doens't already exist.
1768   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1769   if (!MRI.getVRegDef(LiveIn)) {
1770     // FIXME: Should have scoped insert pt
1771     MachineBasicBlock &OrigInsBB = B.getMBB();
1772     auto OrigInsPt = B.getInsertPt();
1773
1774     MachineBasicBlock &EntryMBB = B.getMF().front();
1775     EntryMBB.addLiveIn(Arg->getRegister());
1776     B.setInsertPt(EntryMBB, EntryMBB.begin());
1777     B.buildCopy(LiveIn, Arg->getRegister());
1778
1779     B.setInsertPt(OrigInsBB, OrigInsPt);
1780   }
1781
1782   return true;
1783 }
1784
1785 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1786   MachineInstr &MI,
1787   MachineRegisterInfo &MRI,
1788   MachineIRBuilder &B,
1789   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1790   B.setInstr(MI);
1791
1792   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1793
1794   const ArgDescriptor *Arg;
1795   const TargetRegisterClass *RC;
1796   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1797   if (!Arg) {
1798     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1799     return false;
1800   }
1801
1802   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1803     MI.eraseFromParent();
1804     return true;
1805   }
1806
1807   return false;
1808 }
1809
1810 bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
1811                                            MachineRegisterInfo &MRI,
1812                                            MachineIRBuilder &B) const {
1813   B.setInstr(MI);
1814   Register Res = MI.getOperand(0).getReg();
1815   Register LHS = MI.getOperand(2).getReg();
1816   Register RHS = MI.getOperand(3).getReg();
1817   uint16_t Flags = MI.getFlags();
1818
1819   LLT S32 = LLT::scalar(32);
1820   LLT S1 = LLT::scalar(1);
1821
1822   auto Abs = B.buildFAbs(S32, RHS, Flags);
1823   const APFloat C0Val(1.0f);
1824
1825   auto C0 = B.buildConstant(S32, 0x6f800000);
1826   auto C1 = B.buildConstant(S32, 0x2f800000);
1827   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1828
1829   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1830   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1831
1832   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1833
1834   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1835     .addUse(Mul0.getReg(0))
1836     .setMIFlags(Flags);
1837
1838   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1839
1840   B.buildFMul(Res, Sel, Mul1, Flags);
1841
1842   MI.eraseFromParent();
1843   return true;
1844 }
1845
1846 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1847                                                  MachineRegisterInfo &MRI,
1848                                                  MachineIRBuilder &B) const {
1849   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1850   if (!MFI->isEntryFunction()) {
1851     return legalizePreloadedArgIntrin(MI, MRI, B,
1852                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1853   }
1854
1855   B.setInstr(MI);
1856
1857   uint64_t Offset =
1858     ST.getTargetLowering()->getImplicitParameterOffset(
1859       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1860   Register DstReg = MI.getOperand(0).getReg();
1861   LLT DstTy = MRI.getType(DstReg);
1862   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1863
1864   const ArgDescriptor *Arg;
1865   const TargetRegisterClass *RC;
1866   std::tie(Arg, RC)
1867     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1868   if (!Arg)
1869     return false;
1870
1871   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1872   if (!loadInputValue(KernargPtrReg, B, Arg))
1873     return false;
1874
1875   B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1876   MI.eraseFromParent();
1877   return true;
1878 }
1879
1880 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
1881                                               MachineRegisterInfo &MRI,
1882                                               MachineIRBuilder &B,
1883                                               unsigned AddrSpace) const {
1884   B.setInstr(MI);
1885   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
1886   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
1887   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
1888   MI.eraseFromParent();
1889   return true;
1890 }
1891
1892 /// Handle register layout difference for f16 images for some subtargets.
1893 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
1894                                              MachineRegisterInfo &MRI,
1895                                              Register Reg) const {
1896   if (!ST.hasUnpackedD16VMem())
1897     return Reg;
1898
1899   const LLT S16 = LLT::scalar(16);
1900   const LLT S32 = LLT::scalar(32);
1901   LLT StoreVT = MRI.getType(Reg);
1902   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
1903
1904   auto Unmerge = B.buildUnmerge(S16, Reg);
1905
1906   SmallVector<Register, 4> WideRegs;
1907   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1908     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
1909
1910   int NumElts = StoreVT.getNumElements();
1911
1912   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1913 }
1914
1915 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
1916                                                  MachineRegisterInfo &MRI,
1917                                                  MachineIRBuilder &B,
1918                                                  bool IsFormat) const {
1919   // TODO: Reject f16 format on targets where unsupported.
1920   Register VData = MI.getOperand(1).getReg();
1921   LLT Ty = MRI.getType(VData);
1922
1923   B.setInstr(MI);
1924
1925   const LLT S32 = LLT::scalar(32);
1926   const LLT S16 = LLT::scalar(16);
1927
1928   // Fixup illegal register types for i8 stores.
1929   if (Ty == LLT::scalar(8) || Ty == S16) {
1930     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
1931     MI.getOperand(1).setReg(AnyExt);
1932     return true;
1933   }
1934
1935   if (Ty.isVector()) {
1936     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
1937       if (IsFormat)
1938         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
1939       return true;
1940     }
1941
1942     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
1943   }
1944
1945   return Ty == S32;
1946 }
1947
1948 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1949                                             MachineRegisterInfo &MRI,
1950                                             MachineIRBuilder &B) const {
1951   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
1952   switch (MI.getIntrinsicID()) {
1953   case Intrinsic::amdgcn_if: {
1954     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1955       const SIRegisterInfo *TRI
1956         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1957
1958       B.setInstr(*BrCond);
1959       Register Def = MI.getOperand(1).getReg();
1960       Register Use = MI.getOperand(3).getReg();
1961       B.buildInstr(AMDGPU::SI_IF)
1962         .addDef(Def)
1963         .addUse(Use)
1964         .addMBB(BrCond->getOperand(1).getMBB());
1965
1966       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1967       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1968       MI.eraseFromParent();
1969       BrCond->eraseFromParent();
1970       return true;
1971     }
1972
1973     return false;
1974   }
1975   case Intrinsic::amdgcn_loop: {
1976     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1977       const SIRegisterInfo *TRI
1978         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1979
1980       B.setInstr(*BrCond);
1981       Register Reg = MI.getOperand(2).getReg();
1982       B.buildInstr(AMDGPU::SI_LOOP)
1983         .addUse(Reg)
1984         .addMBB(BrCond->getOperand(1).getMBB());
1985       MI.eraseFromParent();
1986       BrCond->eraseFromParent();
1987       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1988       return true;
1989     }
1990
1991     return false;
1992   }
1993   case Intrinsic::amdgcn_kernarg_segment_ptr:
1994     return legalizePreloadedArgIntrin(
1995       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1996   case Intrinsic::amdgcn_implicitarg_ptr:
1997     return legalizeImplicitArgPtr(MI, MRI, B);
1998   case Intrinsic::amdgcn_workitem_id_x:
1999     return legalizePreloadedArgIntrin(MI, MRI, B,
2000                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2001   case Intrinsic::amdgcn_workitem_id_y:
2002     return legalizePreloadedArgIntrin(MI, MRI, B,
2003                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2004   case Intrinsic::amdgcn_workitem_id_z:
2005     return legalizePreloadedArgIntrin(MI, MRI, B,
2006                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2007   case Intrinsic::amdgcn_workgroup_id_x:
2008     return legalizePreloadedArgIntrin(MI, MRI, B,
2009                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2010   case Intrinsic::amdgcn_workgroup_id_y:
2011     return legalizePreloadedArgIntrin(MI, MRI, B,
2012                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2013   case Intrinsic::amdgcn_workgroup_id_z:
2014     return legalizePreloadedArgIntrin(MI, MRI, B,
2015                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2016   case Intrinsic::amdgcn_dispatch_ptr:
2017     return legalizePreloadedArgIntrin(MI, MRI, B,
2018                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2019   case Intrinsic::amdgcn_queue_ptr:
2020     return legalizePreloadedArgIntrin(MI, MRI, B,
2021                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2022   case Intrinsic::amdgcn_implicit_buffer_ptr:
2023     return legalizePreloadedArgIntrin(
2024       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2025   case Intrinsic::amdgcn_dispatch_id:
2026     return legalizePreloadedArgIntrin(MI, MRI, B,
2027                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2028   case Intrinsic::amdgcn_fdiv_fast:
2029     return legalizeFDIVFast(MI, MRI, B);
2030   case Intrinsic::amdgcn_is_shared:
2031     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2032   case Intrinsic::amdgcn_is_private:
2033     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2034   case Intrinsic::amdgcn_wavefrontsize: {
2035     B.setInstr(MI);
2036     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2037     MI.eraseFromParent();
2038     return true;
2039   }
2040   case Intrinsic::amdgcn_raw_buffer_store:
2041     return legalizeRawBufferStore(MI, MRI, B, false);
2042   case Intrinsic::amdgcn_raw_buffer_store_format:
2043     return legalizeRawBufferStore(MI, MRI, B, true);
2044   default:
2045     return true;
2046   }
2047
2048   return true;
2049 }