//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}
static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}
// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}
static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };
  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;
  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };
  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);
  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);
  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);
  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));
  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);
  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);
  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));
  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
  }
  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }
  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);
  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);
  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);
  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);
  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }
  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);
  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));
  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);
  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }
  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };
  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });
  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });
  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();
  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf([=](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               !ST.hasDwordx3LoadStores();
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;
        case 96:
          return ST.hasDwordx3LoadStores();
        case 256:
        case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
          // TODO: Add support for other vector types, this will require
          // defining more value mappings for the new types.
          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                    Ty0.getScalarType().getSizeInBits() == 64);
        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);
  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({
        {S32, GlobalPtr, 8, 8},
        {S32, GlobalPtr, 16, 8},
        {S32, LocalPtr, 8, 8},
        {S32, LocalPtr, 16, 8},
        {S32, PrivatePtr, 8, 8},
        {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();
  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));
  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }
  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }

      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .lowerFor({{S16, V2S16}})
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }
  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, MIRBuilder, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    MIRBuilder.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = MIRBuilder.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    MIRBuilder.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
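
  // Rounding trick: adding 2^52 (with Src's sign) pushes the fractional bits
  // out of a double's 53-bit significand, and subtracting it back leaves the
  // value rounded to an integer. Inputs with |x| > C2Val (the largest double
  // below 2^52) are already integral, so the original value is selected for
  // them below.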
  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}
bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}
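
// In an IEEE-754 double, the exponent occupies bits [62:52] of the value
// (bits [30:20] of the high 32-bit word) with a bias of 1023, and the low 52
// bits hold the fraction. The helper below extracts the biased exponent from
// the high word with amdgcn.ubfe and subtracts the bias.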
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
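
  // Convert the two 32-bit halves separately: the high half is converted
  // (signed or unsigned as requested) and scaled by 2^32 with amdgcn.ldexp,
  // then the low half, which is always unsigned, is added in.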
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM.
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();
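
  // The hardware sin/cos expect the argument pre-scaled by 1/(2*pi), i.e. in
  // units of full turns, so multiply first; subtargets with a reduced trig
  // input range additionally take the fractional part via amdgcn.fract.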
  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}
// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}
Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}
bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);
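
  // Scale the denominator before taking its reciprocal: 0x6f800000 is 2^96
  // and 0x2f800000 is 2^-32 as 32-bit floats. A very large denominator is
  // scaled down by 2^-32 so rcp does not flush to zero, and the quotient is
  // multiplied by the same factor at the end to compensate.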
  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}
&MI
,
1406 MachineRegisterInfo
&MRI
,
1407 MachineIRBuilder
&B
) const {
1408 const SIMachineFunctionInfo
*MFI
= B
.getMF().getInfo
<SIMachineFunctionInfo
>();
1409 if (!MFI
->isEntryFunction()) {
1410 return legalizePreloadedArgIntrin(MI
, MRI
, B
,
1411 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
);
1417 ST
.getTargetLowering()->getImplicitParameterOffset(
1418 B
.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT
);
1419 Register DstReg
= MI
.getOperand(0).getReg();
1420 LLT DstTy
= MRI
.getType(DstReg
);
1421 LLT IdxTy
= LLT::scalar(DstTy
.getSizeInBits());
1423 const ArgDescriptor
*Arg
;
1424 const TargetRegisterClass
*RC
;
1426 = MFI
->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
);
1430 Register KernargPtrReg
= MRI
.createGenericVirtualRegister(DstTy
);
1431 if (!loadInputValue(KernargPtrReg
, B
, Arg
))
1434 B
.buildGEP(DstReg
, KernargPtrReg
, B
.buildConstant(IdxTy
, Offset
).getReg(0));
1435 MI
.eraseFromParent();
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);