lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

   1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 /// \file
   9 /// This file implements the targeting of the Machinelegalizer class for
  10 /// AMDGPU.
  11 /// \todo This should be generated by TableGen.
  12 //===----------------------------------------------------------------------===//
  13
  14 #if defined(_MSC_VER) || defined(__MINGW32__)
  15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
  16 // from the Visual C++ cmath / math.h headers:
  17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
  18 #define _USE_MATH_DEFINES
  19 #endif
  20
  21 #include "AMDGPU.h"
  22 #include "AMDGPULegalizerInfo.h"
  23 #include "AMDGPUTargetMachine.h"
  24 #include "SIMachineFunctionInfo.h"
  25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
  26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
  27 #include "llvm/CodeGen/TargetOpcodes.h"
  28 #include "llvm/CodeGen/ValueTypes.h"
  29 #include "llvm/IR/DerivedTypes.h"
  30 #include "llvm/IR/DiagnosticInfo.h"
  31 #include "llvm/IR/Type.h"
  32 #include "llvm/Support/Debug.h"
  33
  34 #define DEBUG_TYPE "amdgpu-legalinfo"
  35
  36 using namespace llvm;
  37 using namespace LegalizeActions;
  38 using namespace LegalizeMutations;
  39 using namespace LegalityPredicates;
  40
  41
  42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
  43                                       unsigned MaxSize = 512) {
  44   return [=](const LegalityQuery &Query) {
  45     const LLT Ty = Query.Types[TypeIdx];
  46     const LLT EltTy = Ty.getScalarType();
  47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  48   };
  49 }
  50
  51 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  52   return [=](const LegalityQuery &Query) {
  53     const LLT Ty = Query.Types[TypeIdx];
  54     return Ty.isVector() &&
  55            Ty.getNumElements() % 2 != 0 &&
  56            Ty.getElementType().getSizeInBits() < 32;
  57   };
  58 }
  59
  60 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  61   return [=](const LegalityQuery &Query) {
  62     const LLT Ty = Query.Types[TypeIdx];
  63     const LLT EltTy = Ty.getElementType();
  64     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  65   };
  66 }
  67
  68 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  69   return [=](const LegalityQuery &Query) {
  70     const LLT Ty = Query.Types[TypeIdx];
  71     const LLT EltTy = Ty.getElementType();
  72     unsigned Size = Ty.getSizeInBits();
  73     unsigned Pieces = (Size + 63) / 64;
  74     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
  75     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  76   };
  77 }
  78
  79 // Increase the number of vector elements to reach the next multiple of 32-bit
  80 // type.
  81 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  82   return [=](const LegalityQuery &Query) {
  83     const LLT Ty = Query.Types[TypeIdx];
  84
  85     const LLT EltTy = Ty.getElementType();
  86     const int Size = Ty.getSizeInBits();
  87     const int EltSize = EltTy.getSizeInBits();
  88     const int NextMul32 = (Size + 31) / 32;
  89
  90     assert(EltSize < 32);
  91
  92     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
  93     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  94   };
  95 }
  96
  97 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  98   return [=](const LegalityQuery &Query) {
  99     const LLT QueryTy = Query.Types[TypeIdx];
 100     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
 101   };
 102 }
 103
 104 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
 105   return [=](const LegalityQuery &Query) {
 106     const LLT QueryTy = Query.Types[TypeIdx];
 107     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
 108   };
 109 }
 110
 111 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
 112   return [=](const LegalityQuery &Query) {
 113     const LLT QueryTy = Query.Types[TypeIdx];
 114     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
 115   };
 116 }
 117
 118 // Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
 119 // v2s16.
 120 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
 121   return [=](const LegalityQuery &Query) {
 122     const LLT Ty = Query.Types[TypeIdx];
 123     if (Ty.isVector()) {
 124       const int EltSize = Ty.getElementType().getSizeInBits();
 125       return EltSize == 32 || EltSize == 64 ||
 126             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
 127              EltSize == 128 || EltSize == 256;
 128     }
 129
 130     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
 131   };
 132 }
 133
 134 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
 135   return [=](const LegalityQuery &Query) {
 136     return Query.Types[TypeIdx].getElementType() == Type;
 137   };
 138 }
 139
 140 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
 141   return [=](const LegalityQuery &Query) {
 142     const LLT Ty = Query.Types[TypeIdx];
 143     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
 144            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
 145   };
 146 }
 147
 148 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
 149                                          const GCNTargetMachine &TM)
 150   :  ST(ST_) {
 151   using namespace TargetOpcode;
 152
 153   auto GetAddrSpacePtr = [&TM](unsigned AS) {
 154     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
 155   };
 156
 157   const LLT S1 = LLT::scalar(1);
 158   const LLT S8 = LLT::scalar(8);
 159   const LLT S16 = LLT::scalar(16);
 160   const LLT S32 = LLT::scalar(32);
 161   const LLT S64 = LLT::scalar(64);
 162   const LLT S96 = LLT::scalar(96);
 163   const LLT S128 = LLT::scalar(128);
 164   const LLT S256 = LLT::scalar(256);
 165   const LLT S512 = LLT::scalar(512);
 166
 167   const LLT V2S16 = LLT::vector(2, 16);
 168   const LLT V4S16 = LLT::vector(4, 16);
 169
 170   const LLT V2S32 = LLT::vector(2, 32);
 171   const LLT V3S32 = LLT::vector(3, 32);
 172   const LLT V4S32 = LLT::vector(4, 32);
 173   const LLT V5S32 = LLT::vector(5, 32);
 174   const LLT V6S32 = LLT::vector(6, 32);
 175   const LLT V7S32 = LLT::vector(7, 32);
 176   const LLT V8S32 = LLT::vector(8, 32);
 177   const LLT V9S32 = LLT::vector(9, 32);
 178   const LLT V10S32 = LLT::vector(10, 32);
 179   const LLT V11S32 = LLT::vector(11, 32);
 180   const LLT V12S32 = LLT::vector(12, 32);
 181   const LLT V13S32 = LLT::vector(13, 32);
 182   const LLT V14S32 = LLT::vector(14, 32);
 183   const LLT V15S32 = LLT::vector(15, 32);
 184   const LLT V16S32 = LLT::vector(16, 32);
 185
 186   const LLT V2S64 = LLT::vector(2, 64);
 187   const LLT V3S64 = LLT::vector(3, 64);
 188   const LLT V4S64 = LLT::vector(4, 64);
 189   const LLT V5S64 = LLT::vector(5, 64);
 190   const LLT V6S64 = LLT::vector(6, 64);
 191   const LLT V7S64 = LLT::vector(7, 64);
 192   const LLT V8S64 = LLT::vector(8, 64);
 193
 194   std::initializer_list<LLT> AllS32Vectors =
 195     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
 196      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
 197   std::initializer_list<LLT> AllS64Vectors =
 198     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
 199
 200   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
 201   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
 202   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
 203   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
 204   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
 205   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
 206   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
 207
 208   const LLT CodePtr = FlatPtr;
 209
 210   const std::initializer_list<LLT> AddrSpaces64 = {
 211     GlobalPtr, ConstantPtr, FlatPtr
 212   };
 213
 214   const std::initializer_list<LLT> AddrSpaces32 = {
 215     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
 216   };
 217
 218   const std::initializer_list<LLT> FPTypesBase = {
 219     S32, S64
 220   };
 221
 222   const std::initializer_list<LLT> FPTypes16 = {
 223     S32, S64, S16
 224   };
 225
 226   const std::initializer_list<LLT> FPTypesPK16 = {
 227     S32, S64, S16, V2S16
 228   };
 229
 230   setAction({G_BRCOND, S1}, Legal);
 231
 232   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
 233   // elements for v3s16
 234   getActionDefinitionsBuilder(G_PHI)
 235     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
 236     .legalFor(AllS32Vectors)
 237     .legalFor(AllS64Vectors)
 238     .legalFor(AddrSpaces64)
 239     .legalFor(AddrSpaces32)
 240     .clampScalar(0, S32, S256)
 241     .widenScalarToNextPow2(0, 32)
 242     .clampMaxNumElements(0, S32, 16)
 243     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 244     .legalIf(isPointer(0));
 245
 246   if (ST.has16BitInsts()) {
 247     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
 248       .legalFor({S32, S16})
 249       .clampScalar(0, S16, S32)
 250       .scalarize(0);
 251   } else {
 252     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
 253       .legalFor({S32})
 254       .clampScalar(0, S32, S32)
 255       .scalarize(0);
 256   }
 257
 258   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
 259     .legalFor({S32})
 260     .clampScalar(0, S32, S32)
 261     .scalarize(0);
 262
 263   // Report legal for any types we can handle anywhere. For the cases only legal
 264   // on the SALU, RegBankSelect will be able to re-legalize.
 265   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
 266     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
 267     .clampScalar(0, S32, S64)
 268     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 269     .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
 270     .widenScalarToNextPow2(0)
 271     .scalarize(0);
 272
 273   getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
 274                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
 275     .legalFor({{S32, S1}})
 276     .clampScalar(0, S32, S32);
 277
 278   getActionDefinitionsBuilder(G_BITCAST)
 279     .legalForCartesianProduct({S32, V2S16})
 280     .legalForCartesianProduct({S64, V2S32, V4S16})
 281     .legalForCartesianProduct({V2S64, V4S32})
 282     // Don't worry about the size constraint.
 283     .legalIf(all(isPointer(0), isPointer(1)))
 284     // FIXME: Testing hack
 285     .legalForCartesianProduct({S16, LLT::vector(2, 8), });
 286
 287   getActionDefinitionsBuilder(G_FCONSTANT)
 288     .legalFor({S32, S64, S16})
 289     .clampScalar(0, S16, S64);
 290
 291   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
 292     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
 293                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
 294     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 295     .clampScalarOrElt(0, S32, S512)
 296     .legalIf(isMultiple32(0))
 297     .widenScalarToNextPow2(0, 32)
 298     .clampMaxNumElements(0, S32, 16);
 299
 300
 301   // FIXME: i1 operands to intrinsics should always be legal, but other i1
 302   // values may not be legal.  We need to figure out how to distinguish
 303   // between these two scenarios.
 304   getActionDefinitionsBuilder(G_CONSTANT)
 305     .legalFor({S1, S32, S64, S16, GlobalPtr,
 306                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
 307     .clampScalar(0, S32, S64)
 308     .widenScalarToNextPow2(0)
 309     .legalIf(isPointer(0));
 310
 311   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
 312   getActionDefinitionsBuilder(G_GLOBAL_VALUE).customFor({LocalPtr});
 313
 314
 315   auto &FPOpActions = getActionDefinitionsBuilder(
 316     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
 317     .legalFor({S32, S64});
 318   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
 319     .customFor({S32, S64});
 320
 321   if (ST.has16BitInsts()) {
 322     if (ST.hasVOP3PInsts())
 323       FPOpActions.legalFor({S16, V2S16});
 324     else
 325       FPOpActions.legalFor({S16});
 326
 327     TrigActions.customFor({S16});
 328   }
 329
 330   auto &MinNumMaxNum = getActionDefinitionsBuilder({
 331       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
 332
 333   if (ST.hasVOP3PInsts()) {
 334     MinNumMaxNum.customFor(FPTypesPK16)
 335       .clampMaxNumElements(0, S16, 2)
 336       .clampScalar(0, S16, S64)
 337       .scalarize(0);
 338   } else if (ST.has16BitInsts()) {
 339     MinNumMaxNum.customFor(FPTypes16)
 340       .clampScalar(0, S16, S64)
 341       .scalarize(0);
 342   } else {
 343     MinNumMaxNum.customFor(FPTypesBase)
 344       .clampScalar(0, S32, S64)
 345       .scalarize(0);
 346   }
 347
 348   if (ST.hasVOP3PInsts())
 349     FPOpActions.clampMaxNumElements(0, S16, 2);
 350
 351   FPOpActions
 352     .scalarize(0)
 353     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
 354
 355   TrigActions
 356     .scalarize(0)
 357     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
 358
 359   getActionDefinitionsBuilder({G_FNEG, G_FABS})
 360     .legalFor(FPTypesPK16)
 361     .clampMaxNumElements(0, S16, 2)
 362     .scalarize(0)
 363     .clampScalar(0, S16, S64);
 364
 365   // TODO: Implement
 366   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
 367
 368   if (ST.has16BitInsts()) {
 369     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
 370       .legalFor({S32, S64, S16})
 371       .scalarize(0)
 372       .clampScalar(0, S16, S64);
 373   } else {
 374     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
 375       .legalFor({S32, S64})
 376       .scalarize(0)
 377       .clampScalar(0, S32, S64);
 378   }
 379
 380   getActionDefinitionsBuilder(G_FPTRUNC)
 381     .legalFor({{S32, S64}, {S16, S32}})
 382     .scalarize(0);
 383
 384   getActionDefinitionsBuilder(G_FPEXT)
 385     .legalFor({{S64, S32}, {S32, S16}})
 386     .lowerFor({{S64, S16}}) // FIXME: Implement
 387     .scalarize(0);
 388
 389   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
 390   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
 391
 392   getActionDefinitionsBuilder(G_FSUB)
 393       // Use actual fsub instruction
 394       .legalFor({S32})
 395       // Must use fadd + fneg
 396       .lowerFor({S64, S16, V2S16})
 397       .scalarize(0)
 398       .clampScalar(0, S32, S64);
 399
 400   // Whether this is legal depends on the floating point mode for the function.
 401   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
 402   if (ST.hasMadF16())
 403     FMad.customFor({S32, S16});
 404   else
 405     FMad.customFor({S32});
 406   FMad.scalarize(0)
 407       .lower();
 408
 409   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
 410     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
 411                {S32, S1}, {S64, S1}, {S16, S1},
 412                {S96, S32},
 413                // FIXME: Hack
 414                {S64, LLT::scalar(33)},
 415                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
 416     .scalarize(0);
 417
 418   // TODO: Legal for s1->s64, requires split for VALU.
 419   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
 420     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}})
 421     .lowerFor({{S32, S64}})
 422     .customFor({{S64, S64}})
 423     .scalarize(0);
 424
 425   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
 426     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
 427     .scalarize(0);
 428
 429   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
 430     .legalFor({S32, S64})
 431     .scalarize(0);
 432
 433   if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
 434     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
 435       .legalFor({S32, S64})
 436       .clampScalar(0, S32, S64)
 437       .scalarize(0);
 438   } else {
 439     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
 440       .legalFor({S32})
 441       .customFor({S64})
 442       .clampScalar(0, S32, S64)
 443       .scalarize(0);
 444   }
 445
 446   getActionDefinitionsBuilder(G_GEP)
 447     .legalForCartesianProduct(AddrSpaces64, {S64})
 448     .legalForCartesianProduct(AddrSpaces32, {S32})
 449     .scalarize(0);
 450
 451   getActionDefinitionsBuilder(G_PTR_MASK)
 452     .scalarize(0)
 453     .alwaysLegal();
 454
 455   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
 456
 457   auto &CmpBuilder =
 458     getActionDefinitionsBuilder(G_ICMP)
 459     .legalForCartesianProduct(
 460       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
 461     .legalFor({{S1, S32}, {S1, S64}});
 462   if (ST.has16BitInsts()) {
 463     CmpBuilder.legalFor({{S1, S16}});
 464   }
 465
 466   CmpBuilder
 467     .widenScalarToNextPow2(1)
 468     .clampScalar(1, S32, S64)
 469     .scalarize(0)
 470     .legalIf(all(typeIs(0, S1), isPointer(1)));
 471
 472   getActionDefinitionsBuilder(G_FCMP)
 473     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
 474     .widenScalarToNextPow2(1)
 475     .clampScalar(1, S32, S64)
 476     .scalarize(0);
 477
 478   // FIXME: fexp, flog2, flog10 needs to be custom lowered.
 479   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
 480                                G_FLOG, G_FLOG2, G_FLOG10})
 481     .legalFor({S32})
 482     .scalarize(0);
 483
 484   // The 64-bit versions produce 32-bit results, but only on the SALU.
 485   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
 486                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
 487                                G_CTPOP})
 488     .legalFor({{S32, S32}, {S32, S64}})
 489     .clampScalar(0, S32, S32)
 490     .clampScalar(1, S32, S64)
 491     .scalarize(0)
 492     .widenScalarToNextPow2(0, 32)
 493     .widenScalarToNextPow2(1, 32);
 494
 495   // TODO: Expand for > s32
 496   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
 497     .legalFor({S32})
 498     .clampScalar(0, S32, S32)
 499     .scalarize(0);
 500
 501   if (ST.has16BitInsts()) {
 502     if (ST.hasVOP3PInsts()) {
 503       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
 504         .legalFor({S32, S16, V2S16})
 505         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 506         .clampMaxNumElements(0, S16, 2)
 507         .clampScalar(0, S16, S32)
 508         .widenScalarToNextPow2(0)
 509         .scalarize(0);
 510     } else {
 511       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
 512         .legalFor({S32, S16})
 513         .widenScalarToNextPow2(0)
 514         .clampScalar(0, S16, S32)
 515         .scalarize(0);
 516     }
 517   } else {
 518     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
 519       .legalFor({S32})
 520       .clampScalar(0, S32, S32)
 521       .widenScalarToNextPow2(0)
 522       .scalarize(0);
 523   }
 524
 525   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
 526     return [=](const LegalityQuery &Query) {
 527       return Query.Types[TypeIdx0].getSizeInBits() <
 528              Query.Types[TypeIdx1].getSizeInBits();
 529     };
 530   };
 531
 532   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
 533     return [=](const LegalityQuery &Query) {
 534       return Query.Types[TypeIdx0].getSizeInBits() >
 535              Query.Types[TypeIdx1].getSizeInBits();
 536     };
 537   };
 538
 539   getActionDefinitionsBuilder(G_INTTOPTR)
 540     // List the common cases
 541     .legalForCartesianProduct(AddrSpaces64, {S64})
 542     .legalForCartesianProduct(AddrSpaces32, {S32})
 543     .scalarize(0)
 544     // Accept any address space as long as the size matches
 545     .legalIf(sameSize(0, 1))
 546     .widenScalarIf(smallerThan(1, 0),
 547       [](const LegalityQuery &Query) {
 548         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
 549       })
 550     .narrowScalarIf(greaterThan(1, 0),
 551       [](const LegalityQuery &Query) {
 552         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
 553       });
 554
 555   getActionDefinitionsBuilder(G_PTRTOINT)
 556     // List the common cases
 557     .legalForCartesianProduct(AddrSpaces64, {S64})
 558     .legalForCartesianProduct(AddrSpaces32, {S32})
 559     .scalarize(0)
 560     // Accept any address space as long as the size matches
 561     .legalIf(sameSize(0, 1))
 562     .widenScalarIf(smallerThan(0, 1),
 563       [](const LegalityQuery &Query) {
 564         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
 565       })
 566     .narrowScalarIf(
 567       greaterThan(0, 1),
 568       [](const LegalityQuery &Query) {
 569         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
 570       });
 571
 572   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
 573     .scalarize(0)
 574     .custom();
 575
 576   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
 577   // handle some operations by just promoting the register during
 578   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
 579   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
 580     switch (AS) {
 581     // FIXME: Private element size.
 582     case AMDGPUAS::PRIVATE_ADDRESS:
 583       return 32;
 584     // FIXME: Check subtarget
 585     case AMDGPUAS::LOCAL_ADDRESS:
 586       return ST.useDS128() ? 128 : 64;
 587
 588     // Treat constant and global as identical. SMRD loads are sometimes usable
 589     // for global loads (ideally constant address space should be eliminated)
 590     // depending on the context. Legality cannot be context dependent, but
 591     // RegBankSelect can split the load as necessary depending on the pointer
 592     // register bank/uniformity and if the memory is invariant or not written in
 593     // a kernel.
 594     case AMDGPUAS::CONSTANT_ADDRESS:
 595     case AMDGPUAS::GLOBAL_ADDRESS:
 596       return 512;
 597     default:
 598       return 128;
 599     }
 600   };
 601
 602   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
 603     const LLT DstTy = Query.Types[0];
 604
 605     // Split vector extloads.
 606     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
 607     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
 608       return true;
 609
 610     const LLT PtrTy = Query.Types[1];
 611     unsigned AS = PtrTy.getAddressSpace();
 612     if (MemSize > maxSizeForAddrSpace(AS))
 613       return true;
 614
 615     // Catch weird sized loads that don't evenly divide into the access sizes
 616     // TODO: May be able to widen depending on alignment etc.
 617     unsigned NumRegs = MemSize / 32;
 618     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
 619       return true;
 620
 621     unsigned Align = Query.MMODescrs[0].AlignInBits;
 622     if (Align < MemSize) {
 623       const SITargetLowering *TLI = ST.getTargetLowering();
 624       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
 625     }
 626
 627     return false;
 628   };
 629
 630   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
 631   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
 632   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
 633
 634   // TODO: Refine based on subtargets which support unaligned access or 128-bit
 635   // LDS
 636   // TODO: Unsupported flat for SI.
 637
 638   for (unsigned Op : {G_LOAD, G_STORE}) {
 639     const bool IsStore = Op == G_STORE;
 640
 641     auto &Actions = getActionDefinitionsBuilder(Op);
 642     // Whitelist the common cases.
 643     // TODO: Pointer loads
 644     // TODO: Wide constant loads
 645     // TODO: Only CI+ has 3x loads
 646     // TODO: Loads to s16 on gfx9
 647     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
 648                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
 649                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
 650                                       {S96, GlobalPtr, 96, GlobalAlign32},
 651                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
 652                                       {S128, GlobalPtr, 128, GlobalAlign32},
 653                                       {S64, GlobalPtr, 64, GlobalAlign32},
 654                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
 655                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
 656                                       {S32, GlobalPtr, 8, GlobalAlign8},
 657                                       {S32, GlobalPtr, 16, GlobalAlign16},
 658
 659                                       {S32, LocalPtr, 32, 32},
 660                                       {S64, LocalPtr, 64, 32},
 661                                       {V2S32, LocalPtr, 64, 32},
 662                                       {S32, LocalPtr, 8, 8},
 663                                       {S32, LocalPtr, 16, 16},
 664                                       {V2S16, LocalPtr, 32, 32},
 665
 666                                       {S32, PrivatePtr, 32, 32},
 667                                       {S32, PrivatePtr, 8, 8},
 668                                       {S32, PrivatePtr, 16, 16},
 669                                       {V2S16, PrivatePtr, 32, 32},
 670
 671                                       {S32, FlatPtr, 32, GlobalAlign32},
 672                                       {S32, FlatPtr, 16, GlobalAlign16},
 673                                       {S32, FlatPtr, 8, GlobalAlign8},
 674                                       {V2S16, FlatPtr, 32, GlobalAlign32},
 675
 676                                       {S32, ConstantPtr, 32, GlobalAlign32},
 677                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
 678                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
 679                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
 680                                       {S64, ConstantPtr, 64, GlobalAlign32},
 681                                       {S128, ConstantPtr, 128, GlobalAlign32},
 682                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
 683     Actions
 684         .customIf(typeIs(1, Constant32Ptr))
 685         .narrowScalarIf(
 686             [=](const LegalityQuery &Query) -> bool {
 687               return !Query.Types[0].isVector() && needToSplitLoad(Query);
 688             },
 689             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
 690               const LLT DstTy = Query.Types[0];
 691               const LLT PtrTy = Query.Types[1];
 692
 693               const unsigned DstSize = DstTy.getSizeInBits();
 694               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
 695
 696               // Split extloads.
 697               if (DstSize > MemSize)
 698                 return std::make_pair(0, LLT::scalar(MemSize));
 699
 700               if (DstSize > 32 && (DstSize % 32 != 0)) {
 701                 // FIXME: Need a way to specify non-extload of larger size if
 702                 // suitably aligned.
 703                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
 704               }
 705
 706               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
 707               if (MemSize > MaxSize)
 708                 return std::make_pair(0, LLT::scalar(MaxSize));
 709
 710               unsigned Align = Query.MMODescrs[0].AlignInBits;
 711               return std::make_pair(0, LLT::scalar(Align));
 712             })
 713         .fewerElementsIf(
 714             [=](const LegalityQuery &Query) -> bool {
 715               return Query.Types[0].isVector() && needToSplitLoad(Query);
 716             },
 717             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
 718               const LLT DstTy = Query.Types[0];
 719               const LLT PtrTy = Query.Types[1];
 720
 721               LLT EltTy = DstTy.getElementType();
 722               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
 723
 724               // Split if it's too large for the address space.
 725               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
 726                 unsigned NumElts = DstTy.getNumElements();
 727                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
 728
 729                 // FIXME: Refine when odd breakdowns handled
 730                 // The scalars will need to be re-legalized.
 731                 if (NumPieces == 1 || NumPieces >= NumElts ||
 732                     NumElts % NumPieces != 0)
 733                   return std::make_pair(0, EltTy);
 734
 735                 return std::make_pair(0,
 736                                       LLT::vector(NumElts / NumPieces, EltTy));
 737               }
 738
 739               // Need to split because of alignment.
 740               unsigned Align = Query.MMODescrs[0].AlignInBits;
 741               unsigned EltSize = EltTy.getSizeInBits();
 742               if (EltSize > Align &&
 743                   (EltSize / Align < DstTy.getNumElements())) {
 744                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
 745               }
 746
 747               // May need relegalization for the scalars.
 748               return std::make_pair(0, EltTy);
 749             })
 750         .minScalar(0, S32);
 751
 752     if (IsStore)
 753       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
 754
 755     // TODO: Need a bitcast lower option?
 756     Actions
 757         .legalIf([=](const LegalityQuery &Query) {
 758           const LLT Ty0 = Query.Types[0];
 759           unsigned Size = Ty0.getSizeInBits();
 760           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
 761           unsigned Align = Query.MMODescrs[0].AlignInBits;
 762
 763           // No extending vector loads.
 764           if (Size > MemSize && Ty0.isVector())
 765             return false;
 766
 767           // FIXME: Widening store from alignment not valid.
 768           if (MemSize < Size)
 769             MemSize = std::max(MemSize, Align);
 770
 771           switch (MemSize) {
 772           case 8:
 773           case 16:
 774             return Size == 32;
 775           case 32:
 776           case 64:
 777           case 128:
 778             return true;
 779           case 96:
 780             return ST.hasDwordx3LoadStores();
 781           case 256:
 782           case 512:
 783             return true;
 784           default:
 785             return false;
 786           }
 787         })
 788         .widenScalarToNextPow2(0)
 789         // TODO: v3s32->v4s32 with alignment
 790         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
 791   }
 792
 793   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
 794                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
 795                                                   {S32, GlobalPtr, 16, 2 * 8},
 796                                                   {S32, LocalPtr, 8, 8},
 797                                                   {S32, LocalPtr, 16, 16},
 798                                                   {S32, PrivatePtr, 8, 8},
 799                                                   {S32, PrivatePtr, 16, 16},
 800                                                   {S32, ConstantPtr, 8, 8},
 801                                                   {S32, ConstantPtr, 16, 2 * 8}});
 802   if (ST.hasFlatAddressSpace()) {
 803     ExtLoads.legalForTypesWithMemDesc(
 804         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
 805   }
 806
 807   ExtLoads.clampScalar(0, S32, S32)
 808           .widenScalarToNextPow2(0)
 809           .unsupportedIfMemSizeNotPow2()
 810           .lower();
 811
 812   auto &Atomics = getActionDefinitionsBuilder(
 813     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
 814      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
 815      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
 816      G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
 817     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
 818                {S64, GlobalPtr}, {S64, LocalPtr}});
 819   if (ST.hasFlatAddressSpace()) {
 820     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
 821   }
 822
 823   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
 824     .legalFor({{S32, LocalPtr}});
 825
 826   // TODO: Pointer types, any 32-bit or 64-bit vector
 827   getActionDefinitionsBuilder(G_SELECT)
 828     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
 829           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
 830           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
 831     .clampScalar(0, S16, S64)
 832     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 833     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
 834     .scalarize(1)
 835     .clampMaxNumElements(0, S32, 2)
 836     .clampMaxNumElements(0, LocalPtr, 2)
 837     .clampMaxNumElements(0, PrivatePtr, 2)
 838     .scalarize(0)
 839     .widenScalarToNextPow2(0)
 840     .legalIf(all(isPointer(0), typeIs(1, S1)));
 841
 842   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
 843   // be more flexible with the shift amount type.
 844   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
 845     .legalFor({{S32, S32}, {S64, S32}});
 846   if (ST.has16BitInsts()) {
 847     if (ST.hasVOP3PInsts()) {
 848       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
 849             .clampMaxNumElements(0, S16, 2);
 850     } else
 851       Shifts.legalFor({{S16, S32}, {S16, S16}});
 852
 853     Shifts.clampScalar(1, S16, S32);
 854     Shifts.clampScalar(0, S16, S64);
 855     Shifts.widenScalarToNextPow2(0, 16);
 856   } else {
 857     // Make sure we legalize the shift amount type first, as the general
 858     // expansion for the shifted type will produce much worse code if it hasn't
 859     // been truncated already.
 860     Shifts.clampScalar(1, S32, S32);
 861     Shifts.clampScalar(0, S32, S64);
 862     Shifts.widenScalarToNextPow2(0, 32);
 863   }
 864   Shifts.scalarize(0);
 865
 866   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
 867     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
 868     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
 869     unsigned IdxTypeIdx = 2;
 870
 871     getActionDefinitionsBuilder(Op)
 872       .customIf([=](const LegalityQuery &Query) {
 873           const LLT EltTy = Query.Types[EltTypeIdx];
 874           const LLT VecTy = Query.Types[VecTypeIdx];
 875           const LLT IdxTy = Query.Types[IdxTypeIdx];
 876           return (EltTy.getSizeInBits() == 16 ||
 877                   EltTy.getSizeInBits() % 32 == 0) &&
 878                  VecTy.getSizeInBits() % 32 == 0 &&
 879                  VecTy.getSizeInBits() <= 512 &&
 880                  IdxTy.getSizeInBits() == 32;
 881         })
 882       .clampScalar(EltTypeIdx, S32, S64)
 883       .clampScalar(VecTypeIdx, S32, S64)
 884       .clampScalar(IdxTypeIdx, S32, S32);
 885   }
 886
 887   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
 888     .unsupportedIf([=](const LegalityQuery &Query) {
 889         const LLT &EltTy = Query.Types[1].getElementType();
 890         return Query.Types[0] != EltTy;
 891       });
 892
 893   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
 894     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
 895     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
 896
 897     // FIXME: Doesn't handle extract of illegal sizes.
 898     getActionDefinitionsBuilder(Op)
 899       .legalIf([=](const LegalityQuery &Query) {
 900           const LLT BigTy = Query.Types[BigTyIdx];
 901           const LLT LitTy = Query.Types[LitTyIdx];
 902           return (BigTy.getSizeInBits() % 32 == 0) &&
 903                  (LitTy.getSizeInBits() % 16 == 0);
 904         })
 905       .widenScalarIf(
 906         [=](const LegalityQuery &Query) {
 907           const LLT BigTy = Query.Types[BigTyIdx];
 908           return (BigTy.getScalarSizeInBits() < 16);
 909         },
 910         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
 911       .widenScalarIf(
 912         [=](const LegalityQuery &Query) {
 913           const LLT LitTy = Query.Types[LitTyIdx];
 914           return (LitTy.getScalarSizeInBits() < 16);
 915         },
 916         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
 917       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
 918       .widenScalarToNextPow2(BigTyIdx, 32);
 919
 920   }
 921
 922   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
 923     .legalForCartesianProduct(AllS32Vectors, {S32})
 924     .legalForCartesianProduct(AllS64Vectors, {S64})
 925     .clampNumElements(0, V16S32, V16S32)
 926     .clampNumElements(0, V2S64, V8S64);
 927
 928   if (ST.hasScalarPackInsts())
 929     BuildVector.legalFor({V2S16, S32});
 930
 931   BuildVector
 932     .minScalarSameAs(1, 0)
 933     .legalIf(isRegisterType(0))
 934     .minScalarOrElt(0, S32);
 935
 936   if (ST.hasScalarPackInsts()) {
 937     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
 938       .legalFor({V2S16, S32})
 939       .lower();
 940   } else {
 941     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
 942       .lower();
 943   }
 944
 945   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
 946     .legalIf(isRegisterType(0));
 947
 948   // TODO: Don't fully scalarize v2s16 pieces
 949   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
 950
 951   // Merge/Unmerge
 952   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
 953     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
 954     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
 955
 956     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
 957       const LLT &Ty = Query.Types[TypeIdx];
 958       if (Ty.isVector()) {
 959         const LLT &EltTy = Ty.getElementType();
 960         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
 961           return true;
 962         if (!isPowerOf2_32(EltTy.getSizeInBits()))
 963           return true;
 964       }
 965       return false;
 966     };
 967
 968     getActionDefinitionsBuilder(Op)
 969       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
 970       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
 971       // worth considering the multiples of 64 since 2*192 and 2*384 are not
 972       // valid.
 973       .clampScalar(LitTyIdx, S16, S256)
 974       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
 975       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
 976       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
 977                            elementTypeIs(1, S16)),
 978                        changeTo(1, V2S16))
 979       // Break up vectors with weird elements into scalars
 980       .fewerElementsIf(
 981         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
 982         scalarize(0))
 983       .fewerElementsIf(
 984         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
 985         scalarize(1))
 986       .clampScalar(BigTyIdx, S32, S512)
 987       .lowerFor({{S16, V2S16}})
 988       .widenScalarIf(
 989         [=](const LegalityQuery &Query) {
 990           const LLT &Ty = Query.Types[BigTyIdx];
 991           return !isPowerOf2_32(Ty.getSizeInBits()) &&
 992                  Ty.getSizeInBits() % 16 != 0;
 993         },
 994         [=](const LegalityQuery &Query) {
 995           // Pick the next power of 2, or a multiple of 64 over 128.
 996           // Whichever is smaller.
 997           const LLT &Ty = Query.Types[BigTyIdx];
 998           unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
 999           if (NewSizeInBits >= 256) {
1000             unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1001             if (RoundedTo < NewSizeInBits)
1002               NewSizeInBits = RoundedTo;
1003           }
1004           return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1005         })
1006       .legalIf([=](const LegalityQuery &Query) {
1007           const LLT &BigTy = Query.Types[BigTyIdx];
1008           const LLT &LitTy = Query.Types[LitTyIdx];
1009
1010           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1011             return false;
1012           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1013             return false;
1014
1015           return BigTy.getSizeInBits() % 16 == 0 &&
1016                  LitTy.getSizeInBits() % 16 == 0 &&
1017                  BigTy.getSizeInBits() <= 512;
1018         })
1019       // Any vectors left are the wrong size. Scalarize them.
1020       .scalarize(0)
1021       .scalarize(1);
1022   }
1023
1024   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1025
1026   computeTables();
1027   verify(*ST.getInstrInfo());
1028 }
1029
1030 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1031                                          MachineRegisterInfo &MRI,
1032                                          MachineIRBuilder &B,
1033                                          GISelChangeObserver &Observer) const {
1034   switch (MI.getOpcode()) {
1035   case TargetOpcode::G_ADDRSPACE_CAST:
1036     return legalizeAddrSpaceCast(MI, MRI, B);
1037   case TargetOpcode::G_FRINT:
1038     return legalizeFrint(MI, MRI, B);
1039   case TargetOpcode::G_FCEIL:
1040     return legalizeFceil(MI, MRI, B);
1041   case TargetOpcode::G_INTRINSIC_TRUNC:
1042     return legalizeIntrinsicTrunc(MI, MRI, B);
1043   case TargetOpcode::G_SITOFP:
1044     return legalizeITOFP(MI, MRI, B, true);
1045   case TargetOpcode::G_UITOFP:
1046     return legalizeITOFP(MI, MRI, B, false);
1047   case TargetOpcode::G_FMINNUM:
1048   case TargetOpcode::G_FMAXNUM:
1049   case TargetOpcode::G_FMINNUM_IEEE:
1050   case TargetOpcode::G_FMAXNUM_IEEE:
1051     return legalizeMinNumMaxNum(MI, MRI, B);
1052   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1053     return legalizeExtractVectorElt(MI, MRI, B);
1054   case TargetOpcode::G_INSERT_VECTOR_ELT:
1055     return legalizeInsertVectorElt(MI, MRI, B);
1056   case TargetOpcode::G_FSIN:
1057   case TargetOpcode::G_FCOS:
1058     return legalizeSinCos(MI, MRI, B);
1059   case TargetOpcode::G_GLOBAL_VALUE:
1060     return legalizeGlobalValue(MI, MRI, B);
1061   case TargetOpcode::G_LOAD:
1062     return legalizeLoad(MI, MRI, B, Observer);
1063   case TargetOpcode::G_FMAD:
1064     return legalizeFMad(MI, MRI, B);
1065   default:
1066     return false;
1067   }
1068
1069   llvm_unreachable("expected switch to return");
1070 }
1071
1072 Register AMDGPULegalizerInfo::getSegmentAperture(
1073   unsigned AS,
1074   MachineRegisterInfo &MRI,
1075   MachineIRBuilder &B) const {
1076   MachineFunction &MF = B.getMF();
1077   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1078   const LLT S32 = LLT::scalar(32);
1079
1080   if (ST.hasApertureRegs()) {
1081     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1082     // getreg.
1083     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1084         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1085         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1086     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1087         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1088         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1089     unsigned Encoding =
1090         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1091         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1092         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1093
1094     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1095     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1096
1097     B.buildInstr(AMDGPU::S_GETREG_B32)
1098       .addDef(GetReg)
1099       .addImm(Encoding);
1100     MRI.setType(GetReg, S32);
1101
1102     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1103     B.buildInstr(TargetOpcode::G_SHL)
1104       .addDef(ApertureReg)
1105       .addUse(GetReg)
1106       .addUse(ShiftAmt.getReg(0));
1107
1108     return ApertureReg;
1109   }
1110
1111   Register QueuePtr = MRI.createGenericVirtualRegister(
1112     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1113
1114   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1115   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1116     return Register();
1117
1118   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1119   // private_segment_aperture_base_hi.
1120   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1121
1122   // FIXME: Don't use undef
1123   Value *V = UndefValue::get(PointerType::get(
1124                                Type::getInt8Ty(MF.getFunction().getContext()),
1125                                AMDGPUAS::CONSTANT_ADDRESS));
1126
1127   MachinePointerInfo PtrInfo(V, StructOffset);
1128   MachineMemOperand *MMO = MF.getMachineMemOperand(
1129     PtrInfo,
1130     MachineMemOperand::MOLoad |
1131     MachineMemOperand::MODereferenceable |
1132     MachineMemOperand::MOInvariant,
1133     4,
1134     MinAlign(64, StructOffset));
1135
1136   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1137   Register LoadAddr;
1138
1139   B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1140   B.buildLoad(LoadResult, LoadAddr, *MMO);
1141   return LoadResult;
1142 }
1143
1144 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1145   MachineInstr &MI, MachineRegisterInfo &MRI,
1146   MachineIRBuilder &B) const {
1147   MachineFunction &MF = B.getMF();
1148
1149   B.setInstr(MI);
1150
1151   const LLT S32 = LLT::scalar(32);
1152   Register Dst = MI.getOperand(0).getReg();
1153   Register Src = MI.getOperand(1).getReg();
1154
1155   LLT DstTy = MRI.getType(Dst);
1156   LLT SrcTy = MRI.getType(Src);
1157   unsigned DestAS = DstTy.getAddressSpace();
1158   unsigned SrcAS = SrcTy.getAddressSpace();
1159
1160   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1161   // vector element.
1162   assert(!DstTy.isVector());
1163
1164   const AMDGPUTargetMachine &TM
1165     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1166
1167   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1168   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1169     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1170     return true;
1171   }
1172
1173   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1174     // Truncate.
1175     B.buildExtract(Dst, Src, 0);
1176     MI.eraseFromParent();
1177     return true;
1178   }
1179
1180   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1181     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1182     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1183
1184     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1185     // another. Merge operands are required to be the same type, but creating an
1186     // extra ptrtoint would be kind of pointless.
1187     auto HighAddr = B.buildConstant(
1188       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1189     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1190     MI.eraseFromParent();
1191     return true;
1192   }
1193
1194   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1195     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1196            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1197     unsigned NullVal = TM.getNullPointerValue(DestAS);
1198
1199     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1200     auto FlatNull = B.buildConstant(SrcTy, 0);
1201
1202     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1203
1204     // Extract low 32-bits of the pointer.
1205     B.buildExtract(PtrLo32, Src, 0);
1206
1207     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1208     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1209     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1210
1211     MI.eraseFromParent();
1212     return true;
1213   }
1214
1215   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1216     return false;
1217
1218   if (!ST.hasFlatAddressSpace())
1219     return false;
1220
1221   auto SegmentNull =
1222       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1223   auto FlatNull =
1224       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1225
1226   Register ApertureReg = getSegmentAperture(DestAS, MRI, B);
1227   if (!ApertureReg.isValid())
1228     return false;
1229
1230   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1231   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1232
1233   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1234
1235   // Coerce the type of the low half of the result so we can use merge_values.
1236   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1237   B.buildInstr(TargetOpcode::G_PTRTOINT)
1238     .addDef(SrcAsInt)
1239     .addUse(Src);
1240
1241   // TODO: Should we allow mismatched types but matching sizes in merges to
1242   // avoid the ptrtoint?
1243   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1244   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1245
1246   MI.eraseFromParent();
1247   return true;
1248 }
1249
1250 bool AMDGPULegalizerInfo::legalizeFrint(
1251   MachineInstr &MI, MachineRegisterInfo &MRI,
1252   MachineIRBuilder &B) const {
1253   B.setInstr(MI);
1254
1255   Register Src = MI.getOperand(1).getReg();
1256   LLT Ty = MRI.getType(Src);
1257   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1258
1259   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1260   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1261
1262   auto C1 = B.buildFConstant(Ty, C1Val);
1263   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1264
1265   // TODO: Should this propagate fast-math-flags?
1266   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1267   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1268
1269   auto C2 = B.buildFConstant(Ty, C2Val);
1270   auto Fabs = B.buildFAbs(Ty, Src);
1271
1272   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1273   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1274   return true;
1275 }
1276
1277 bool AMDGPULegalizerInfo::legalizeFceil(
1278   MachineInstr &MI, MachineRegisterInfo &MRI,
1279   MachineIRBuilder &B) const {
1280   B.setInstr(MI);
1281
1282   const LLT S1 = LLT::scalar(1);
1283   const LLT S64 = LLT::scalar(64);
1284
1285   Register Src = MI.getOperand(1).getReg();
1286   assert(MRI.getType(Src) == S64);
1287
1288   // result = trunc(src)
1289   // if (src > 0.0 && src != result)
1290   //   result += 1.0
1291
1292   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1293
1294   const auto Zero = B.buildFConstant(S64, 0.0);
1295   const auto One = B.buildFConstant(S64, 1.0);
1296   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1297   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1298   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1299   auto Add = B.buildSelect(S64, And, One, Zero);
1300
1301   // TODO: Should this propagate fast-math-flags?
1302   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1303   return true;
1304 }
1305
1306 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1307                                               MachineIRBuilder &B) {
1308   const unsigned FractBits = 52;
1309   const unsigned ExpBits = 11;
1310   LLT S32 = LLT::scalar(32);
1311
1312   auto Const0 = B.buildConstant(S32, FractBits - 32);
1313   auto Const1 = B.buildConstant(S32, ExpBits);
1314
1315   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1316     .addUse(Const0.getReg(0))
1317     .addUse(Const1.getReg(0));
1318
1319   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1320 }
1321
1322 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1323   MachineInstr &MI, MachineRegisterInfo &MRI,
1324   MachineIRBuilder &B) const {
1325   B.setInstr(MI);
1326
1327   const LLT S1 = LLT::scalar(1);
1328   const LLT S32 = LLT::scalar(32);
1329   const LLT S64 = LLT::scalar(64);
1330
1331   Register Src = MI.getOperand(1).getReg();
1332   assert(MRI.getType(Src) == S64);
1333
1334   // TODO: Should this use extract since the low half is unused?
1335   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1336   Register Hi = Unmerge.getReg(1);
1337
1338   // Extract the upper half, since this is where we will find the sign and
1339   // exponent.
1340   auto Exp = extractF64Exponent(Hi, B);
1341
1342   const unsigned FractBits = 52;
1343
1344   // Extract the sign bit.
1345   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1346   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1347
1348   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1349
1350   const auto Zero32 = B.buildConstant(S32, 0);
1351
1352   // Extend back to 64-bits.
1353   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1354
1355   auto Shr = B.buildAShr(S64, FractMask, Exp);
1356   auto Not = B.buildNot(S64, Shr);
1357   auto Tmp0 = B.buildAnd(S64, Src, Not);
1358   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1359
1360   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1361   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1362
1363   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1364   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1365   return true;
1366 }
1367
1368 bool AMDGPULegalizerInfo::legalizeITOFP(
1369   MachineInstr &MI, MachineRegisterInfo &MRI,
1370   MachineIRBuilder &B, bool Signed) const {
1371   B.setInstr(MI);
1372
1373   Register Dst = MI.getOperand(0).getReg();
1374   Register Src = MI.getOperand(1).getReg();
1375
1376   const LLT S64 = LLT::scalar(64);
1377   const LLT S32 = LLT::scalar(32);
1378
1379   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1380
1381   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1382
1383   auto CvtHi = Signed ?
1384     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1385     B.buildUITOFP(S64, Unmerge.getReg(1));
1386
1387   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1388
1389   auto ThirtyTwo = B.buildConstant(S32, 32);
1390   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1391     .addUse(CvtHi.getReg(0))
1392     .addUse(ThirtyTwo.getReg(0));
1393
1394   // TODO: Should this propagate fast-math-flags?
1395   B.buildFAdd(Dst, LdExp, CvtLo);
1396   MI.eraseFromParent();
1397   return true;
1398 }
1399
1400 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1401   MachineInstr &MI, MachineRegisterInfo &MRI,
1402   MachineIRBuilder &B) const {
1403   MachineFunction &MF = B.getMF();
1404   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1405
1406   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1407                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1408
1409   // With ieee_mode disabled, the instructions have the correct behavior
1410   // already for G_FMINNUM/G_FMAXNUM
1411   if (!MFI->getMode().IEEE)
1412     return !IsIEEEOp;
1413
1414   if (IsIEEEOp)
1415     return true;
1416
1417   MachineIRBuilder HelperBuilder(MI);
1418   GISelObserverWrapper DummyObserver;
1419   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1420   HelperBuilder.setInstr(MI);
1421   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1422 }
1423
1424 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1425   MachineInstr &MI, MachineRegisterInfo &MRI,
1426   MachineIRBuilder &B) const {
1427   // TODO: Should move some of this into LegalizerHelper.
1428
1429   // TODO: Promote dynamic indexing of s16 to s32
1430   // TODO: Dynamic s64 indexing is only legal for SGPR.
1431   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1432   if (!IdxVal) // Dynamic case will be selected to register indexing.
1433     return true;
1434
1435   Register Dst = MI.getOperand(0).getReg();
1436   Register Vec = MI.getOperand(1).getReg();
1437
1438   LLT VecTy = MRI.getType(Vec);
1439   LLT EltTy = VecTy.getElementType();
1440   assert(EltTy == MRI.getType(Dst));
1441
1442   B.setInstr(MI);
1443
1444   if (IdxVal.getValue() < VecTy.getNumElements())
1445     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1446   else
1447     B.buildUndef(Dst);
1448
1449   MI.eraseFromParent();
1450   return true;
1451 }
1452
1453 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1454   MachineInstr &MI, MachineRegisterInfo &MRI,
1455   MachineIRBuilder &B) const {
1456   // TODO: Should move some of this into LegalizerHelper.
1457
1458   // TODO: Promote dynamic indexing of s16 to s32
1459   // TODO: Dynamic s64 indexing is only legal for SGPR.
1460   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1461   if (!IdxVal) // Dynamic case will be selected to register indexing.
1462     return true;
1463
1464   Register Dst = MI.getOperand(0).getReg();
1465   Register Vec = MI.getOperand(1).getReg();
1466   Register Ins = MI.getOperand(2).getReg();
1467
1468   LLT VecTy = MRI.getType(Vec);
1469   LLT EltTy = VecTy.getElementType();
1470   assert(EltTy == MRI.getType(Ins));
1471
1472   B.setInstr(MI);
1473
1474   if (IdxVal.getValue() < VecTy.getNumElements())
1475     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1476   else
1477     B.buildUndef(Dst);
1478
1479   MI.eraseFromParent();
1480   return true;
1481 }
1482
1483 bool AMDGPULegalizerInfo::legalizeSinCos(
1484   MachineInstr &MI, MachineRegisterInfo &MRI,
1485   MachineIRBuilder &B) const {
1486   B.setInstr(MI);
1487
1488   Register DstReg = MI.getOperand(0).getReg();
1489   Register SrcReg = MI.getOperand(1).getReg();
1490   LLT Ty = MRI.getType(DstReg);
1491   unsigned Flags = MI.getFlags();
1492
1493   Register TrigVal;
1494   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1495   if (ST.hasTrigReducedRange()) {
1496     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1497     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1498       .addUse(MulVal.getReg(0))
1499       .setMIFlags(Flags).getReg(0);
1500   } else
1501     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1502
1503   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1504     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1505   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1506     .addUse(TrigVal)
1507     .setMIFlags(Flags);
1508   MI.eraseFromParent();
1509   return true;
1510 }
1511
1512 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1513   MachineInstr &MI, MachineRegisterInfo &MRI,
1514   MachineIRBuilder &B) const {
1515   Register DstReg = MI.getOperand(0).getReg();
1516   LLT Ty = MRI.getType(DstReg);
1517   unsigned AS = Ty.getAddressSpace();
1518
1519   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1520   MachineFunction &MF = B.getMF();
1521   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1522
1523   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1524     B.setInstr(MI);
1525
1526     if (!MFI->isEntryFunction()) {
1527       const Function &Fn = MF.getFunction();
1528       DiagnosticInfoUnsupported BadLDSDecl(
1529         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1530       Fn.getContext().diagnose(BadLDSDecl);
1531     }
1532
1533     // TODO: We could emit code to handle the initialization somewhere.
1534     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1535       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1536       MI.eraseFromParent();
1537       return true;
1538     }
1539   } else
1540     return false;
1541
1542   const Function &Fn = MF.getFunction();
1543   DiagnosticInfoUnsupported BadInit(
1544     Fn, "unsupported initializer for address space", MI.getDebugLoc());
1545   Fn.getContext().diagnose(BadInit);
1546   return true;
1547 }
1548
1549 bool AMDGPULegalizerInfo::legalizeLoad(
1550   MachineInstr &MI, MachineRegisterInfo &MRI,
1551   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1552   B.setInstr(MI);
1553   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1554   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1555   Observer.changingInstr(MI);
1556   MI.getOperand(1).setReg(Cast.getReg(0));
1557   Observer.changedInstr(MI);
1558   return true;
1559 }
1560
1561 bool AMDGPULegalizerInfo::legalizeFMad(
1562   MachineInstr &MI, MachineRegisterInfo &MRI,
1563   MachineIRBuilder &B) const {
1564   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1565   assert(Ty.isScalar());
1566
1567   // TODO: Always legal with future ftz flag.
1568   if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1569     return true;
1570   if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1571     return true;
1572
1573   MachineFunction &MF = B.getMF();
1574
1575   MachineIRBuilder HelperBuilder(MI);
1576   GISelObserverWrapper DummyObserver;
1577   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1578   HelperBuilder.setMBB(*MI.getParent());
1579   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1580 }
1581
1582 // Return the use branch instruction, otherwise null if the usage is invalid.
1583 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1584                                        MachineRegisterInfo &MRI) {
1585   Register CondDef = MI.getOperand(0).getReg();
1586   if (!MRI.hasOneNonDBGUse(CondDef))
1587     return nullptr;
1588
1589   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1590   return UseMI.getParent() == MI.getParent() &&
1591     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1592 }
1593
1594 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1595                                                 Register Reg, LLT Ty) const {
1596   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1597   if (LiveIn)
1598     return LiveIn;
1599
1600   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1601   MRI.addLiveIn(Reg, NewReg);
1602   return NewReg;
1603 }
1604
1605 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1606                                          const ArgDescriptor *Arg) const {
1607   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1608     return false; // TODO: Handle these
1609
1610   assert(Arg->getRegister().isPhysical());
1611
1612   MachineRegisterInfo &MRI = *B.getMRI();
1613
1614   LLT Ty = MRI.getType(DstReg);
1615   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1616
1617   if (Arg->isMasked()) {
1618     // TODO: Should we try to emit this once in the entry block?
1619     const LLT S32 = LLT::scalar(32);
1620     const unsigned Mask = Arg->getMask();
1621     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1622
1623     auto ShiftAmt = B.buildConstant(S32, Shift);
1624     auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
1625     B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
1626   } else
1627     B.buildCopy(DstReg, LiveIn);
1628
1629   // Insert the argument copy if it doens't already exist.
1630   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1631   if (!MRI.getVRegDef(LiveIn)) {
1632     // FIXME: Should have scoped insert pt
1633     MachineBasicBlock &OrigInsBB = B.getMBB();
1634     auto OrigInsPt = B.getInsertPt();
1635
1636     MachineBasicBlock &EntryMBB = B.getMF().front();
1637     EntryMBB.addLiveIn(Arg->getRegister());
1638     B.setInsertPt(EntryMBB, EntryMBB.begin());
1639     B.buildCopy(LiveIn, Arg->getRegister());
1640
1641     B.setInsertPt(OrigInsBB, OrigInsPt);
1642   }
1643
1644   return true;
1645 }
1646
1647 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1648   MachineInstr &MI,
1649   MachineRegisterInfo &MRI,
1650   MachineIRBuilder &B,
1651   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1652   B.setInstr(MI);
1653
1654   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1655
1656   const ArgDescriptor *Arg;
1657   const TargetRegisterClass *RC;
1658   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1659   if (!Arg) {
1660     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1661     return false;
1662   }
1663
1664   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1665     MI.eraseFromParent();
1666     return true;
1667   }
1668
1669   return false;
1670 }
1671
1672 bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
1673                                            MachineRegisterInfo &MRI,
1674                                            MachineIRBuilder &B) const {
1675   B.setInstr(MI);
1676   Register Res = MI.getOperand(0).getReg();
1677   Register LHS = MI.getOperand(2).getReg();
1678   Register RHS = MI.getOperand(3).getReg();
1679   uint16_t Flags = MI.getFlags();
1680
1681   LLT S32 = LLT::scalar(32);
1682   LLT S1 = LLT::scalar(1);
1683
1684   auto Abs = B.buildFAbs(S32, RHS, Flags);
1685   const APFloat C0Val(1.0f);
1686
1687   auto C0 = B.buildConstant(S32, 0x6f800000);
1688   auto C1 = B.buildConstant(S32, 0x2f800000);
1689   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1690
1691   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1692   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1693
1694   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1695
1696   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1697     .addUse(Mul0.getReg(0))
1698     .setMIFlags(Flags);
1699
1700   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1701
1702   B.buildFMul(Res, Sel, Mul1, Flags);
1703
1704   MI.eraseFromParent();
1705   return true;
1706 }
1707
1708 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1709                                                  MachineRegisterInfo &MRI,
1710                                                  MachineIRBuilder &B) const {
1711   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1712   if (!MFI->isEntryFunction()) {
1713     return legalizePreloadedArgIntrin(MI, MRI, B,
1714                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1715   }
1716
1717   B.setInstr(MI);
1718
1719   uint64_t Offset =
1720     ST.getTargetLowering()->getImplicitParameterOffset(
1721       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1722   Register DstReg = MI.getOperand(0).getReg();
1723   LLT DstTy = MRI.getType(DstReg);
1724   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1725
1726   const ArgDescriptor *Arg;
1727   const TargetRegisterClass *RC;
1728   std::tie(Arg, RC)
1729     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1730   if (!Arg)
1731     return false;
1732
1733   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1734   if (!loadInputValue(KernargPtrReg, B, Arg))
1735     return false;
1736
1737   B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1738   MI.eraseFromParent();
1739   return true;
1740 }
1741
1742 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
1743                                               MachineRegisterInfo &MRI,
1744                                               MachineIRBuilder &B,
1745                                               unsigned AddrSpace) const {
1746   B.setInstr(MI);
1747   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
1748   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
1749   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
1750   MI.eraseFromParent();
1751   return true;
1752 }
1753
1754 /// Handle register layout difference for f16 images for some subtargets.
1755 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
1756                                              MachineRegisterInfo &MRI,
1757                                              Register Reg) const {
1758   if (!ST.hasUnpackedD16VMem())
1759     return Reg;
1760
1761   const LLT S16 = LLT::scalar(16);
1762   const LLT S32 = LLT::scalar(32);
1763   LLT StoreVT = MRI.getType(Reg);
1764   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
1765
1766   auto Unmerge = B.buildUnmerge(S16, Reg);
1767
1768   SmallVector<Register, 4> WideRegs;
1769   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1770     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
1771
1772   int NumElts = StoreVT.getNumElements();
1773
1774   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1775 }
1776
1777 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
1778                                                  MachineRegisterInfo &MRI,
1779                                                  MachineIRBuilder &B,
1780                                                  bool IsFormat) const {
1781   // TODO: Reject f16 format on targets where unsupported.
1782   Register VData = MI.getOperand(1).getReg();
1783   LLT Ty = MRI.getType(VData);
1784
1785   B.setInstr(MI);
1786
1787   const LLT S32 = LLT::scalar(32);
1788   const LLT S16 = LLT::scalar(16);
1789
1790   // Fixup illegal register types for i8 stores.
1791   if (Ty == LLT::scalar(8) || Ty == S16) {
1792     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
1793     MI.getOperand(1).setReg(AnyExt);
1794     return true;
1795   }
1796
1797   if (Ty.isVector()) {
1798     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
1799       if (IsFormat)
1800         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
1801       return true;
1802     }
1803
1804     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
1805   }
1806
1807   return Ty == S32;
1808 }
1809
1810 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1811                                             MachineRegisterInfo &MRI,
1812                                             MachineIRBuilder &B) const {
1813   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
1814   switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1815   case Intrinsic::amdgcn_if: {
1816     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1817       const SIRegisterInfo *TRI
1818         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1819
1820       B.setInstr(*BrCond);
1821       Register Def = MI.getOperand(1).getReg();
1822       Register Use = MI.getOperand(3).getReg();
1823       B.buildInstr(AMDGPU::SI_IF)
1824         .addDef(Def)
1825         .addUse(Use)
1826         .addMBB(BrCond->getOperand(1).getMBB());
1827
1828       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1829       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1830       MI.eraseFromParent();
1831       BrCond->eraseFromParent();
1832       return true;
1833     }
1834
1835     return false;
1836   }
1837   case Intrinsic::amdgcn_loop: {
1838     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1839       const SIRegisterInfo *TRI
1840         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1841
1842       B.setInstr(*BrCond);
1843       Register Reg = MI.getOperand(2).getReg();
1844       B.buildInstr(AMDGPU::SI_LOOP)
1845         .addUse(Reg)
1846         .addMBB(BrCond->getOperand(1).getMBB());
1847       MI.eraseFromParent();
1848       BrCond->eraseFromParent();
1849       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1850       return true;
1851     }
1852
1853     return false;
1854   }
1855   case Intrinsic::amdgcn_kernarg_segment_ptr:
1856     return legalizePreloadedArgIntrin(
1857       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1858   case Intrinsic::amdgcn_implicitarg_ptr:
1859     return legalizeImplicitArgPtr(MI, MRI, B);
1860   case Intrinsic::amdgcn_workitem_id_x:
1861     return legalizePreloadedArgIntrin(MI, MRI, B,
1862                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
1863   case Intrinsic::amdgcn_workitem_id_y:
1864     return legalizePreloadedArgIntrin(MI, MRI, B,
1865                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
1866   case Intrinsic::amdgcn_workitem_id_z:
1867     return legalizePreloadedArgIntrin(MI, MRI, B,
1868                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
1869   case Intrinsic::amdgcn_workgroup_id_x:
1870     return legalizePreloadedArgIntrin(MI, MRI, B,
1871                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
1872   case Intrinsic::amdgcn_workgroup_id_y:
1873     return legalizePreloadedArgIntrin(MI, MRI, B,
1874                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
1875   case Intrinsic::amdgcn_workgroup_id_z:
1876     return legalizePreloadedArgIntrin(MI, MRI, B,
1877                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
1878   case Intrinsic::amdgcn_dispatch_ptr:
1879     return legalizePreloadedArgIntrin(MI, MRI, B,
1880                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
1881   case Intrinsic::amdgcn_queue_ptr:
1882     return legalizePreloadedArgIntrin(MI, MRI, B,
1883                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
1884   case Intrinsic::amdgcn_implicit_buffer_ptr:
1885     return legalizePreloadedArgIntrin(
1886       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
1887   case Intrinsic::amdgcn_dispatch_id:
1888     return legalizePreloadedArgIntrin(MI, MRI, B,
1889                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
1890   case Intrinsic::amdgcn_fdiv_fast:
1891     return legalizeFDIVFast(MI, MRI, B);
1892   case Intrinsic::amdgcn_is_shared:
1893     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
1894   case Intrinsic::amdgcn_is_private:
1895     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
1896   case Intrinsic::amdgcn_wavefrontsize: {
1897     B.setInstr(MI);
1898     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
1899     MI.eraseFromParent();
1900     return true;
1901   }
1902   case Intrinsic::amdgcn_raw_buffer_store:
1903     return legalizeRawBufferStore(MI, MRI, B, false);
1904   case Intrinsic::amdgcn_raw_buffer_store_format:
1905     return legalizeRawBufferStore(MI, MRI, B, true);
1906   default:
1907     return true;
1908   }
1909
1910   return true;
1911 }