llvm-complete.git: lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
blob 37222d9988e9a8c611691241fa31f06e95e496f8
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPU.h"
15 #include "AMDGPULegalizerInfo.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "SIMachineFunctionInfo.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
19 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
20 #include "llvm/CodeGen/TargetOpcodes.h"
21 #include "llvm/CodeGen/ValueTypes.h"
22 #include "llvm/IR/DerivedTypes.h"
23 #include "llvm/IR/Type.h"
24 #include "llvm/Support/Debug.h"
26 #define DEBUG_TYPE "amdgpu-legalinfo"
28 using namespace llvm;
29 using namespace LegalizeActions;
30 using namespace LegalizeMutations;
31 using namespace LegalityPredicates;
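// Predicate: the type at TypeIdx is at most MaxSize bits wide and its scalar
// element size is a multiple of 32 bits.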
34 static LegalityPredicate isMultiple32(unsigned TypeIdx,
35 unsigned MaxSize = 512) {
36 return [=](const LegalityQuery &Query) {
37 const LLT Ty = Query.Types[TypeIdx];
38 const LLT EltTy = Ty.getScalarType();
39 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
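// Predicate: a vector with an odd number of elements whose element type is
// narrower than 32 bits.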
43 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
44 return [=](const LegalityQuery &Query) {
45 const LLT Ty = Query.Types[TypeIdx];
46 return Ty.isVector() &&
47 Ty.getNumElements() % 2 != 0 &&
48 Ty.getElementType().getSizeInBits() < 32;
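// Mutation: grow the vector at TypeIdx by one element, used to round small
// odd-element vectors up to an even element count.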
52 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
53 return [=](const LegalityQuery &Query) {
54 const LLT Ty = Query.Types[TypeIdx];
55 const LLT EltTy = Ty.getElementType();
56 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
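// Mutation: reduce the element count of the vector at TypeIdx so that each
// resulting piece is roughly 64 bits wide.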
60 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
61 return [=](const LegalityQuery &Query) {
62 const LLT Ty = Query.Types[TypeIdx];
63 const LLT EltTy = Ty.getElementType();
64 unsigned Size = Ty.getSizeInBits();
65 unsigned Pieces = (Size + 63) / 64;
66 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
67 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
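// Predicate: the type at TypeIdx is a vector wider than Size bits.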
71 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
72 return [=](const LegalityQuery &Query) {
73 const LLT QueryTy = Query.Types[TypeIdx];
74 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
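// Predicate: the type at TypeIdx is a vector with an odd number of elements.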
78 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
79 return [=](const LegalityQuery &Query) {
80 const LLT QueryTy = Query.Types[TypeIdx];
81 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
85 // Any combination of 32- or 64-bit elements up to 512 bits, and multiples of
86 // v2s16.
87 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
88 return [=](const LegalityQuery &Query) {
89 const LLT Ty = Query.Types[TypeIdx];
90 if (Ty.isVector()) {
91 const int EltSize = Ty.getElementType().getSizeInBits();
92 return EltSize == 32 || EltSize == 64 ||
93 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
94 EltSize == 128 || EltSize == 256;
97 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
101 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
102 const GCNTargetMachine &TM)
103 : ST(ST_) {
104 using namespace TargetOpcode;
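// Build an LLT pointer type for the given address space, using the pointer
// width the target machine reports for that space.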
106 auto GetAddrSpacePtr = [&TM](unsigned AS) {
107 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
110 const LLT S1 = LLT::scalar(1);
111 const LLT S8 = LLT::scalar(8);
112 const LLT S16 = LLT::scalar(16);
113 const LLT S32 = LLT::scalar(32);
114 const LLT S64 = LLT::scalar(64);
115 const LLT S128 = LLT::scalar(128);
116 const LLT S256 = LLT::scalar(256);
117 const LLT S512 = LLT::scalar(512);
119 const LLT V2S16 = LLT::vector(2, 16);
120 const LLT V4S16 = LLT::vector(4, 16);
122 const LLT V2S32 = LLT::vector(2, 32);
123 const LLT V3S32 = LLT::vector(3, 32);
124 const LLT V4S32 = LLT::vector(4, 32);
125 const LLT V5S32 = LLT::vector(5, 32);
126 const LLT V6S32 = LLT::vector(6, 32);
127 const LLT V7S32 = LLT::vector(7, 32);
128 const LLT V8S32 = LLT::vector(8, 32);
129 const LLT V9S32 = LLT::vector(9, 32);
130 const LLT V10S32 = LLT::vector(10, 32);
131 const LLT V11S32 = LLT::vector(11, 32);
132 const LLT V12S32 = LLT::vector(12, 32);
133 const LLT V13S32 = LLT::vector(13, 32);
134 const LLT V14S32 = LLT::vector(14, 32);
135 const LLT V15S32 = LLT::vector(15, 32);
136 const LLT V16S32 = LLT::vector(16, 32);
138 const LLT V2S64 = LLT::vector(2, 64);
139 const LLT V3S64 = LLT::vector(3, 64);
140 const LLT V4S64 = LLT::vector(4, 64);
141 const LLT V5S64 = LLT::vector(5, 64);
142 const LLT V6S64 = LLT::vector(6, 64);
143 const LLT V7S64 = LLT::vector(7, 64);
144 const LLT V8S64 = LLT::vector(8, 64);
146 std::initializer_list<LLT> AllS32Vectors =
147 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
148 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
149 std::initializer_list<LLT> AllS64Vectors =
150 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
152 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
153 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
154 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
155 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
156 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
157 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
158 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
160 const LLT CodePtr = FlatPtr;
162 const std::initializer_list<LLT> AddrSpaces64 = {
163 GlobalPtr, ConstantPtr, FlatPtr
166 const std::initializer_list<LLT> AddrSpaces32 = {
167 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
170 const std::initializer_list<LLT> FPTypesBase = {
171 S32, S64
174 const std::initializer_list<LLT> FPTypes16 = {
175 S32, S64, S16
178 const std::initializer_list<LLT> FPTypesPK16 = {
179 S32, S64, S16, V2S16
182 setAction({G_BRCOND, S1}, Legal);
184 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
185 // elements for v3s16
186 getActionDefinitionsBuilder(G_PHI)
187 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
188 .legalFor(AllS32Vectors)
189 .legalFor(AllS64Vectors)
190 .legalFor(AddrSpaces64)
191 .legalFor(AddrSpaces32)
192 .clampScalar(0, S32, S256)
193 .widenScalarToNextPow2(0, 32)
194 .clampMaxNumElements(0, S32, 16)
195 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
196 .legalIf(isPointer(0));
198 if (ST.has16BitInsts()) {
199 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
200 .legalFor({S32, S16})
201 .clampScalar(0, S16, S32)
202 .scalarize(0);
203 } else {
204 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
205 .legalFor({S32})
206 .clampScalar(0, S32, S32)
207 .scalarize(0);
210 getActionDefinitionsBuilder({G_UMULH, G_SMULH})
211 .legalFor({S32})
212 .clampScalar(0, S32, S32)
213 .scalarize(0);
215 // Report legal for any types we can handle anywhere. For the cases only legal
216 // on the SALU, RegBankSelect will be able to re-legalize.
217 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
218 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
219 .clampScalar(0, S32, S64)
220 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
221 .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
222 .widenScalarToNextPow2(0)
223 .scalarize(0);
225 getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
226 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
227 .legalFor({{S32, S1}})
228 .clampScalar(0, S32, S32);
230 getActionDefinitionsBuilder(G_BITCAST)
231 .legalForCartesianProduct({S32, V2S16})
232 .legalForCartesianProduct({S64, V2S32, V4S16})
233 .legalForCartesianProduct({V2S64, V4S32})
234 // Don't worry about the size constraint.
235 .legalIf(all(isPointer(0), isPointer(1)));
237 if (ST.has16BitInsts()) {
238 getActionDefinitionsBuilder(G_FCONSTANT)
239 .legalFor({S32, S64, S16})
240 .clampScalar(0, S16, S64);
241 } else {
242 getActionDefinitionsBuilder(G_FCONSTANT)
243 .legalFor({S32, S64})
244 .clampScalar(0, S32, S64);
247 getActionDefinitionsBuilder(G_IMPLICIT_DEF)
248 .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
249 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
250 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
251 .clampScalarOrElt(0, S32, S512)
252 .legalIf(isMultiple32(0))
253 .widenScalarToNextPow2(0, 32)
254 .clampMaxNumElements(0, S32, 16);
257 // FIXME: i1 operands to intrinsics should always be legal, but other i1
258 // values may not be legal. We need to figure out how to distinguish
259 // between these two scenarios.
260 getActionDefinitionsBuilder(G_CONSTANT)
261 .legalFor({S1, S32, S64, GlobalPtr,
262 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
263 .clampScalar(0, S32, S64)
264 .widenScalarToNextPow2(0)
265 .legalIf(isPointer(0));
267 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
269 auto &FPOpActions = getActionDefinitionsBuilder(
270 { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
271 .legalFor({S32, S64});
273 if (ST.has16BitInsts()) {
274 if (ST.hasVOP3PInsts())
275 FPOpActions.legalFor({S16, V2S16});
276 else
277 FPOpActions.legalFor({S16});
280 auto &MinNumMaxNum = getActionDefinitionsBuilder({
281 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
283 if (ST.hasVOP3PInsts()) {
284 MinNumMaxNum.customFor(FPTypesPK16)
285 .clampMaxNumElements(0, S16, 2)
286 .clampScalar(0, S16, S64)
287 .scalarize(0);
288 } else if (ST.has16BitInsts()) {
289 MinNumMaxNum.customFor(FPTypes16)
290 .clampScalar(0, S16, S64)
291 .scalarize(0);
292 } else {
293 MinNumMaxNum.customFor(FPTypesBase)
294 .clampScalar(0, S32, S64)
295 .scalarize(0);
298 // TODO: Implement
299 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
301 if (ST.hasVOP3PInsts())
302 FPOpActions.clampMaxNumElements(0, S16, 2);
303 FPOpActions
304 .scalarize(0)
305 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
307 if (ST.has16BitInsts()) {
308 getActionDefinitionsBuilder(G_FSQRT)
309 .legalFor({S32, S64, S16})
310 .scalarize(0)
311 .clampScalar(0, S16, S64);
312 } else {
313 getActionDefinitionsBuilder(G_FSQRT)
314 .legalFor({S32, S64})
315 .scalarize(0)
316 .clampScalar(0, S32, S64);
319 getActionDefinitionsBuilder(G_FPTRUNC)
320 .legalFor({{S32, S64}, {S16, S32}})
321 .scalarize(0);
323 getActionDefinitionsBuilder(G_FPEXT)
324 .legalFor({{S64, S32}, {S32, S16}})
325 .lowerFor({{S64, S16}}) // FIXME: Implement
326 .scalarize(0);
328 // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
329 getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
331 getActionDefinitionsBuilder(G_FSUB)
332 // Use actual fsub instruction
333 .legalFor({S32})
334 // Must use fadd + fneg
335 .lowerFor({S64, S16, V2S16})
336 .scalarize(0)
337 .clampScalar(0, S32, S64);
339 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
340 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
341 {S32, S1}, {S64, S1}, {S16, S1},
342 // FIXME: Hack
343 {S64, LLT::scalar(33)},
344 {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
345 .scalarize(0);
347 getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
348 .legalFor({{S32, S32}, {S64, S32}})
349 .lowerFor({{S32, S64}})
350 .customFor({{S64, S64}})
351 .scalarize(0);
353 getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
354 .legalFor({{S32, S32}, {S32, S64}})
355 .scalarize(0);
357 getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
358 .legalFor({S32, S64})
359 .scalarize(0);
361 if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
362 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
363 .legalFor({S32, S64})
364 .clampScalar(0, S32, S64)
365 .scalarize(0);
366 } else {
367 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
368 .legalFor({S32})
369 .customFor({S64})
370 .clampScalar(0, S32, S64)
371 .scalarize(0);
374 getActionDefinitionsBuilder(G_GEP)
375 .legalForCartesianProduct(AddrSpaces64, {S64})
376 .legalForCartesianProduct(AddrSpaces32, {S32})
377 .scalarize(0);
379 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
381 auto &CmpBuilder =
382 getActionDefinitionsBuilder(G_ICMP)
383 .legalForCartesianProduct(
384 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
385 .legalFor({{S1, S32}, {S1, S64}});
386 if (ST.has16BitInsts()) {
387 CmpBuilder.legalFor({{S1, S16}});
390 CmpBuilder
391 .widenScalarToNextPow2(1)
392 .clampScalar(1, S32, S64)
393 .scalarize(0)
394 .legalIf(all(typeIs(0, S1), isPointer(1)));
396 getActionDefinitionsBuilder(G_FCMP)
397 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
398 .widenScalarToNextPow2(1)
399 .clampScalar(1, S32, S64)
400 .scalarize(0);
402 // FIXME: fexp, flog2, flog10 need to be custom lowered.
403 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
404 G_FLOG, G_FLOG2, G_FLOG10})
405 .legalFor({S32})
406 .scalarize(0);
408 // The 64-bit versions produce 32-bit results, but only on the SALU.
409 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
410 G_CTTZ, G_CTTZ_ZERO_UNDEF,
411 G_CTPOP})
412 .legalFor({{S32, S32}, {S32, S64}})
413 .clampScalar(0, S32, S32)
414 .clampScalar(1, S32, S64)
415 .scalarize(0)
416 .widenScalarToNextPow2(0, 32)
417 .widenScalarToNextPow2(1, 32);
419 // TODO: Expand for > s32
420 getActionDefinitionsBuilder(G_BSWAP)
421 .legalFor({S32})
422 .clampScalar(0, S32, S32)
423 .scalarize(0);
425 if (ST.has16BitInsts()) {
426 if (ST.hasVOP3PInsts()) {
427 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
428 .legalFor({S32, S16, V2S16})
429 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
430 .clampMaxNumElements(0, S16, 2)
431 .clampScalar(0, S16, S32)
432 .widenScalarToNextPow2(0)
433 .scalarize(0);
434 } else {
435 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
436 .legalFor({S32, S16})
437 .widenScalarToNextPow2(0)
438 .clampScalar(0, S16, S32)
439 .scalarize(0);
441 } else {
442 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
443 .legalFor({S32})
444 .clampScalar(0, S32, S32)
445 .widenScalarToNextPow2(0)
446 .scalarize(0);
449 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
450 return [=](const LegalityQuery &Query) {
451 return Query.Types[TypeIdx0].getSizeInBits() <
452 Query.Types[TypeIdx1].getSizeInBits();
456 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
457 return [=](const LegalityQuery &Query) {
458 return Query.Types[TypeIdx0].getSizeInBits() >
459 Query.Types[TypeIdx1].getSizeInBits();
463 getActionDefinitionsBuilder(G_INTTOPTR)
464 // List the common cases
465 .legalForCartesianProduct(AddrSpaces64, {S64})
466 .legalForCartesianProduct(AddrSpaces32, {S32})
467 .scalarize(0)
468 // Accept any address space as long as the size matches
469 .legalIf(sameSize(0, 1))
470 .widenScalarIf(smallerThan(1, 0),
471 [](const LegalityQuery &Query) {
472 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
474 .narrowScalarIf(greaterThan(1, 0),
475 [](const LegalityQuery &Query) {
476 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
479 getActionDefinitionsBuilder(G_PTRTOINT)
480 // List the common cases
481 .legalForCartesianProduct(AddrSpaces64, {S64})
482 .legalForCartesianProduct(AddrSpaces32, {S32})
483 .scalarize(0)
484 // Accept any address space as long as the size matches
485 .legalIf(sameSize(0, 1))
486 .widenScalarIf(smallerThan(0, 1),
487 [](const LegalityQuery &Query) {
488 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
490 .narrowScalarIf(
491 greaterThan(0, 1),
492 [](const LegalityQuery &Query) {
493 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
496 if (ST.hasFlatAddressSpace()) {
497 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
498 .scalarize(0)
499 .custom();
502 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
503 // handle some operations by just promoting the register during
504 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
505 getActionDefinitionsBuilder({G_LOAD, G_STORE})
506 .narrowScalarIf([](const LegalityQuery &Query) {
507 unsigned Size = Query.Types[0].getSizeInBits();
508 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
509 return (Size > 32 && MemSize < Size);
511 [](const LegalityQuery &Query) {
512 return std::make_pair(0, LLT::scalar(32));
514 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
515 .fewerElementsIf([=](const LegalityQuery &Query) {
516 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
517 return (MemSize == 96) &&
518 Query.Types[0].isVector() &&
519 !ST.hasDwordx3LoadStores();
521 [=](const LegalityQuery &Query) {
522 return std::make_pair(0, V2S32);
524 .legalIf([=](const LegalityQuery &Query) {
525 const LLT &Ty0 = Query.Types[0];
527 unsigned Size = Ty0.getSizeInBits();
528 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
529 if (Size < 32 || (Size > 32 && MemSize < Size))
530 return false;
532 if (Ty0.isVector() && Size != MemSize)
533 return false;
535 // TODO: Decompose private loads into 4-byte components.
536 // TODO: Illegal flat loads on SI
537 switch (MemSize) {
538 case 8:
539 case 16:
540 return Size == 32;
541 case 32:
542 case 64:
543 case 128:
544 return true;
546 case 96:
547 return ST.hasDwordx3LoadStores();
549 case 256:
550 case 512:
551 // TODO: Possibly support loads of i256 and i512. This will require
552 // adding i256 and i512 types to MVT in order to be able to use
553 // TableGen.
554 // TODO: Add support for other vector types, this will require
555 // defining more value mappings for the new types.
556 return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
557 Ty0.getScalarType().getSizeInBits() == 64);
559 default:
560 return false;
563 .clampScalar(0, S32, S64);
566 // FIXME: Handle alignment requirements.
567 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
568 .legalForTypesWithMemDesc({
569 {S32, GlobalPtr, 8, 8},
570 {S32, GlobalPtr, 16, 8},
571 {S32, LocalPtr, 8, 8},
572 {S32, LocalPtr, 16, 8},
573 {S32, PrivatePtr, 8, 8},
574 {S32, PrivatePtr, 16, 8}});
575 if (ST.hasFlatAddressSpace()) {
576 ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
577 {S32, FlatPtr, 16, 8}});
580 ExtLoads.clampScalar(0, S32, S32)
581 .widenScalarToNextPow2(0)
582 .unsupportedIfMemSizeNotPow2()
583 .lower();
585 auto &Atomics = getActionDefinitionsBuilder(
586 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
587 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
588 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
589 G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
590 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
591 {S64, GlobalPtr}, {S64, LocalPtr}});
592 if (ST.hasFlatAddressSpace()) {
593 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
596 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
597 .legalFor({{S32, LocalPtr}});
599 // TODO: Pointer types, any 32-bit or 64-bit vector
600 getActionDefinitionsBuilder(G_SELECT)
601 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
602 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
603 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
604 .clampScalar(0, S16, S64)
605 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
606 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
607 .scalarize(1)
608 .clampMaxNumElements(0, S32, 2)
609 .clampMaxNumElements(0, LocalPtr, 2)
610 .clampMaxNumElements(0, PrivatePtr, 2)
611 .scalarize(0)
612 .widenScalarToNextPow2(0)
613 .legalIf(all(isPointer(0), typeIs(1, S1)));
615 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
616 // be more flexible with the shift amount type.
617 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
618 .legalFor({{S32, S32}, {S64, S32}});
619 if (ST.has16BitInsts()) {
620 if (ST.hasVOP3PInsts()) {
621 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
622 .clampMaxNumElements(0, S16, 2);
623 } else
624 Shifts.legalFor({{S16, S32}, {S16, S16}});
626 Shifts.clampScalar(1, S16, S32);
627 Shifts.clampScalar(0, S16, S64);
628 Shifts.widenScalarToNextPow2(0, 16);
629 } else {
630 // Make sure we legalize the shift amount type first, as the general
631 // expansion for the shifted type will produce much worse code if it hasn't
632 // been truncated already.
633 Shifts.clampScalar(1, S32, S32);
634 Shifts.clampScalar(0, S32, S64);
635 Shifts.widenScalarToNextPow2(0, 32);
637 Shifts.scalarize(0);
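// Element insert/extract is custom-legalized when the element type is 16 bits
// or a multiple of 32 bits, the vector fits in 512 bits, and the index is a
// 32-bit scalar.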
639 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
640 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
641 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
642 unsigned IdxTypeIdx = 2;
644 getActionDefinitionsBuilder(Op)
645 .customIf([=](const LegalityQuery &Query) {
646 const LLT EltTy = Query.Types[EltTypeIdx];
647 const LLT VecTy = Query.Types[VecTypeIdx];
648 const LLT IdxTy = Query.Types[IdxTypeIdx];
649 return (EltTy.getSizeInBits() == 16 ||
650 EltTy.getSizeInBits() % 32 == 0) &&
651 VecTy.getSizeInBits() % 32 == 0 &&
652 VecTy.getSizeInBits() <= 512 &&
653 IdxTy.getSizeInBits() == 32;
655 .clampScalar(EltTypeIdx, S32, S64)
656 .clampScalar(VecTypeIdx, S32, S64)
657 .clampScalar(IdxTypeIdx, S32, S32);
660 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
661 .unsupportedIf([=](const LegalityQuery &Query) {
662 const LLT &EltTy = Query.Types[1].getElementType();
663 return Query.Types[0] != EltTy;
666 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
667 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
668 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
670 // FIXME: Doesn't handle extract of illegal sizes.
671 getActionDefinitionsBuilder(Op)
672 .legalIf([=](const LegalityQuery &Query) {
673 const LLT BigTy = Query.Types[BigTyIdx];
674 const LLT LitTy = Query.Types[LitTyIdx];
675 return (BigTy.getSizeInBits() % 32 == 0) &&
676 (LitTy.getSizeInBits() % 16 == 0);
678 .widenScalarIf(
679 [=](const LegalityQuery &Query) {
680 const LLT BigTy = Query.Types[BigTyIdx];
681 return (BigTy.getScalarSizeInBits() < 16);
683 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
684 .widenScalarIf(
685 [=](const LegalityQuery &Query) {
686 const LLT LitTy = Query.Types[LitTyIdx];
687 return (LitTy.getScalarSizeInBits() < 16);
689 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
690 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
691 .widenScalarToNextPow2(BigTyIdx, 32);
695 getActionDefinitionsBuilder(G_BUILD_VECTOR)
696 .legalForCartesianProduct(AllS32Vectors, {S32})
697 .legalForCartesianProduct(AllS64Vectors, {S64})
698 .clampNumElements(0, V16S32, V16S32)
699 .clampNumElements(0, V2S64, V8S64)
700 .minScalarSameAs(1, 0)
701 .legalIf(isRegisterType(0))
702 .minScalarOrElt(0, S32);
704 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
705 .legalIf(isRegisterType(0));
707 // Merge/Unmerge
708 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
709 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
710 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
712 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
713 const LLT &Ty = Query.Types[TypeIdx];
714 if (Ty.isVector()) {
715 const LLT &EltTy = Ty.getElementType();
716 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
717 return true;
718 if (!isPowerOf2_32(EltTy.getSizeInBits()))
719 return true;
721 return false;
724 getActionDefinitionsBuilder(Op)
725 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
726 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
727 // worth considering the multiples of 64 since 2*192 and 2*384 are not
728 // valid.
729 .clampScalar(LitTyIdx, S16, S256)
730 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
732 // Break up vectors with weird elements into scalars
733 .fewerElementsIf(
734 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
735 scalarize(0))
736 .fewerElementsIf(
737 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
738 scalarize(1))
739 .clampScalar(BigTyIdx, S32, S512)
740 .lowerFor({{S16, V2S16}})
741 .widenScalarIf(
742 [=](const LegalityQuery &Query) {
743 const LLT &Ty = Query.Types[BigTyIdx];
744 return !isPowerOf2_32(Ty.getSizeInBits()) &&
745 Ty.getSizeInBits() % 16 != 0;
747 [=](const LegalityQuery &Query) {
748 // Pick the next power of 2, or a multiple of 64 over 128,
749 // whichever is smaller.
750 const LLT &Ty = Query.Types[BigTyIdx];
751 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
752 if (NewSizeInBits >= 256) {
753 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
754 if (RoundedTo < NewSizeInBits)
755 NewSizeInBits = RoundedTo;
757 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
759 .legalIf([=](const LegalityQuery &Query) {
760 const LLT &BigTy = Query.Types[BigTyIdx];
761 const LLT &LitTy = Query.Types[LitTyIdx];
763 if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
764 return false;
765 if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
766 return false;
768 return BigTy.getSizeInBits() % 16 == 0 &&
769 LitTy.getSizeInBits() % 16 == 0 &&
770 BigTy.getSizeInBits() <= 512;
772 // Any vectors left are the wrong size. Scalarize them.
773 .scalarize(0)
774 .scalarize(1);
777 getActionDefinitionsBuilder(G_SEXT_INREG).lower();
779 computeTables();
780 verify(*ST.getInstrInfo());
783 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
784 MachineRegisterInfo &MRI,
785 MachineIRBuilder &MIRBuilder,
786 GISelChangeObserver &Observer) const {
787 switch (MI.getOpcode()) {
788 case TargetOpcode::G_ADDRSPACE_CAST:
789 return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
790 case TargetOpcode::G_FRINT:
791 return legalizeFrint(MI, MRI, MIRBuilder);
792 case TargetOpcode::G_FCEIL:
793 return legalizeFceil(MI, MRI, MIRBuilder);
794 case TargetOpcode::G_INTRINSIC_TRUNC:
795 return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
796 case TargetOpcode::G_SITOFP:
797 return legalizeITOFP(MI, MRI, MIRBuilder, true);
798 case TargetOpcode::G_UITOFP:
799 return legalizeITOFP(MI, MRI, MIRBuilder, false);
800 case TargetOpcode::G_FMINNUM:
801 case TargetOpcode::G_FMAXNUM:
802 case TargetOpcode::G_FMINNUM_IEEE:
803 case TargetOpcode::G_FMAXNUM_IEEE:
804 return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
805 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
806 return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
807 case TargetOpcode::G_INSERT_VECTOR_ELT:
808 return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
809 default:
810 return false;
813 llvm_unreachable("expected switch to return");
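// Return a 32-bit register holding the aperture (the high half of the flat
// address range) for the LDS or private segment: read it from the hardware
// aperture registers when available, otherwise load it from the queue pointer.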
816 Register AMDGPULegalizerInfo::getSegmentAperture(
817 unsigned AS,
818 MachineRegisterInfo &MRI,
819 MachineIRBuilder &MIRBuilder) const {
820 MachineFunction &MF = MIRBuilder.getMF();
821 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
822 const LLT S32 = LLT::scalar(32);
824 if (ST.hasApertureRegs()) {
825 // FIXME: Use inline constants (src_{shared, private}_base) instead of
826 // getreg.
827 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
828 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
829 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
830 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
831 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
832 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
833 unsigned Encoding =
834 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
835 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
836 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
838 Register ApertureReg = MRI.createGenericVirtualRegister(S32);
839 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
841 MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
842 .addDef(GetReg)
843 .addImm(Encoding);
844 MRI.setType(GetReg, S32);
846 auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
847 MIRBuilder.buildInstr(TargetOpcode::G_SHL)
848 .addDef(ApertureReg)
849 .addUse(GetReg)
850 .addUse(ShiftAmt.getReg(0));
852 return ApertureReg;
855 Register QueuePtr = MRI.createGenericVirtualRegister(
856 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
858 // FIXME: Placeholder until we can track the input registers.
859 MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);
861 // Offset into amd_queue_t for group_segment_aperture_base_hi /
862 // private_segment_aperture_base_hi.
863 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
865 // FIXME: Don't use undef
866 Value *V = UndefValue::get(PointerType::get(
867 Type::getInt8Ty(MF.getFunction().getContext()),
868 AMDGPUAS::CONSTANT_ADDRESS));
870 MachinePointerInfo PtrInfo(V, StructOffset);
871 MachineMemOperand *MMO = MF.getMachineMemOperand(
872 PtrInfo,
873 MachineMemOperand::MOLoad |
874 MachineMemOperand::MODereferenceable |
875 MachineMemOperand::MOInvariant,
877 MinAlign(64, StructOffset));
879 Register LoadResult = MRI.createGenericVirtualRegister(S32);
880 Register LoadAddr;
882 MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
883 MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
884 return LoadResult;
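// Custom lowering for G_ADDRSPACE_CAST. No-op casts become G_BITCAST. Casting
// flat to LDS/private keeps the low 32 bits, and casting LDS/private to flat
// merges the 32-bit offset with the segment aperture; both directions guard
// against the null pointer. Roughly, for local/private -> flat:
//   %lo = G_PTRTOINT %src                          ; 32-bit offset
//   %flat = G_MERGE_VALUES %lo, %aperture
//   %dst = G_SELECT (%src != segment-null), %flat, flat-null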
887 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
888 MachineInstr &MI, MachineRegisterInfo &MRI,
889 MachineIRBuilder &MIRBuilder) const {
890 MachineFunction &MF = MIRBuilder.getMF();
892 MIRBuilder.setInstr(MI);
894 Register Dst = MI.getOperand(0).getReg();
895 Register Src = MI.getOperand(1).getReg();
897 LLT DstTy = MRI.getType(Dst);
898 LLT SrcTy = MRI.getType(Src);
899 unsigned DestAS = DstTy.getAddressSpace();
900 unsigned SrcAS = SrcTy.getAddressSpace();
902 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
903 // vector element.
904 assert(!DstTy.isVector());
906 const AMDGPUTargetMachine &TM
907 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
909 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
910 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
911 MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
912 return true;
915 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
916 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
917 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
918 unsigned NullVal = TM.getNullPointerValue(DestAS);
920 auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
921 auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);
923 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
925 // Extract low 32-bits of the pointer.
926 MIRBuilder.buildExtract(PtrLo32, Src, 0);
928 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
929 MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
930 MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
932 MI.eraseFromParent();
933 return true;
936 assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
937 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
939 auto SegmentNull =
940 MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
941 auto FlatNull =
942 MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
944 Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
946 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
947 MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
949 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
951 // Coerce the type of the low half of the result so we can use merge_values.
952 Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
953 MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
954 .addDef(SrcAsInt)
955 .addUse(Src);
957 // TODO: Should we allow mismatched types but matching sizes in merges to
958 // avoid the ptrtoint?
959 MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
960 MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
962 MI.eraseFromParent();
963 return true;
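// Custom f64 G_FRINT expansion: adding and then subtracting 2^52 (copysigned
// from the source) lets the FP hardware round to nearest; values already too
// large to have a fractional part are passed through unchanged.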
966 bool AMDGPULegalizerInfo::legalizeFrint(
967 MachineInstr &MI, MachineRegisterInfo &MRI,
968 MachineIRBuilder &MIRBuilder) const {
969 MIRBuilder.setInstr(MI);
971 Register Src = MI.getOperand(1).getReg();
972 LLT Ty = MRI.getType(Src);
973 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
975 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
976 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
978 auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
979 auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);
981 // TODO: Should this propagate fast-math-flags?
982 auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
983 auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);
985 auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
986 auto Fabs = MIRBuilder.buildFAbs(Ty, Src);
988 auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
989 MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
990 return true;
993 bool AMDGPULegalizerInfo::legalizeFceil(
994 MachineInstr &MI, MachineRegisterInfo &MRI,
995 MachineIRBuilder &B) const {
996 B.setInstr(MI);
998 const LLT S1 = LLT::scalar(1);
999 const LLT S64 = LLT::scalar(64);
1001 Register Src = MI.getOperand(1).getReg();
1002 assert(MRI.getType(Src) == S64);
1004 // result = trunc(src)
1005 // if (src > 0.0 && src != result)
1006 // result += 1.0
1008 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1010 const auto Zero = B.buildFConstant(S64, 0.0);
1011 const auto One = B.buildFConstant(S64, 1.0);
1012 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1013 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1014 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1015 auto Add = B.buildSelect(S64, And, One, Zero);
1017 // TODO: Should this propagate fast-math-flags?
1018 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1019 return true;
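// Extract the 11-bit biased exponent from the high word of an f64 and
// subtract the bias (1023).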
1022 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1023 MachineIRBuilder &B) {
1024 const unsigned FractBits = 52;
1025 const unsigned ExpBits = 11;
1026 LLT S32 = LLT::scalar(32);
1028 auto Const0 = B.buildConstant(S32, FractBits - 32);
1029 auto Const1 = B.buildConstant(S32, ExpBits);
1031 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1032 .addUse(Const0.getReg(0))
1033 .addUse(Const1.getReg(0));
1035 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
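// Custom f64 G_INTRINSIC_TRUNC expansion: clear the fraction bits that sit
// below the binary point. A negative exponent leaves only the sign bit; an
// exponent greater than 51 means the value is already an integer.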
1038 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1039 MachineInstr &MI, MachineRegisterInfo &MRI,
1040 MachineIRBuilder &B) const {
1041 B.setInstr(MI);
1043 const LLT S1 = LLT::scalar(1);
1044 const LLT S32 = LLT::scalar(32);
1045 const LLT S64 = LLT::scalar(64);
1047 Register Src = MI.getOperand(1).getReg();
1048 assert(MRI.getType(Src) == S64);
1050 // TODO: Should this use extract since the low half is unused?
1051 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1052 Register Hi = Unmerge.getReg(1);
1054 // Extract the upper half, since this is where we will find the sign and
1055 // exponent.
1056 auto Exp = extractF64Exponent(Hi, B);
1058 const unsigned FractBits = 52;
1060 // Extract the sign bit.
1061 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1062 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1064 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1066 const auto Zero32 = B.buildConstant(S32, 0);
1068 // Extend back to 64-bits.
1069 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1071 auto Shr = B.buildAShr(S64, FractMask, Exp);
1072 auto Not = B.buildNot(S64, Shr);
1073 auto Tmp0 = B.buildAnd(S64, Src, Not);
1074 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1076 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1077 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1079 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1080 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1081 return true;
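// Convert a 64-bit integer to f64 by converting each 32-bit half separately:
// the high half (signed or unsigned) is scaled by 2^32 with ldexp and added
// to the always-unsigned low half.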
1084 bool AMDGPULegalizerInfo::legalizeITOFP(
1085 MachineInstr &MI, MachineRegisterInfo &MRI,
1086 MachineIRBuilder &B, bool Signed) const {
1087 B.setInstr(MI);
1089 Register Dst = MI.getOperand(0).getReg();
1090 Register Src = MI.getOperand(1).getReg();
1092 const LLT S64 = LLT::scalar(64);
1093 const LLT S32 = LLT::scalar(32);
1095 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1097 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1099 auto CvtHi = Signed ?
1100 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1101 B.buildUITOFP(S64, Unmerge.getReg(1));
1103 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1105 auto ThirtyTwo = B.buildConstant(S32, 32);
1106 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1107 .addUse(CvtHi.getReg(0))
1108 .addUse(ThirtyTwo.getReg(0));
1110 // TODO: Should this propagate fast-math-flags?
1111 B.buildFAdd(Dst, LdExp, CvtLo);
1112 MI.eraseFromParent();
1113 return true;
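// With ieee_mode enabled the IEEE variants map directly onto the instructions
// and the non-IEEE forms are expanded; with ieee_mode disabled the non-IEEE
// forms are already correct and the IEEE variants are rejected.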
1116 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1117 MachineInstr &MI, MachineRegisterInfo &MRI,
1118 MachineIRBuilder &B) const {
1119 MachineFunction &MF = B.getMF();
1120 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1122 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1123 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1125 // With ieee_mode disabled, the instructions have the correct behavior
1126 // already for G_FMINNUM/G_FMAXNUM
1127 if (!MFI->getMode().IEEE)
1128 return !IsIEEEOp;
1130 if (IsIEEEOp)
1131 return true;
1133 MachineIRBuilder HelperBuilder(MI);
1134 GISelObserverWrapper DummyObserver;
1135 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1136 HelperBuilder.setMBB(*MI.getParent());
1137 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
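// Fold an extract with a constant index into a static G_EXTRACT (or undef
// when the index is out of bounds); dynamic indices are left for register
// indexing during selection.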
1140 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1141 MachineInstr &MI, MachineRegisterInfo &MRI,
1142 MachineIRBuilder &B) const {
1143 // TODO: Should move some of this into LegalizerHelper.
1145 // TODO: Promote dynamic indexing of s16 to s32
1146 // TODO: Dynamic s64 indexing is only legal for SGPR.
1147 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1148 if (!IdxVal) // Dynamic case will be selected to register indexing.
1149 return true;
1151 Register Dst = MI.getOperand(0).getReg();
1152 Register Vec = MI.getOperand(1).getReg();
1154 LLT VecTy = MRI.getType(Vec);
1155 LLT EltTy = VecTy.getElementType();
1156 assert(EltTy == MRI.getType(Dst));
1158 B.setInstr(MI);
1160 if (IdxVal.getValue() < VecTy.getNumElements())
1161 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1162 else
1163 B.buildUndef(Dst);
1165 MI.eraseFromParent();
1166 return true;
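// Same idea as the extract case above: a constant index becomes a static
// G_INSERT, an out-of-bounds index becomes undef, and dynamic indices are
// deferred to selection.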
1169 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1170 MachineInstr &MI, MachineRegisterInfo &MRI,
1171 MachineIRBuilder &B) const {
1172 // TODO: Should move some of this into LegalizerHelper.
1174 // TODO: Promote dynamic indexing of s16 to s32
1175 // TODO: Dynamic s64 indexing is only legal for SGPR.
1176 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1177 if (!IdxVal) // Dynamic case will be selected to register indexing.
1178 return true;
1180 Register Dst = MI.getOperand(0).getReg();
1181 Register Vec = MI.getOperand(1).getReg();
1182 Register Ins = MI.getOperand(2).getReg();
1184 LLT VecTy = MRI.getType(Vec);
1185 LLT EltTy = VecTy.getElementType();
1186 assert(EltTy == MRI.getType(Ins));
1188 B.setInstr(MI);
1190 if (IdxVal.getValue() < VecTy.getNumElements())
1191 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1192 else
1193 B.buildUndef(Dst);
1195 MI.eraseFromParent();
1196 return true;
1199 // Return the use branch instruction, or null if the usage is invalid.
1200 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1201 MachineRegisterInfo &MRI) {
1202 Register CondDef = MI.getOperand(0).getReg();
1203 if (!MRI.hasOneNonDBGUse(CondDef))
1204 return nullptr;
1206 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1207 return UseMI.getParent() == MI.getParent() &&
1208 UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
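// Return the virtual register tracking the given physical live-in, creating a
// new one of type Ty if the live-in has not been registered yet.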
1211 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1212 Register Reg, LLT Ty) const {
1213 Register LiveIn = MRI.getLiveInVirtReg(Reg);
1214 if (LiveIn)
1215 return LiveIn;
1217 Register NewReg = MRI.createGenericVirtualRegister(Ty);
1218 MRI.addLiveIn(Reg, NewReg);
1219 return NewReg;
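// Copy a preloaded argument register into DstReg. Masked arguments are
// shifted and masked out of the register that contains them.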
1222 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1223 const ArgDescriptor *Arg) const {
1224 if (!Arg->isRegister())
1225 return false; // TODO: Handle these
1227 assert(Arg->getRegister() != 0);
1228 assert(Arg->getRegister().isPhysical());
1230 MachineRegisterInfo &MRI = *B.getMRI();
1232 LLT Ty = MRI.getType(DstReg);
1233 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1235 if (Arg->isMasked()) {
1236 // TODO: Should we try to emit this once in the entry block?
1237 const LLT S32 = LLT::scalar(32);
1238 const unsigned Mask = Arg->getMask();
1239 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1241 auto ShiftAmt = B.buildConstant(S32, Shift);
1242 auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
1243 B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
1244 } else
1245 B.buildCopy(DstReg, LiveIn);
1247 // Insert the argument copy if it doesn't already exist.
1248 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1249 if (!MRI.getVRegDef(LiveIn)) {
1250 MachineBasicBlock &EntryMBB = B.getMF().front();
1251 EntryMBB.addLiveIn(Arg->getRegister());
1252 B.setInsertPt(EntryMBB, EntryMBB.begin());
1253 B.buildCopy(LiveIn, Arg->getRegister());
1256 return true;
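// Lower an intrinsic that reads a preloaded value by copying the
// corresponding live-in argument register into the result.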
1259 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1260 MachineInstr &MI,
1261 MachineRegisterInfo &MRI,
1262 MachineIRBuilder &B,
1263 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1264 B.setInstr(MI);
1266 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1268 const ArgDescriptor *Arg;
1269 const TargetRegisterClass *RC;
1270 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1271 if (!Arg) {
1272 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1273 return false;
1276 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1277 MI.eraseFromParent();
1278 return true;
1281 return false;
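// Expand the amdgcn.fdiv.fast intrinsic: pre-scale the denominator when its
// magnitude is large, take the hardware reciprocal, multiply by the numerator,
// and apply the same scale to the result.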
1284 bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
1285 MachineRegisterInfo &MRI,
1286 MachineIRBuilder &B) const {
1287 B.setInstr(MI);
1288 Register Res = MI.getOperand(0).getReg();
1289 Register LHS = MI.getOperand(2).getReg();
1290 Register RHS = MI.getOperand(3).getReg();
1291 uint16_t Flags = MI.getFlags();
1293 LLT S32 = LLT::scalar(32);
1294 LLT S1 = LLT::scalar(1);
1296 auto Abs = B.buildFAbs(S32, RHS, Flags);
1297 const APFloat C0Val(1.0f);
1299 auto C0 = B.buildConstant(S32, 0x6f800000);
1300 auto C1 = B.buildConstant(S32, 0x2f800000);
1301 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1303 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1304 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1306 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1308 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1309 .addUse(Mul0.getReg(0))
1310 .setMIFlags(Flags);
1312 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1314 B.buildFMul(Res, Sel, Mul1, Flags);
1316 MI.eraseFromParent();
1317 return true;
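// For entry functions the implicit argument pointer is the kernarg segment
// pointer plus the implicit-argument offset; for other functions it is taken
// from the preloaded IMPLICIT_ARG_PTR argument.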
1320 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1321 MachineRegisterInfo &MRI,
1322 MachineIRBuilder &B) const {
1323 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1324 if (!MFI->isEntryFunction()) {
1325 return legalizePreloadedArgIntrin(MI, MRI, B,
1326 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1329 B.setInstr(MI);
1331 uint64_t Offset =
1332 ST.getTargetLowering()->getImplicitParameterOffset(
1333 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1334 Register DstReg = MI.getOperand(0).getReg();
1335 LLT DstTy = MRI.getType(DstReg);
1336 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1338 const ArgDescriptor *Arg;
1339 const TargetRegisterClass *RC;
1340 std::tie(Arg, RC)
1341 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1342 if (!Arg)
1343 return false;
1345 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1346 if (!loadInputValue(KernargPtrReg, B, Arg))
1347 return false;
1349 B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1350 MI.eraseFromParent();
1351 return true;
1354 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1355 MachineRegisterInfo &MRI,
1356 MachineIRBuilder &B) const {
1357 // Replace the G_BRCOND that uses the condition with the exec-manipulating branch pseudos.
1358 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1359 case Intrinsic::amdgcn_if: {
1360 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1361 const SIRegisterInfo *TRI
1362 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1364 B.setInstr(*BrCond);
1365 Register Def = MI.getOperand(1).getReg();
1366 Register Use = MI.getOperand(3).getReg();
1367 B.buildInstr(AMDGPU::SI_IF)
1368 .addDef(Def)
1369 .addUse(Use)
1370 .addMBB(BrCond->getOperand(1).getMBB());
1372 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1373 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1374 MI.eraseFromParent();
1375 BrCond->eraseFromParent();
1376 return true;
1379 return false;
1381 case Intrinsic::amdgcn_loop: {
1382 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1383 const SIRegisterInfo *TRI
1384 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1386 B.setInstr(*BrCond);
1387 Register Reg = MI.getOperand(2).getReg();
1388 B.buildInstr(AMDGPU::SI_LOOP)
1389 .addUse(Reg)
1390 .addMBB(BrCond->getOperand(1).getMBB());
1391 MI.eraseFromParent();
1392 BrCond->eraseFromParent();
1393 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1394 return true;
1397 return false;
1399 case Intrinsic::amdgcn_kernarg_segment_ptr:
1400 return legalizePreloadedArgIntrin(
1401 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1402 case Intrinsic::amdgcn_implicitarg_ptr:
1403 return legalizeImplicitArgPtr(MI, MRI, B);
1404 case Intrinsic::amdgcn_workitem_id_x:
1405 return legalizePreloadedArgIntrin(MI, MRI, B,
1406 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
1407 case Intrinsic::amdgcn_workitem_id_y:
1408 return legalizePreloadedArgIntrin(MI, MRI, B,
1409 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
1410 case Intrinsic::amdgcn_workitem_id_z:
1411 return legalizePreloadedArgIntrin(MI, MRI, B,
1412 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
1413 case Intrinsic::amdgcn_workgroup_id_x:
1414 return legalizePreloadedArgIntrin(MI, MRI, B,
1415 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
1416 case Intrinsic::amdgcn_workgroup_id_y:
1417 return legalizePreloadedArgIntrin(MI, MRI, B,
1418 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
1419 case Intrinsic::amdgcn_workgroup_id_z:
1420 return legalizePreloadedArgIntrin(MI, MRI, B,
1421 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
1422 case Intrinsic::amdgcn_dispatch_ptr:
1423 return legalizePreloadedArgIntrin(MI, MRI, B,
1424 AMDGPUFunctionArgInfo::DISPATCH_PTR);
1425 case Intrinsic::amdgcn_queue_ptr:
1426 return legalizePreloadedArgIntrin(MI, MRI, B,
1427 AMDGPUFunctionArgInfo::QUEUE_PTR);
1428 case Intrinsic::amdgcn_implicit_buffer_ptr:
1429 return legalizePreloadedArgIntrin(
1430 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
1431 case Intrinsic::amdgcn_dispatch_id:
1432 return legalizePreloadedArgIntrin(MI, MRI, B,
1433 AMDGPUFunctionArgInfo::DISPATCH_ID);
1434 case Intrinsic::amdgcn_fdiv_fast:
1435 return legalizeFDIVFast(MI, MRI, B);
1436 default:
1437 return true;
1440 return true;