llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

   1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 /// \file
   9 /// This file implements the targeting of the Machinelegalizer class for
  10 /// AMDGPU.
  11 /// \todo This should be generated by TableGen.
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "AMDGPULegalizerInfo.h"
  15
  16 #include "AMDGPU.h"
  17 #include "AMDGPUGlobalISelUtils.h"
  18 #include "AMDGPUInstrInfo.h"
  19 #include "AMDGPUTargetMachine.h"
  20 #include "SIMachineFunctionInfo.h"
  21 #include "Utils/AMDGPUBaseInfo.h"
  22 #include "llvm/ADT/ScopeExit.h"
  23 #include "llvm/BinaryFormat/ELF.h"
  24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
  25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
  26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
  27 #include "llvm/IR/DiagnosticInfo.h"
  28 #include "llvm/IR/IntrinsicsAMDGPU.h"
  29
  30 #define DEBUG_TYPE "amdgpu-legalinfo"
  31
  32 using namespace llvm;
  33 using namespace LegalizeActions;
  34 using namespace LegalizeMutations;
  35 using namespace LegalityPredicates;
  36 using namespace MIPatternMatch;
  37
  38 // Hack until load/store selection patterns support any tuple of legal types.
  39 static cl::opt<bool> EnableNewLegality(
  40   "amdgpu-global-isel-new-legality",
  41   cl::desc("Use GlobalISel desired legality, rather than try to use"
  42            "rules compatible with selection patterns"),
  43   cl::init(false),
  44   cl::ReallyHidden);
  45
  46 static constexpr unsigned MaxRegisterSize = 1024;
  47
  48 // Round the number of elements to the next power of two elements
  49 static LLT getPow2VectorType(LLT Ty) {
  50   unsigned NElts = Ty.getNumElements();
  51   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
  52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
  53 }
  54
  55 // Round the number of bits to the next power of two bits
  56 static LLT getPow2ScalarType(LLT Ty) {
  57   unsigned Bits = Ty.getSizeInBits();
  58   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
  59   return LLT::scalar(Pow2Bits);
  60 }
  61
  62 /// \returs true if this is an odd sized vector which should widen by adding an
  63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
  64 /// excludes s1 vectors, which should always be scalarized.
  65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  66   return [=](const LegalityQuery &Query) {
  67     const LLT Ty = Query.Types[TypeIdx];
  68     if (!Ty.isVector())
  69       return false;
  70
  71     const LLT EltTy = Ty.getElementType();
  72     const unsigned EltSize = EltTy.getSizeInBits();
  73     return Ty.getNumElements() % 2 != 0 &&
  74            EltSize > 1 && EltSize < 32 &&
  75            Ty.getSizeInBits() % 32 != 0;
  76   };
  77 }
  78
  79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  80   return [=](const LegalityQuery &Query) {
  81     const LLT Ty = Query.Types[TypeIdx];
  82     return Ty.getSizeInBits() % 32 == 0;
  83   };
  84 }
  85
  86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  87   return [=](const LegalityQuery &Query) {
  88     const LLT Ty = Query.Types[TypeIdx];
  89     const LLT EltTy = Ty.getScalarType();
  90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  91   };
  92 }
  93
  94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  95   return [=](const LegalityQuery &Query) {
  96     const LLT Ty = Query.Types[TypeIdx];
  97     const LLT EltTy = Ty.getElementType();
  98     return std::make_pair(TypeIdx,
  99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
 100   };
 101 }
 102
 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
 104   return [=](const LegalityQuery &Query) {
 105     const LLT Ty = Query.Types[TypeIdx];
 106     const LLT EltTy = Ty.getElementType();
 107     unsigned Size = Ty.getSizeInBits();
 108     unsigned Pieces = (Size + 63) / 64;
 109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
 110     return std::make_pair(
 111         TypeIdx,
 112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
 113   };
 114 }
 115
 116 // Increase the number of vector elements to reach the next multiple of 32-bit
 117 // type.
 118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
 119   return [=](const LegalityQuery &Query) {
 120     const LLT Ty = Query.Types[TypeIdx];
 121
 122     const LLT EltTy = Ty.getElementType();
 123     const int Size = Ty.getSizeInBits();
 124     const int EltSize = EltTy.getSizeInBits();
 125     const int NextMul32 = (Size + 31) / 32;
 126
 127     assert(EltSize < 32);
 128
 129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
 130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
 131   };
 132 }
 133
 134 static LLT getBitcastRegisterType(const LLT Ty) {
 135   const unsigned Size = Ty.getSizeInBits();
 136
 137   LLT CoercedTy;
 138   if (Size <= 32) {
 139     // <2 x s8> -> s16
 140     // <4 x s8> -> s32
 141     return LLT::scalar(Size);
 142   }
 143
 144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
 145 }
 146
 147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
 148   return [=](const LegalityQuery &Query) {
 149     const LLT Ty = Query.Types[TypeIdx];
 150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
 151   };
 152 }
 153
 154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
 155   return [=](const LegalityQuery &Query) {
 156     const LLT Ty = Query.Types[TypeIdx];
 157     unsigned Size = Ty.getSizeInBits();
 158     assert(Size % 32 == 0);
 159     return std::make_pair(
 160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
 161   };
 162 }
 163
 164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
 165   return [=](const LegalityQuery &Query) {
 166     const LLT QueryTy = Query.Types[TypeIdx];
 167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
 168   };
 169 }
 170
 171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
 172   return [=](const LegalityQuery &Query) {
 173     const LLT QueryTy = Query.Types[TypeIdx];
 174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
 175   };
 176 }
 177
 178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
 179   return [=](const LegalityQuery &Query) {
 180     const LLT QueryTy = Query.Types[TypeIdx];
 181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
 182   };
 183 }
 184
 185 static bool isRegisterSize(unsigned Size) {
 186   return Size % 32 == 0 && Size <= MaxRegisterSize;
 187 }
 188
 189 static bool isRegisterVectorElementType(LLT EltTy) {
 190   const int EltSize = EltTy.getSizeInBits();
 191   return EltSize == 16 || EltSize % 32 == 0;
 192 }
 193
 194 static bool isRegisterVectorType(LLT Ty) {
 195   const int EltSize = Ty.getElementType().getSizeInBits();
 196   return EltSize == 32 || EltSize == 64 ||
 197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
 198          EltSize == 128 || EltSize == 256;
 199 }
 200
 201 static bool isRegisterType(LLT Ty) {
 202   if (!isRegisterSize(Ty.getSizeInBits()))
 203     return false;
 204
 205   if (Ty.isVector())
 206     return isRegisterVectorType(Ty);
 207
 208   return true;
 209 }
 210
 211 // Any combination of 32 or 64-bit elements up the maximum register size, and
 212 // multiples of v2s16.
 213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
 214   return [=](const LegalityQuery &Query) {
 215     return isRegisterType(Query.Types[TypeIdx]);
 216   };
 217 }
 218
 219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
 220   return [=](const LegalityQuery &Query) {
 221     const LLT QueryTy = Query.Types[TypeIdx];
 222     if (!QueryTy.isVector())
 223       return false;
 224     const LLT EltTy = QueryTy.getElementType();
 225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
 226   };
 227 }
 228
 229 // If we have a truncating store or an extending load with a data size larger
 230 // than 32-bits, we need to reduce to a 32-bit type.
 231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
 232   return [=](const LegalityQuery &Query) {
 233     const LLT Ty = Query.Types[TypeIdx];
 234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
 235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
 236   };
 237 }
 238
 239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
 240 // handle some operations by just promoting the register during
 241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
 242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
 243                                     bool IsLoad) {
 244   switch (AS) {
 245   case AMDGPUAS::PRIVATE_ADDRESS:
 246     // FIXME: Private element size.
 247     return ST.enableFlatScratch() ? 128 : 32;
 248   case AMDGPUAS::LOCAL_ADDRESS:
 249     return ST.useDS128() ? 128 : 64;
 250   case AMDGPUAS::GLOBAL_ADDRESS:
 251   case AMDGPUAS::CONSTANT_ADDRESS:
 252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
 253     // Treat constant and global as identical. SMRD loads are sometimes usable for
 254     // global loads (ideally constant address space should be eliminated)
 255     // depending on the context. Legality cannot be context dependent, but
 256     // RegBankSelect can split the load as necessary depending on the pointer
 257     // register bank/uniformity and if the memory is invariant or not written in a
 258     // kernel.
 259     return IsLoad ? 512 : 128;
 260   default:
 261     // Flat addresses may contextually need to be split to 32-bit parts if they
 262     // may alias scratch depending on the subtarget.
 263     return 128;
 264   }
 265 }
 266
 267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
 268                                  const LegalityQuery &Query) {
 269   const LLT Ty = Query.Types[0];
 270
 271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
 272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
 273
 274   unsigned RegSize = Ty.getSizeInBits();
 275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
 276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
 277   unsigned AS = Query.Types[1].getAddressSpace();
 278
 279   // All of these need to be custom lowered to cast the pointer operand.
 280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
 281     return false;
 282
 283   // Do not handle extending vector loads.
 284   if (Ty.isVector() && MemSize != RegSize)
 285     return false;
 286
 287   // TODO: We should be able to widen loads if the alignment is high enough, but
 288   // we also need to modify the memory access size.
 289 #if 0
 290   // Accept widening loads based on alignment.
 291   if (IsLoad && MemSize < Size)
 292     MemSize = std::max(MemSize, Align);
 293 #endif
 294
 295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
 296   if (MemSize != RegSize && RegSize != 32)
 297     return false;
 298
 299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
 300     return false;
 301
 302   switch (MemSize) {
 303   case 8:
 304   case 16:
 305   case 32:
 306   case 64:
 307   case 128:
 308     break;
 309   case 96:
 310     if (!ST.hasDwordx3LoadStores())
 311       return false;
 312     break;
 313   case 256:
 314   case 512:
 315     // These may contextually need to be broken down.
 316     break;
 317   default:
 318     return false;
 319   }
 320
 321   assert(RegSize >= MemSize);
 322
 323   if (AlignBits < MemSize) {
 324     const SITargetLowering *TLI = ST.getTargetLowering();
 325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
 326                                                  Align(AlignBits / 8)))
 327       return false;
 328   }
 329
 330   return true;
 331 }
 332
 333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
 334 // workaround this. Eventually it should ignore the type for loads and only care
 335 // about the size. Return true in cases where we will workaround this for now by
 336 // bitcasting.
 337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
 338   if (EnableNewLegality)
 339     return false;
 340
 341   const unsigned Size = Ty.getSizeInBits();
 342   if (Size <= 64)
 343     return false;
 344   if (!Ty.isVector())
 345     return true;
 346
 347   LLT EltTy = Ty.getElementType();
 348   if (EltTy.isPointer())
 349     return true;
 350
 351   unsigned EltSize = EltTy.getSizeInBits();
 352   return EltSize != 32 && EltSize != 64;
 353 }
 354
 355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
 356   const LLT Ty = Query.Types[0];
 357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
 358          !loadStoreBitcastWorkaround(Ty);
 359 }
 360
 361 /// Return true if a load or store of the type should be lowered with a bitcast
 362 /// to a different type.
 363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
 364                                        const LLT MemTy) {
 365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
 366   const unsigned Size = Ty.getSizeInBits();
 367   if (Size != MemSizeInBits)
 368     return Size <= 32 && Ty.isVector();
 369
 370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
 371     return true;
 372
 373   // Don't try to handle bitcasting vector ext loads for now.
 374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
 375          (Size <= 32 || isRegisterSize(Size)) &&
 376          !isRegisterVectorElementType(Ty.getElementType());
 377 }
 378
 379 /// Return true if we should legalize a load by widening an odd sized memory
 380 /// access up to the alignment. Note this case when the memory access itself
 381 /// changes, not the size of the result register.
 382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
 383                             unsigned AlignInBits, unsigned AddrSpace,
 384                             unsigned Opcode) {
 385   unsigned SizeInBits = MemoryTy.getSizeInBits();
 386   // We don't want to widen cases that are naturally legal.
 387   if (isPowerOf2_32(SizeInBits))
 388     return false;
 389
 390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
 391   // end up widening these for a scalar load during RegBankSelect, since there
 392   // aren't 96-bit scalar loads.
 393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
 394     return false;
 395
 396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
 397     return false;
 398
 399   // A load is known dereferenceable up to the alignment, so it's legal to widen
 400   // to it.
 401   //
 402   // TODO: Could check dereferenceable for less aligned cases.
 403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
 404   if (AlignInBits < RoundedSize)
 405     return false;
 406
 407   // Do not widen if it would introduce a slow unaligned load.
 408   const SITargetLowering *TLI = ST.getTargetLowering();
 409   bool Fast = false;
 410   return TLI->allowsMisalignedMemoryAccessesImpl(
 411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
 412              MachineMemOperand::MOLoad, &Fast) &&
 413          Fast;
 414 }
 415
 416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
 417                             unsigned Opcode) {
 418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
 419     return false;
 420
 421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
 422                          Query.MMODescrs[0].AlignInBits,
 423                          Query.Types[1].getAddressSpace(), Opcode);
 424 }
 425
 426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
 427                                          const GCNTargetMachine &TM)
 428   :  ST(ST_) {
 429   using namespace TargetOpcode;
 430
 431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
 432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
 433   };
 434
 435   const LLT S1 = LLT::scalar(1);
 436   const LLT S8 = LLT::scalar(8);
 437   const LLT S16 = LLT::scalar(16);
 438   const LLT S32 = LLT::scalar(32);
 439   const LLT S64 = LLT::scalar(64);
 440   const LLT S128 = LLT::scalar(128);
 441   const LLT S256 = LLT::scalar(256);
 442   const LLT S512 = LLT::scalar(512);
 443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
 444
 445   const LLT V2S8 = LLT::fixed_vector(2, 8);
 446   const LLT V2S16 = LLT::fixed_vector(2, 16);
 447   const LLT V4S16 = LLT::fixed_vector(4, 16);
 448
 449   const LLT V2S32 = LLT::fixed_vector(2, 32);
 450   const LLT V3S32 = LLT::fixed_vector(3, 32);
 451   const LLT V4S32 = LLT::fixed_vector(4, 32);
 452   const LLT V5S32 = LLT::fixed_vector(5, 32);
 453   const LLT V6S32 = LLT::fixed_vector(6, 32);
 454   const LLT V7S32 = LLT::fixed_vector(7, 32);
 455   const LLT V8S32 = LLT::fixed_vector(8, 32);
 456   const LLT V9S32 = LLT::fixed_vector(9, 32);
 457   const LLT V10S32 = LLT::fixed_vector(10, 32);
 458   const LLT V11S32 = LLT::fixed_vector(11, 32);
 459   const LLT V12S32 = LLT::fixed_vector(12, 32);
 460   const LLT V13S32 = LLT::fixed_vector(13, 32);
 461   const LLT V14S32 = LLT::fixed_vector(14, 32);
 462   const LLT V15S32 = LLT::fixed_vector(15, 32);
 463   const LLT V16S32 = LLT::fixed_vector(16, 32);
 464   const LLT V32S32 = LLT::fixed_vector(32, 32);
 465
 466   const LLT V2S64 = LLT::fixed_vector(2, 64);
 467   const LLT V3S64 = LLT::fixed_vector(3, 64);
 468   const LLT V4S64 = LLT::fixed_vector(4, 64);
 469   const LLT V5S64 = LLT::fixed_vector(5, 64);
 470   const LLT V6S64 = LLT::fixed_vector(6, 64);
 471   const LLT V7S64 = LLT::fixed_vector(7, 64);
 472   const LLT V8S64 = LLT::fixed_vector(8, 64);
 473   const LLT V16S64 = LLT::fixed_vector(16, 64);
 474
 475   std::initializer_list<LLT> AllS32Vectors =
 476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
 477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
 478   std::initializer_list<LLT> AllS64Vectors =
 479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
 480
 481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
 482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
 483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
 484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
 485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
 486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
 487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
 488
 489   const LLT CodePtr = FlatPtr;
 490
 491   const std::initializer_list<LLT> AddrSpaces64 = {
 492     GlobalPtr, ConstantPtr, FlatPtr
 493   };
 494
 495   const std::initializer_list<LLT> AddrSpaces32 = {
 496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
 497   };
 498
 499   const std::initializer_list<LLT> FPTypesBase = {
 500     S32, S64
 501   };
 502
 503   const std::initializer_list<LLT> FPTypes16 = {
 504     S32, S64, S16
 505   };
 506
 507   const std::initializer_list<LLT> FPTypesPK16 = {
 508     S32, S64, S16, V2S16
 509   };
 510
 511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
 512
 513   // s1 for VCC branches, s32 for SCC branches.
 514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
 515
 516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
 517   // elements for v3s16
 518   getActionDefinitionsBuilder(G_PHI)
 519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
 520     .legalFor(AllS32Vectors)
 521     .legalFor(AllS64Vectors)
 522     .legalFor(AddrSpaces64)
 523     .legalFor(AddrSpaces32)
 524     .legalIf(isPointer(0))
 525     .clampScalar(0, S16, S256)
 526     .widenScalarToNextPow2(0, 32)
 527     .clampMaxNumElements(0, S32, 16)
 528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 529     .scalarize(0);
 530
 531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
 532     // Full set of gfx9 features.
 533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
 534       .legalFor({S32, S16, V2S16})
 535       .clampScalar(0, S16, S32)
 536       .clampMaxNumElements(0, S16, 2)
 537       .scalarize(0)
 538       .widenScalarToNextPow2(0, 32);
 539
 540     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
 541       .legalFor({S32, S16, V2S16}) // Clamp modifier
 542       .minScalarOrElt(0, S16)
 543       .clampMaxNumElements(0, S16, 2)
 544       .scalarize(0)
 545       .widenScalarToNextPow2(0, 32)
 546       .lower();
 547   } else if (ST.has16BitInsts()) {
 548     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
 549       .legalFor({S32, S16})
 550       .clampScalar(0, S16, S32)
 551       .scalarize(0)
 552       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
 553
 554     // Technically the saturating operations require clamp bit support, but this
 555     // was introduced at the same time as 16-bit operations.
 556     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
 557       .legalFor({S32, S16}) // Clamp modifier
 558       .minScalar(0, S16)
 559       .scalarize(0)
 560       .widenScalarToNextPow2(0, 16)
 561       .lower();
 562
 563     // We're just lowering this, but it helps get a better result to try to
 564     // coerce to the desired type first.
 565     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
 566       .minScalar(0, S16)
 567       .scalarize(0)
 568       .lower();
 569   } else {
 570     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
 571       .legalFor({S32})
 572       .clampScalar(0, S32, S32)
 573       .scalarize(0);
 574
 575     if (ST.hasIntClamp()) {
 576       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
 577         .legalFor({S32}) // Clamp modifier.
 578         .scalarize(0)
 579         .minScalarOrElt(0, S32)
 580         .lower();
 581     } else {
 582       // Clamp bit support was added in VI, along with 16-bit operations.
 583       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
 584         .minScalar(0, S32)
 585         .scalarize(0)
 586         .lower();
 587     }
 588
 589     // FIXME: DAG expansion gets better results. The widening uses the smaller
 590     // range values and goes for the min/max lowering directly.
 591     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
 592       .minScalar(0, S32)
 593       .scalarize(0)
 594       .lower();
 595   }
 596
 597   getActionDefinitionsBuilder(
 598       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
 599       .customFor({S32, S64})
 600       .clampScalar(0, S32, S64)
 601       .widenScalarToNextPow2(0, 32)
 602       .scalarize(0);
 603
 604   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
 605                    .legalFor({S32})
 606                    .maxScalarOrElt(0, S32);
 607
 608   if (ST.hasVOP3PInsts()) {
 609     Mulh
 610       .clampMaxNumElements(0, S8, 2)
 611       .lowerFor({V2S8});
 612   }
 613
 614   Mulh
 615     .scalarize(0)
 616     .lower();
 617
 618   // Report legal for any types we can handle anywhere. For the cases only legal
 619   // on the SALU, RegBankSelect will be able to re-legalize.
 620   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
 621     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
 622     .clampScalar(0, S32, S64)
 623     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 624     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
 625     .widenScalarToNextPow2(0)
 626     .scalarize(0);
 627
 628   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
 629                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
 630     .legalFor({{S32, S1}, {S32, S32}})
 631     .minScalar(0, S32)
 632     // TODO: .scalarize(0)
 633     .lower();
 634
 635   getActionDefinitionsBuilder(G_BITCAST)
 636     // Don't worry about the size constraint.
 637     .legalIf(all(isRegisterType(0), isRegisterType(1)))
 638     .lower();
 639
 640
 641   getActionDefinitionsBuilder(G_CONSTANT)
 642     .legalFor({S1, S32, S64, S16, GlobalPtr,
 643                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
 644     .legalIf(isPointer(0))
 645     .clampScalar(0, S32, S64)
 646     .widenScalarToNextPow2(0);
 647
 648   getActionDefinitionsBuilder(G_FCONSTANT)
 649     .legalFor({S32, S64, S16})
 650     .clampScalar(0, S16, S64);
 651
 652   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
 653       .legalIf(isRegisterType(0))
 654       // s1 and s16 are special cases because they have legal operations on
 655       // them, but don't really occupy registers in the normal way.
 656       .legalFor({S1, S16})
 657       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 658       .clampScalarOrElt(0, S32, MaxScalar)
 659       .widenScalarToNextPow2(0, 32)
 660       .clampMaxNumElements(0, S32, 16);
 661
 662   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
 663
 664   // If the amount is divergent, we have to do a wave reduction to get the
 665   // maximum value, so this is expanded during RegBankSelect.
 666   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
 667     .legalFor({{PrivatePtr, S32}});
 668
 669   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
 670     .customIf(typeIsNot(0, PrivatePtr));
 671
 672   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
 673
 674   auto &FPOpActions = getActionDefinitionsBuilder(
 675     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
 676     .legalFor({S32, S64});
 677   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
 678     .customFor({S32, S64});
 679   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
 680     .customFor({S32, S64});
 681
 682   if (ST.has16BitInsts()) {
 683     if (ST.hasVOP3PInsts())
 684       FPOpActions.legalFor({S16, V2S16});
 685     else
 686       FPOpActions.legalFor({S16});
 687
 688     TrigActions.customFor({S16});
 689     FDIVActions.customFor({S16});
 690   }
 691
 692   auto &MinNumMaxNum = getActionDefinitionsBuilder({
 693       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
 694
 695   if (ST.hasVOP3PInsts()) {
 696     MinNumMaxNum.customFor(FPTypesPK16)
 697       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 698       .clampMaxNumElements(0, S16, 2)
 699       .clampScalar(0, S16, S64)
 700       .scalarize(0);
 701   } else if (ST.has16BitInsts()) {
 702     MinNumMaxNum.customFor(FPTypes16)
 703       .clampScalar(0, S16, S64)
 704       .scalarize(0);
 705   } else {
 706     MinNumMaxNum.customFor(FPTypesBase)
 707       .clampScalar(0, S32, S64)
 708       .scalarize(0);
 709   }
 710
 711   if (ST.hasVOP3PInsts())
 712     FPOpActions.clampMaxNumElements(0, S16, 2);
 713
 714   FPOpActions
 715     .scalarize(0)
 716     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
 717
 718   TrigActions
 719     .scalarize(0)
 720     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
 721
 722   FDIVActions
 723     .scalarize(0)
 724     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
 725
 726   getActionDefinitionsBuilder({G_FNEG, G_FABS})
 727     .legalFor(FPTypesPK16)
 728     .clampMaxNumElements(0, S16, 2)
 729     .scalarize(0)
 730     .clampScalar(0, S16, S64);
 731
 732   if (ST.has16BitInsts()) {
 733     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
 734       .legalFor({S32, S64, S16})
 735       .scalarize(0)
 736       .clampScalar(0, S16, S64);
 737   } else {
 738     getActionDefinitionsBuilder(G_FSQRT)
 739       .legalFor({S32, S64})
 740       .scalarize(0)
 741       .clampScalar(0, S32, S64);
 742
 743     if (ST.hasFractBug()) {
 744       getActionDefinitionsBuilder(G_FFLOOR)
 745         .customFor({S64})
 746         .legalFor({S32, S64})
 747         .scalarize(0)
 748         .clampScalar(0, S32, S64);
 749     } else {
 750       getActionDefinitionsBuilder(G_FFLOOR)
 751         .legalFor({S32, S64})
 752         .scalarize(0)
 753         .clampScalar(0, S32, S64);
 754     }
 755   }
 756
 757   getActionDefinitionsBuilder(G_FPTRUNC)
 758     .legalFor({{S32, S64}, {S16, S32}})
 759     .scalarize(0)
 760     .lower();
 761
 762   getActionDefinitionsBuilder(G_FPEXT)
 763     .legalFor({{S64, S32}, {S32, S16}})
 764     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
 765     .scalarize(0);
 766
 767   getActionDefinitionsBuilder(G_FSUB)
 768       // Use actual fsub instruction
 769       .legalFor({S32})
 770       // Must use fadd + fneg
 771       .lowerFor({S64, S16, V2S16})
 772       .scalarize(0)
 773       .clampScalar(0, S32, S64);
 774
 775   // Whether this is legal depends on the floating point mode for the function.
 776   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
 777   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
 778     FMad.customFor({S32, S16});
 779   else if (ST.hasMadMacF32Insts())
 780     FMad.customFor({S32});
 781   else if (ST.hasMadF16())
 782     FMad.customFor({S16});
 783   FMad.scalarize(0)
 784       .lower();
 785
 786   auto &FRem = getActionDefinitionsBuilder(G_FREM);
 787   if (ST.has16BitInsts()) {
 788     FRem.customFor({S16, S32, S64});
 789   } else {
 790     FRem.minScalar(0, S32)
 791         .customFor({S32, S64});
 792   }
 793   FRem.scalarize(0);
 794
 795   // TODO: Do we need to clamp maximum bitwidth?
 796   getActionDefinitionsBuilder(G_TRUNC)
 797     .legalIf(isScalar(0))
 798     .legalFor({{V2S16, V2S32}})
 799     .clampMaxNumElements(0, S16, 2)
 800     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
 801     // situations (like an invalid implicit use), we don't want to infinite loop
 802     // in the legalizer.
 803     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
 804     .alwaysLegal();
 805
 806   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
 807     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
 808                {S32, S1}, {S64, S1}, {S16, S1}})
 809     .scalarize(0)
 810     .clampScalar(0, S32, S64)
 811     .widenScalarToNextPow2(1, 32);
 812
 813   // TODO: Split s1->s64 during regbankselect for VALU.
 814   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
 815                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
 816                     .lowerIf(typeIs(1, S1))
 817                     .customFor({{S32, S64}, {S64, S64}});
 818   if (ST.has16BitInsts())
 819     IToFP.legalFor({{S16, S16}});
 820   IToFP.clampScalar(1, S32, S64)
 821        .minScalar(0, S32)
 822        .scalarize(0)
 823        .widenScalarToNextPow2(1);
 824
 825   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
 826     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
 827     .customFor({{S64, S32}, {S64, S64}})
 828     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
 829   if (ST.has16BitInsts())
 830     FPToI.legalFor({{S16, S16}});
 831   else
 832     FPToI.minScalar(1, S32);
 833
 834   FPToI.minScalar(0, S32)
 835        .widenScalarToNextPow2(0, 32)
 836        .scalarize(0)
 837        .lower();
 838
 839   // Lower roundeven into G_FRINT
 840   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
 841     .scalarize(0)
 842     .lower();
 843
 844   if (ST.has16BitInsts()) {
 845     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
 846       .legalFor({S16, S32, S64})
 847       .clampScalar(0, S16, S64)
 848       .scalarize(0);
 849   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
 850     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
 851       .legalFor({S32, S64})
 852       .clampScalar(0, S32, S64)
 853       .scalarize(0);
 854   } else {
 855     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
 856       .legalFor({S32})
 857       .customFor({S64})
 858       .clampScalar(0, S32, S64)
 859       .scalarize(0);
 860   }
 861
 862   getActionDefinitionsBuilder(G_PTR_ADD)
 863     .legalIf(all(isPointer(0), sameSize(0, 1)))
 864     .scalarize(0)
 865     .scalarSameSizeAs(1, 0);
 866
 867   getActionDefinitionsBuilder(G_PTRMASK)
 868     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
 869     .scalarSameSizeAs(1, 0)
 870     .scalarize(0);
 871
 872   auto &CmpBuilder =
 873     getActionDefinitionsBuilder(G_ICMP)
 874     // The compare output type differs based on the register bank of the output,
 875     // so make both s1 and s32 legal.
 876     //
 877     // Scalar compares producing output in scc will be promoted to s32, as that
 878     // is the allocatable register type that will be needed for the copy from
 879     // scc. This will be promoted during RegBankSelect, and we assume something
 880     // before that won't try to use s32 result types.
 881     //
 882     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
 883     // bank.
 884     .legalForCartesianProduct(
 885       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
 886     .legalForCartesianProduct(
 887       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
 888   if (ST.has16BitInsts()) {
 889     CmpBuilder.legalFor({{S1, S16}});
 890   }
 891
 892   CmpBuilder
 893     .widenScalarToNextPow2(1)
 894     .clampScalar(1, S32, S64)
 895     .scalarize(0)
 896     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
 897
 898   getActionDefinitionsBuilder(G_FCMP)
 899     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
 900     .widenScalarToNextPow2(1)
 901     .clampScalar(1, S32, S64)
 902     .scalarize(0);
 903
 904   // FIXME: fpow has a selection pattern that should move to custom lowering.
 905   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
 906   if (ST.has16BitInsts())
 907     Exp2Ops.legalFor({S32, S16});
 908   else
 909     Exp2Ops.legalFor({S32});
 910   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
 911   Exp2Ops.scalarize(0);
 912
 913   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
 914   if (ST.has16BitInsts())
 915     ExpOps.customFor({{S32}, {S16}});
 916   else
 917     ExpOps.customFor({S32});
 918   ExpOps.clampScalar(0, MinScalarFPTy, S32)
 919         .scalarize(0);
 920
 921   getActionDefinitionsBuilder(G_FPOWI)
 922     .clampScalar(0, MinScalarFPTy, S32)
 923     .lower();
 924
 925   // The 64-bit versions produce 32-bit results, but only on the SALU.
 926   getActionDefinitionsBuilder(G_CTPOP)
 927     .legalFor({{S32, S32}, {S32, S64}})
 928     .clampScalar(0, S32, S32)
 929     .clampScalar(1, S32, S64)
 930     .scalarize(0)
 931     .widenScalarToNextPow2(0, 32)
 932     .widenScalarToNextPow2(1, 32);
 933
 934   // The hardware instructions return a different result on 0 than the generic
 935   // instructions expect. The hardware produces -1, but these produce the
 936   // bitwidth.
 937   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
 938     .scalarize(0)
 939     .clampScalar(0, S32, S32)
 940     .clampScalar(1, S32, S64)
 941     .widenScalarToNextPow2(0, 32)
 942     .widenScalarToNextPow2(1, 32)
 943     .custom();
 944
 945   // The 64-bit versions produce 32-bit results, but only on the SALU.
 946   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
 947     .legalFor({{S32, S32}, {S32, S64}})
 948     .clampScalar(0, S32, S32)
 949     .clampScalar(1, S32, S64)
 950     .scalarize(0)
 951     .widenScalarToNextPow2(0, 32)
 952     .widenScalarToNextPow2(1, 32);
 953
 954   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
 955   // RegBankSelect.
 956   getActionDefinitionsBuilder(G_BITREVERSE)
 957     .legalFor({S32, S64})
 958     .clampScalar(0, S32, S64)
 959     .scalarize(0)
 960     .widenScalarToNextPow2(0);
 961
 962   if (ST.has16BitInsts()) {
 963     getActionDefinitionsBuilder(G_BSWAP)
 964       .legalFor({S16, S32, V2S16})
 965       .clampMaxNumElements(0, S16, 2)
 966       // FIXME: Fixing non-power-of-2 before clamp is workaround for
 967       // narrowScalar limitation.
 968       .widenScalarToNextPow2(0)
 969       .clampScalar(0, S16, S32)
 970       .scalarize(0);
 971
 972     if (ST.hasVOP3PInsts()) {
 973       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
 974         .legalFor({S32, S16, V2S16})
 975         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 976         .clampMaxNumElements(0, S16, 2)
 977         .minScalar(0, S16)
 978         .widenScalarToNextPow2(0)
 979         .scalarize(0)
 980         .lower();
 981     } else {
 982       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
 983         .legalFor({S32, S16})
 984         .widenScalarToNextPow2(0)
 985         .minScalar(0, S16)
 986         .scalarize(0)
 987         .lower();
 988     }
 989   } else {
 990     // TODO: Should have same legality without v_perm_b32
 991     getActionDefinitionsBuilder(G_BSWAP)
 992       .legalFor({S32})
 993       .lowerIf(scalarNarrowerThan(0, 32))
 994       // FIXME: Fixing non-power-of-2 before clamp is workaround for
 995       // narrowScalar limitation.
 996       .widenScalarToNextPow2(0)
 997       .maxScalar(0, S32)
 998       .scalarize(0)
 999       .lower();
1000
1001     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1002       .legalFor({S32})
1003       .minScalar(0, S32)
1004       .widenScalarToNextPow2(0)
1005       .scalarize(0)
1006       .lower();
1007   }
1008
1009   getActionDefinitionsBuilder(G_INTTOPTR)
1010     // List the common cases
1011     .legalForCartesianProduct(AddrSpaces64, {S64})
1012     .legalForCartesianProduct(AddrSpaces32, {S32})
1013     .scalarize(0)
1014     // Accept any address space as long as the size matches
1015     .legalIf(sameSize(0, 1))
1016     .widenScalarIf(smallerThan(1, 0),
1017       [](const LegalityQuery &Query) {
1018         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1019       })
1020     .narrowScalarIf(largerThan(1, 0),
1021       [](const LegalityQuery &Query) {
1022         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1023       });
1024
1025   getActionDefinitionsBuilder(G_PTRTOINT)
1026     // List the common cases
1027     .legalForCartesianProduct(AddrSpaces64, {S64})
1028     .legalForCartesianProduct(AddrSpaces32, {S32})
1029     .scalarize(0)
1030     // Accept any address space as long as the size matches
1031     .legalIf(sameSize(0, 1))
1032     .widenScalarIf(smallerThan(0, 1),
1033       [](const LegalityQuery &Query) {
1034         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1035       })
1036     .narrowScalarIf(
1037       largerThan(0, 1),
1038       [](const LegalityQuery &Query) {
1039         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1040       });
1041
1042   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1043     .scalarize(0)
1044     .custom();
1045
1046   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1047                                     bool IsLoad) -> bool {
1048     const LLT DstTy = Query.Types[0];
1049
1050     // Split vector extloads.
1051     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1052     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1053
1054     if (MemSize < DstTy.getSizeInBits())
1055       MemSize = std::max(MemSize, AlignBits);
1056
1057     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1058       return true;
1059
1060     const LLT PtrTy = Query.Types[1];
1061     unsigned AS = PtrTy.getAddressSpace();
1062     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1063       return true;
1064
1065     // Catch weird sized loads that don't evenly divide into the access sizes
1066     // TODO: May be able to widen depending on alignment etc.
1067     unsigned NumRegs = (MemSize + 31) / 32;
1068     if (NumRegs == 3) {
1069       if (!ST.hasDwordx3LoadStores())
1070         return true;
1071     } else {
1072       // If the alignment allows, these should have been widened.
1073       if (!isPowerOf2_32(NumRegs))
1074         return true;
1075     }
1076
1077     if (AlignBits < MemSize) {
1078       const SITargetLowering *TLI = ST.getTargetLowering();
1079       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1080                                                       Align(AlignBits / 8));
1081     }
1082
1083     return false;
1084   };
1085
1086   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1087   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1088   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1089
1090   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1091   // LDS
1092   // TODO: Unsupported flat for SI.
1093
1094   for (unsigned Op : {G_LOAD, G_STORE}) {
1095     const bool IsStore = Op == G_STORE;
1096
1097     auto &Actions = getActionDefinitionsBuilder(Op);
1098     // Explicitly list some common cases.
1099     // TODO: Does this help compile time at all?
1100     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1101                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1102                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1103                                       {S64, GlobalPtr, S64, GlobalAlign32},
1104                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1105                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1106                                       {S32, GlobalPtr, S8, GlobalAlign8},
1107                                       {S32, GlobalPtr, S16, GlobalAlign16},
1108
1109                                       {S32, LocalPtr, S32, 32},
1110                                       {S64, LocalPtr, S64, 32},
1111                                       {V2S32, LocalPtr, V2S32, 32},
1112                                       {S32, LocalPtr, S8, 8},
1113                                       {S32, LocalPtr, S16, 16},
1114                                       {V2S16, LocalPtr, S32, 32},
1115
1116                                       {S32, PrivatePtr, S32, 32},
1117                                       {S32, PrivatePtr, S8, 8},
1118                                       {S32, PrivatePtr, S16, 16},
1119                                       {V2S16, PrivatePtr, S32, 32},
1120
1121                                       {S32, ConstantPtr, S32, GlobalAlign32},
1122                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1123                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1124                                       {S64, ConstantPtr, S64, GlobalAlign32},
1125                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1126     Actions.legalIf(
1127       [=](const LegalityQuery &Query) -> bool {
1128         return isLoadStoreLegal(ST, Query);
1129       });
1130
1131     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1132     // 64-bits.
1133     //
1134     // TODO: Should generalize bitcast action into coerce, which will also cover
1135     // inserting addrspacecasts.
1136     Actions.customIf(typeIs(1, Constant32Ptr));
1137
1138     // Turn any illegal element vectors into something easier to deal
1139     // with. These will ultimately produce 32-bit scalar shifts to extract the
1140     // parts anyway.
1141     //
1142     // For odd 16-bit element vectors, prefer to split those into pieces with
1143     // 16-bit vector parts.
1144     Actions.bitcastIf(
1145       [=](const LegalityQuery &Query) -> bool {
1146         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1147                                           Query.MMODescrs[0].MemoryTy);
1148       }, bitcastToRegisterType(0));
1149
1150     if (!IsStore) {
1151       // Widen suitably aligned loads by loading extra bytes. The standard
1152       // legalization actions can't properly express widening memory operands.
1153       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1154         return shouldWidenLoad(ST, Query, G_LOAD);
1155       });
1156     }
1157
1158     // FIXME: load/store narrowing should be moved to lower action
1159     Actions
1160         .narrowScalarIf(
1161             [=](const LegalityQuery &Query) -> bool {
1162               return !Query.Types[0].isVector() &&
1163                      needToSplitMemOp(Query, Op == G_LOAD);
1164             },
1165             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1166               const LLT DstTy = Query.Types[0];
1167               const LLT PtrTy = Query.Types[1];
1168
1169               const unsigned DstSize = DstTy.getSizeInBits();
1170               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1171
1172               // Split extloads.
1173               if (DstSize > MemSize)
1174                 return std::make_pair(0, LLT::scalar(MemSize));
1175
1176               if (!isPowerOf2_32(DstSize)) {
1177                 // We're probably decomposing an odd sized store. Try to split
1178                 // to the widest type. TODO: Account for alignment. As-is it
1179                 // should be OK, since the new parts will be further legalized.
1180                 unsigned FloorSize = PowerOf2Floor(DstSize);
1181                 return std::make_pair(0, LLT::scalar(FloorSize));
1182               }
1183
1184               if (DstSize > 32 && (DstSize % 32 != 0)) {
1185                 // FIXME: Need a way to specify non-extload of larger size if
1186                 // suitably aligned.
1187                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1188               }
1189
1190               unsigned MaxSize = maxSizeForAddrSpace(ST,
1191                                                      PtrTy.getAddressSpace(),
1192                                                      Op == G_LOAD);
1193               if (MemSize > MaxSize)
1194                 return std::make_pair(0, LLT::scalar(MaxSize));
1195
1196               unsigned Align = Query.MMODescrs[0].AlignInBits;
1197               return std::make_pair(0, LLT::scalar(Align));
1198             })
1199         .fewerElementsIf(
1200             [=](const LegalityQuery &Query) -> bool {
1201               return Query.Types[0].isVector() &&
1202                      needToSplitMemOp(Query, Op == G_LOAD);
1203             },
1204             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1205               const LLT DstTy = Query.Types[0];
1206               const LLT PtrTy = Query.Types[1];
1207
1208               LLT EltTy = DstTy.getElementType();
1209               unsigned MaxSize = maxSizeForAddrSpace(ST,
1210                                                      PtrTy.getAddressSpace(),
1211                                                      Op == G_LOAD);
1212
1213               // FIXME: Handle widened to power of 2 results better. This ends
1214               // up scalarizing.
1215               // FIXME: 3 element stores scalarized on SI
1216
1217               // Split if it's too large for the address space.
1218               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1219               if (MemSize > MaxSize) {
1220                 unsigned NumElts = DstTy.getNumElements();
1221                 unsigned EltSize = EltTy.getSizeInBits();
1222
1223                 if (MaxSize % EltSize == 0) {
1224                   return std::make_pair(
1225                       0, LLT::scalarOrVector(
1226                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1227                 }
1228
1229                 unsigned NumPieces = MemSize / MaxSize;
1230
1231                 // FIXME: Refine when odd breakdowns handled
1232                 // The scalars will need to be re-legalized.
1233                 if (NumPieces == 1 || NumPieces >= NumElts ||
1234                     NumElts % NumPieces != 0)
1235                   return std::make_pair(0, EltTy);
1236
1237                 return std::make_pair(
1238                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1239               }
1240
1241               // FIXME: We could probably handle weird extending loads better.
1242               if (DstTy.getSizeInBits() > MemSize)
1243                 return std::make_pair(0, EltTy);
1244
1245               unsigned EltSize = EltTy.getSizeInBits();
1246               unsigned DstSize = DstTy.getSizeInBits();
1247               if (!isPowerOf2_32(DstSize)) {
1248                 // We're probably decomposing an odd sized store. Try to split
1249                 // to the widest type. TODO: Account for alignment. As-is it
1250                 // should be OK, since the new parts will be further legalized.
1251                 unsigned FloorSize = PowerOf2Floor(DstSize);
1252                 return std::make_pair(
1253                     0, LLT::scalarOrVector(
1254                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1255               }
1256
1257               // Need to split because of alignment.
1258               unsigned Align = Query.MMODescrs[0].AlignInBits;
1259               if (EltSize > Align &&
1260                   (EltSize / Align < DstTy.getNumElements())) {
1261                 return std::make_pair(
1262                     0, LLT::fixed_vector(EltSize / Align, EltTy));
1263               }
1264
1265               // May need relegalization for the scalars.
1266               return std::make_pair(0, EltTy);
1267             })
1268     .minScalar(0, S32)
1269     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1270     .widenScalarToNextPow2(0)
1271     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1272     .lower();
1273   }
1274
1275   // FIXME: Unaligned accesses not lowered.
1276   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1277                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1278                                                   {S32, GlobalPtr, S16, 2 * 8},
1279                                                   {S32, LocalPtr, S8, 8},
1280                                                   {S32, LocalPtr, S16, 16},
1281                                                   {S32, PrivatePtr, S8, 8},
1282                                                   {S32, PrivatePtr, S16, 16},
1283                                                   {S32, ConstantPtr, S8, 8},
1284                                                   {S32, ConstantPtr, S16, 2 * 8}})
1285                        .legalIf(
1286                          [=](const LegalityQuery &Query) -> bool {
1287                            return isLoadStoreLegal(ST, Query);
1288                          });
1289
1290   if (ST.hasFlatAddressSpace()) {
1291     ExtLoads.legalForTypesWithMemDesc(
1292         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1293   }
1294
1295   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1296   // 64-bits.
1297   //
1298   // TODO: Should generalize bitcast action into coerce, which will also cover
1299   // inserting addrspacecasts.
1300   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1301
1302   ExtLoads.clampScalar(0, S32, S32)
1303           .widenScalarToNextPow2(0)
1304           .lower();
1305
1306   auto &Atomics = getActionDefinitionsBuilder(
1307     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1308      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1309      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1310      G_ATOMICRMW_UMIN})
1311     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1312                {S64, GlobalPtr}, {S64, LocalPtr},
1313                {S32, RegionPtr}, {S64, RegionPtr}});
1314   if (ST.hasFlatAddressSpace()) {
1315     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1316   }
1317
1318   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1319   if (ST.hasLDSFPAtomics()) {
1320     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1321     if (ST.hasGFX90AInsts())
1322       Atomic.legalFor({{S64, LocalPtr}});
1323   }
1324   if (ST.hasAtomicFaddInsts())
1325     Atomic.legalFor({{S32, GlobalPtr}});
1326
1327   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1328   // demarshalling
1329   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1330     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1331                 {S32, FlatPtr}, {S64, FlatPtr}})
1332     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1333                {S32, RegionPtr}, {S64, RegionPtr}});
1334   // TODO: Pointer types, any 32-bit or 64-bit vector
1335
1336   // Condition should be s32 for scalar, s1 for vector.
1337   getActionDefinitionsBuilder(G_SELECT)
1338       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1339                                  LocalPtr, FlatPtr, PrivatePtr,
1340                                  LLT::fixed_vector(2, LocalPtr),
1341                                  LLT::fixed_vector(2, PrivatePtr)},
1342                                 {S1, S32})
1343       .clampScalar(0, S16, S64)
1344       .scalarize(1)
1345       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1346       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1347       .clampMaxNumElements(0, S32, 2)
1348       .clampMaxNumElements(0, LocalPtr, 2)
1349       .clampMaxNumElements(0, PrivatePtr, 2)
1350       .scalarize(0)
1351       .widenScalarToNextPow2(0)
1352       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1353
1354   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1355   // be more flexible with the shift amount type.
1356   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1357     .legalFor({{S32, S32}, {S64, S32}});
1358   if (ST.has16BitInsts()) {
1359     if (ST.hasVOP3PInsts()) {
1360       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1361             .clampMaxNumElements(0, S16, 2);
1362     } else
1363       Shifts.legalFor({{S16, S16}});
1364
1365     // TODO: Support 16-bit shift amounts for all types
1366     Shifts.widenScalarIf(
1367       [=](const LegalityQuery &Query) {
1368         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1369         // 32-bit amount.
1370         const LLT ValTy = Query.Types[0];
1371         const LLT AmountTy = Query.Types[1];
1372         return ValTy.getSizeInBits() <= 16 &&
1373                AmountTy.getSizeInBits() < 16;
1374       }, changeTo(1, S16));
1375     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1376     Shifts.clampScalar(1, S32, S32);
1377     Shifts.clampScalar(0, S16, S64);
1378     Shifts.widenScalarToNextPow2(0, 16);
1379
1380     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1381       .minScalar(0, S16)
1382       .scalarize(0)
1383       .lower();
1384   } else {
1385     // Make sure we legalize the shift amount type first, as the general
1386     // expansion for the shifted type will produce much worse code if it hasn't
1387     // been truncated already.
1388     Shifts.clampScalar(1, S32, S32);
1389     Shifts.clampScalar(0, S32, S64);
1390     Shifts.widenScalarToNextPow2(0, 32);
1391
1392     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1393       .minScalar(0, S32)
1394       .scalarize(0)
1395       .lower();
1396   }
1397   Shifts.scalarize(0);
1398
1399   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1400     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1401     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1402     unsigned IdxTypeIdx = 2;
1403
1404     getActionDefinitionsBuilder(Op)
1405       .customIf([=](const LegalityQuery &Query) {
1406           const LLT EltTy = Query.Types[EltTypeIdx];
1407           const LLT VecTy = Query.Types[VecTypeIdx];
1408           const LLT IdxTy = Query.Types[IdxTypeIdx];
1409           const unsigned EltSize = EltTy.getSizeInBits();
1410           return (EltSize == 32 || EltSize == 64) &&
1411                   VecTy.getSizeInBits() % 32 == 0 &&
1412                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1413                   IdxTy.getSizeInBits() == 32;
1414         })
1415       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1416                  bitcastToVectorElement32(VecTypeIdx))
1417       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1418       .bitcastIf(
1419         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1420         [=](const LegalityQuery &Query) {
1421           // For > 64-bit element types, try to turn this into a 64-bit
1422           // element vector since we may be able to do better indexing
1423           // if this is scalar. If not, fall back to 32.
1424           const LLT EltTy = Query.Types[EltTypeIdx];
1425           const LLT VecTy = Query.Types[VecTypeIdx];
1426           const unsigned DstEltSize = EltTy.getSizeInBits();
1427           const unsigned VecSize = VecTy.getSizeInBits();
1428
1429           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1430           return std::make_pair(
1431               VecTypeIdx,
1432               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1433         })
1434       .clampScalar(EltTypeIdx, S32, S64)
1435       .clampScalar(VecTypeIdx, S32, S64)
1436       .clampScalar(IdxTypeIdx, S32, S32)
1437       .clampMaxNumElements(VecTypeIdx, S32, 32)
1438       // TODO: Clamp elements for 64-bit vectors?
1439       // It should only be necessary with variable indexes.
1440       // As a last resort, lower to the stack
1441       .lower();
1442   }
1443
1444   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1445     .unsupportedIf([=](const LegalityQuery &Query) {
1446         const LLT &EltTy = Query.Types[1].getElementType();
1447         return Query.Types[0] != EltTy;
1448       });
1449
1450   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1451     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1452     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1453
1454     // FIXME: Doesn't handle extract of illegal sizes.
1455     getActionDefinitionsBuilder(Op)
1456       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1457       // FIXME: Multiples of 16 should not be legal.
1458       .legalIf([=](const LegalityQuery &Query) {
1459           const LLT BigTy = Query.Types[BigTyIdx];
1460           const LLT LitTy = Query.Types[LitTyIdx];
1461           return (BigTy.getSizeInBits() % 32 == 0) &&
1462                  (LitTy.getSizeInBits() % 16 == 0);
1463         })
1464       .widenScalarIf(
1465         [=](const LegalityQuery &Query) {
1466           const LLT BigTy = Query.Types[BigTyIdx];
1467           return (BigTy.getScalarSizeInBits() < 16);
1468         },
1469         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1470       .widenScalarIf(
1471         [=](const LegalityQuery &Query) {
1472           const LLT LitTy = Query.Types[LitTyIdx];
1473           return (LitTy.getScalarSizeInBits() < 16);
1474         },
1475         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1476       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1477       .widenScalarToNextPow2(BigTyIdx, 32);
1478
1479   }
1480
1481   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1482     .legalForCartesianProduct(AllS32Vectors, {S32})
1483     .legalForCartesianProduct(AllS64Vectors, {S64})
1484     .clampNumElements(0, V16S32, V32S32)
1485     .clampNumElements(0, V2S64, V16S64)
1486     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1487
1488   if (ST.hasScalarPackInsts()) {
1489     BuildVector
1490       // FIXME: Should probably widen s1 vectors straight to s32
1491       .minScalarOrElt(0, S16)
1492       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1493       .minScalar(1, S32);
1494
1495     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1496       .legalFor({V2S16, S32})
1497       .lower();
1498     BuildVector.minScalarOrElt(0, S32);
1499   } else {
1500     BuildVector.customFor({V2S16, S16});
1501     BuildVector.minScalarOrElt(0, S32);
1502
1503     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1504       .customFor({V2S16, S32})
1505       .lower();
1506   }
1507
1508   BuildVector.legalIf(isRegisterType(0));
1509
1510   // FIXME: Clamp maximum size
1511   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1512     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1513     .clampMaxNumElements(0, S32, 32)
1514     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1515     .clampMaxNumElements(0, S16, 64);
1516
1517   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1518   // pre-legalize.
1519   if (ST.hasVOP3PInsts()) {
1520     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1521       .customFor({V2S16, V2S16})
1522       .lower();
1523   } else
1524     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1525
1526   // Merge/Unmerge
1527   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1528     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1529     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1530
1531     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1532       const LLT Ty = Query.Types[TypeIdx];
1533       if (Ty.isVector()) {
1534         const LLT &EltTy = Ty.getElementType();
1535         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1536           return true;
1537         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1538           return true;
1539       }
1540       return false;
1541     };
1542
1543     auto &Builder = getActionDefinitionsBuilder(Op)
1544       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1545       .lowerFor({{S16, V2S16}})
1546       .lowerIf([=](const LegalityQuery &Query) {
1547           const LLT BigTy = Query.Types[BigTyIdx];
1548           return BigTy.getSizeInBits() == 32;
1549         })
1550       // Try to widen to s16 first for small types.
1551       // TODO: Only do this on targets with legal s16 shifts
1552       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1553       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1554       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1555       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1556                            elementTypeIs(1, S16)),
1557                        changeTo(1, V2S16))
1558       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1559       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1560       // valid.
1561       .clampScalar(LitTyIdx, S32, S512)
1562       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1563       // Break up vectors with weird elements into scalars
1564       .fewerElementsIf(
1565         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1566         scalarize(0))
1567       .fewerElementsIf(
1568         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1569         scalarize(1))
1570       .clampScalar(BigTyIdx, S32, MaxScalar);
1571
1572     if (Op == G_MERGE_VALUES) {
1573       Builder.widenScalarIf(
1574         // TODO: Use 16-bit shifts if legal for 8-bit values?
1575         [=](const LegalityQuery &Query) {
1576           const LLT Ty = Query.Types[LitTyIdx];
1577           return Ty.getSizeInBits() < 32;
1578         },
1579         changeTo(LitTyIdx, S32));
1580     }
1581
1582     Builder.widenScalarIf(
1583       [=](const LegalityQuery &Query) {
1584         const LLT Ty = Query.Types[BigTyIdx];
1585         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1586           Ty.getSizeInBits() % 16 != 0;
1587       },
1588       [=](const LegalityQuery &Query) {
1589         // Pick the next power of 2, or a multiple of 64 over 128.
1590         // Whichever is smaller.
1591         const LLT &Ty = Query.Types[BigTyIdx];
1592         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1593         if (NewSizeInBits >= 256) {
1594           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1595           if (RoundedTo < NewSizeInBits)
1596             NewSizeInBits = RoundedTo;
1597         }
1598         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1599       })
1600       // Any vectors left are the wrong size. Scalarize them.
1601       .scalarize(0)
1602       .scalarize(1);
1603   }
1604
1605   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1606   // RegBankSelect.
1607   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1608     .legalFor({{S32}, {S64}});
1609
1610   if (ST.hasVOP3PInsts()) {
1611     SextInReg.lowerFor({{V2S16}})
1612       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1613       // get more vector shift opportunities, since we'll get those when
1614       // expanded.
1615       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1616   } else if (ST.has16BitInsts()) {
1617     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1618   } else {
1619     // Prefer to promote to s32 before lowering if we don't have 16-bit
1620     // shifts. This avoid a lot of intermediate truncate and extend operations.
1621     SextInReg.lowerFor({{S32}, {S64}});
1622   }
1623
1624   SextInReg
1625     .scalarize(0)
1626     .clampScalar(0, S32, S64)
1627     .lower();
1628
1629   // TODO: Only Try to form v2s16 with legal packed instructions.
1630   getActionDefinitionsBuilder(G_FSHR)
1631     .legalFor({{S32, S32}})
1632     .lowerFor({{V2S16, V2S16}})
1633     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1634     .scalarize(0)
1635     .lower();
1636
1637   if (ST.hasVOP3PInsts()) {
1638     getActionDefinitionsBuilder(G_FSHL)
1639       .lowerFor({{V2S16, V2S16}})
1640       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1641       .scalarize(0)
1642       .lower();
1643   } else {
1644     getActionDefinitionsBuilder(G_FSHL)
1645       .scalarize(0)
1646       .lower();
1647   }
1648
1649   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1650     .legalFor({S64});
1651
1652   getActionDefinitionsBuilder(G_FENCE)
1653     .alwaysLegal();
1654
1655   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1656       .scalarize(0)
1657       .minScalar(0, S32)
1658       .lower();
1659
1660   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1661       .legalFor({{S32, S32}, {S64, S32}})
1662       .clampScalar(1, S32, S32)
1663       .clampScalar(0, S32, S64)
1664       .widenScalarToNextPow2(0)
1665       .scalarize(0);
1666
1667   getActionDefinitionsBuilder({
1668       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1669       G_FCOPYSIGN,
1670
1671       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1672       G_ATOMICRMW_NAND,
1673       G_ATOMICRMW_FSUB,
1674       G_READ_REGISTER,
1675       G_WRITE_REGISTER,
1676
1677       G_SADDO, G_SSUBO,
1678
1679        // TODO: Implement
1680       G_FMINIMUM, G_FMAXIMUM}).lower();
1681
1682   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1683         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1684         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1685     .unsupported();
1686
1687   getLegacyLegalizerInfo().computeTables();
1688   verify(*ST.getInstrInfo());
1689 }
1690
1691 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1692                                          MachineInstr &MI) const {
1693   MachineIRBuilder &B = Helper.MIRBuilder;
1694   MachineRegisterInfo &MRI = *B.getMRI();
1695
1696   switch (MI.getOpcode()) {
1697   case TargetOpcode::G_ADDRSPACE_CAST:
1698     return legalizeAddrSpaceCast(MI, MRI, B);
1699   case TargetOpcode::G_FRINT:
1700     return legalizeFrint(MI, MRI, B);
1701   case TargetOpcode::G_FCEIL:
1702     return legalizeFceil(MI, MRI, B);
1703   case TargetOpcode::G_FREM:
1704     return legalizeFrem(MI, MRI, B);
1705   case TargetOpcode::G_INTRINSIC_TRUNC:
1706     return legalizeIntrinsicTrunc(MI, MRI, B);
1707   case TargetOpcode::G_SITOFP:
1708     return legalizeITOFP(MI, MRI, B, true);
1709   case TargetOpcode::G_UITOFP:
1710     return legalizeITOFP(MI, MRI, B, false);
1711   case TargetOpcode::G_FPTOSI:
1712     return legalizeFPTOI(MI, MRI, B, true);
1713   case TargetOpcode::G_FPTOUI:
1714     return legalizeFPTOI(MI, MRI, B, false);
1715   case TargetOpcode::G_FMINNUM:
1716   case TargetOpcode::G_FMAXNUM:
1717   case TargetOpcode::G_FMINNUM_IEEE:
1718   case TargetOpcode::G_FMAXNUM_IEEE:
1719     return legalizeMinNumMaxNum(Helper, MI);
1720   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1721     return legalizeExtractVectorElt(MI, MRI, B);
1722   case TargetOpcode::G_INSERT_VECTOR_ELT:
1723     return legalizeInsertVectorElt(MI, MRI, B);
1724   case TargetOpcode::G_SHUFFLE_VECTOR:
1725     return legalizeShuffleVector(MI, MRI, B);
1726   case TargetOpcode::G_FSIN:
1727   case TargetOpcode::G_FCOS:
1728     return legalizeSinCos(MI, MRI, B);
1729   case TargetOpcode::G_GLOBAL_VALUE:
1730     return legalizeGlobalValue(MI, MRI, B);
1731   case TargetOpcode::G_LOAD:
1732   case TargetOpcode::G_SEXTLOAD:
1733   case TargetOpcode::G_ZEXTLOAD:
1734     return legalizeLoad(Helper, MI);
1735   case TargetOpcode::G_FMAD:
1736     return legalizeFMad(MI, MRI, B);
1737   case TargetOpcode::G_FDIV:
1738     return legalizeFDIV(MI, MRI, B);
1739   case TargetOpcode::G_UDIV:
1740   case TargetOpcode::G_UREM:
1741   case TargetOpcode::G_UDIVREM:
1742     return legalizeUnsignedDIV_REM(MI, MRI, B);
1743   case TargetOpcode::G_SDIV:
1744   case TargetOpcode::G_SREM:
1745   case TargetOpcode::G_SDIVREM:
1746     return legalizeSignedDIV_REM(MI, MRI, B);
1747   case TargetOpcode::G_ATOMIC_CMPXCHG:
1748     return legalizeAtomicCmpXChg(MI, MRI, B);
1749   case TargetOpcode::G_FLOG:
1750     return legalizeFlog(MI, B, numbers::ln2f);
1751   case TargetOpcode::G_FLOG10:
1752     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1753   case TargetOpcode::G_FEXP:
1754     return legalizeFExp(MI, B);
1755   case TargetOpcode::G_FPOW:
1756     return legalizeFPow(MI, B);
1757   case TargetOpcode::G_FFLOOR:
1758     return legalizeFFloor(MI, MRI, B);
1759   case TargetOpcode::G_BUILD_VECTOR:
1760     return legalizeBuildVector(MI, MRI, B);
1761   case TargetOpcode::G_CTLZ:
1762   case TargetOpcode::G_CTTZ:
1763     return legalizeCTLZ_CTTZ(MI, MRI, B);
1764   default:
1765     return false;
1766   }
1767
1768   llvm_unreachable("expected switch to return");
1769 }
1770
1771 Register AMDGPULegalizerInfo::getSegmentAperture(
1772   unsigned AS,
1773   MachineRegisterInfo &MRI,
1774   MachineIRBuilder &B) const {
1775   MachineFunction &MF = B.getMF();
1776   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1777   const LLT S32 = LLT::scalar(32);
1778
1779   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1780
1781   if (ST.hasApertureRegs()) {
1782     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1783     // getreg.
1784     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1785         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1786         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1787     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1788         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1789         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1790     unsigned Encoding =
1791         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1792         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1793         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1794
1795     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1796
1797     B.buildInstr(AMDGPU::S_GETREG_B32)
1798       .addDef(GetReg)
1799       .addImm(Encoding);
1800     MRI.setType(GetReg, S32);
1801
1802     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1803     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1804   }
1805
1806   Register QueuePtr = MRI.createGenericVirtualRegister(
1807     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1808
1809   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1810     return Register();
1811
1812   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1813   // private_segment_aperture_base_hi.
1814   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1815
1816   // TODO: can we be smarter about machine pointer info?
1817   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1818   MachineMemOperand *MMO = MF.getMachineMemOperand(
1819       PtrInfo,
1820       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1821           MachineMemOperand::MOInvariant,
1822       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1823
1824   Register LoadAddr;
1825
1826   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1827   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1828 }
1829
1830 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1831   MachineInstr &MI, MachineRegisterInfo &MRI,
1832   MachineIRBuilder &B) const {
1833   MachineFunction &MF = B.getMF();
1834
1835   const LLT S32 = LLT::scalar(32);
1836   Register Dst = MI.getOperand(0).getReg();
1837   Register Src = MI.getOperand(1).getReg();
1838
1839   LLT DstTy = MRI.getType(Dst);
1840   LLT SrcTy = MRI.getType(Src);
1841   unsigned DestAS = DstTy.getAddressSpace();
1842   unsigned SrcAS = SrcTy.getAddressSpace();
1843
1844   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1845   // vector element.
1846   assert(!DstTy.isVector());
1847
1848   const AMDGPUTargetMachine &TM
1849     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1850
1851   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1852     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1853     return true;
1854   }
1855
1856   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1857     // Truncate.
1858     B.buildExtract(Dst, Src, 0);
1859     MI.eraseFromParent();
1860     return true;
1861   }
1862
1863   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1864     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1865     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1866
1867     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1868     // another. Merge operands are required to be the same type, but creating an
1869     // extra ptrtoint would be kind of pointless.
1870     auto HighAddr = B.buildConstant(
1871       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1872     B.buildMerge(Dst, {Src, HighAddr});
1873     MI.eraseFromParent();
1874     return true;
1875   }
1876
1877   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1878     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1879            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1880     unsigned NullVal = TM.getNullPointerValue(DestAS);
1881
1882     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1883     auto FlatNull = B.buildConstant(SrcTy, 0);
1884
1885     // Extract low 32-bits of the pointer.
1886     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1887
1888     auto CmpRes =
1889         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1890     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1891
1892     MI.eraseFromParent();
1893     return true;
1894   }
1895
1896   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1897     return false;
1898
1899   if (!ST.hasFlatAddressSpace())
1900     return false;
1901
1902   auto SegmentNull =
1903       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1904   auto FlatNull =
1905       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1906
1907   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1908   if (!ApertureReg.isValid())
1909     return false;
1910
1911   auto CmpRes =
1912       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1913
1914   // Coerce the type of the low half of the result so we can use merge_values.
1915   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1916
1917   // TODO: Should we allow mismatched types but matching sizes in merges to
1918   // avoid the ptrtoint?
1919   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1920   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1921
1922   MI.eraseFromParent();
1923   return true;
1924 }
1925
1926 bool AMDGPULegalizerInfo::legalizeFrint(
1927   MachineInstr &MI, MachineRegisterInfo &MRI,
1928   MachineIRBuilder &B) const {
1929   Register Src = MI.getOperand(1).getReg();
1930   LLT Ty = MRI.getType(Src);
1931   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1932
1933   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1934   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1935
1936   auto C1 = B.buildFConstant(Ty, C1Val);
1937   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1938
1939   // TODO: Should this propagate fast-math-flags?
1940   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1941   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1942
1943   auto C2 = B.buildFConstant(Ty, C2Val);
1944   auto Fabs = B.buildFAbs(Ty, Src);
1945
1946   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1947   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1948   MI.eraseFromParent();
1949   return true;
1950 }
1951
1952 bool AMDGPULegalizerInfo::legalizeFceil(
1953   MachineInstr &MI, MachineRegisterInfo &MRI,
1954   MachineIRBuilder &B) const {
1955
1956   const LLT S1 = LLT::scalar(1);
1957   const LLT S64 = LLT::scalar(64);
1958
1959   Register Src = MI.getOperand(1).getReg();
1960   assert(MRI.getType(Src) == S64);
1961
1962   // result = trunc(src)
1963   // if (src > 0.0 && src != result)
1964   //   result += 1.0
1965
1966   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1967
1968   const auto Zero = B.buildFConstant(S64, 0.0);
1969   const auto One = B.buildFConstant(S64, 1.0);
1970   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1971   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1972   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1973   auto Add = B.buildSelect(S64, And, One, Zero);
1974
1975   // TODO: Should this propagate fast-math-flags?
1976   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1977   return true;
1978 }
1979
1980 bool AMDGPULegalizerInfo::legalizeFrem(
1981   MachineInstr &MI, MachineRegisterInfo &MRI,
1982   MachineIRBuilder &B) const {
1983     Register DstReg = MI.getOperand(0).getReg();
1984     Register Src0Reg = MI.getOperand(1).getReg();
1985     Register Src1Reg = MI.getOperand(2).getReg();
1986     auto Flags = MI.getFlags();
1987     LLT Ty = MRI.getType(DstReg);
1988
1989     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1990     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1991     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1992     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1993     MI.eraseFromParent();
1994     return true;
1995 }
1996
1997 static MachineInstrBuilder extractF64Exponent(Register Hi,
1998                                               MachineIRBuilder &B) {
1999   const unsigned FractBits = 52;
2000   const unsigned ExpBits = 11;
2001   LLT S32 = LLT::scalar(32);
2002
2003   auto Const0 = B.buildConstant(S32, FractBits - 32);
2004   auto Const1 = B.buildConstant(S32, ExpBits);
2005
2006   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2007     .addUse(Hi)
2008     .addUse(Const0.getReg(0))
2009     .addUse(Const1.getReg(0));
2010
2011   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2012 }
2013
2014 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2015   MachineInstr &MI, MachineRegisterInfo &MRI,
2016   MachineIRBuilder &B) const {
2017   const LLT S1 = LLT::scalar(1);
2018   const LLT S32 = LLT::scalar(32);
2019   const LLT S64 = LLT::scalar(64);
2020
2021   Register Src = MI.getOperand(1).getReg();
2022   assert(MRI.getType(Src) == S64);
2023
2024   // TODO: Should this use extract since the low half is unused?
2025   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2026   Register Hi = Unmerge.getReg(1);
2027
2028   // Extract the upper half, since this is where we will find the sign and
2029   // exponent.
2030   auto Exp = extractF64Exponent(Hi, B);
2031
2032   const unsigned FractBits = 52;
2033
2034   // Extract the sign bit.
2035   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2036   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2037
2038   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2039
2040   const auto Zero32 = B.buildConstant(S32, 0);
2041
2042   // Extend back to 64-bits.
2043   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2044
2045   auto Shr = B.buildAShr(S64, FractMask, Exp);
2046   auto Not = B.buildNot(S64, Shr);
2047   auto Tmp0 = B.buildAnd(S64, Src, Not);
2048   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2049
2050   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2051   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2052
2053   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2054   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2055   MI.eraseFromParent();
2056   return true;
2057 }
2058
2059 bool AMDGPULegalizerInfo::legalizeITOFP(
2060   MachineInstr &MI, MachineRegisterInfo &MRI,
2061   MachineIRBuilder &B, bool Signed) const {
2062
2063   Register Dst = MI.getOperand(0).getReg();
2064   Register Src = MI.getOperand(1).getReg();
2065
2066   const LLT S64 = LLT::scalar(64);
2067   const LLT S32 = LLT::scalar(32);
2068
2069   assert(MRI.getType(Src) == S64);
2070
2071   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2072   auto ThirtyTwo = B.buildConstant(S32, 32);
2073
2074   if (MRI.getType(Dst) == S64) {
2075     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2076                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2077
2078     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2079     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2080                      .addUse(CvtHi.getReg(0))
2081                      .addUse(ThirtyTwo.getReg(0));
2082
2083     // TODO: Should this propagate fast-math-flags?
2084     B.buildFAdd(Dst, LdExp, CvtLo);
2085     MI.eraseFromParent();
2086     return true;
2087   }
2088
2089   assert(MRI.getType(Dst) == S32);
2090
2091   auto One = B.buildConstant(S32, 1);
2092
2093   MachineInstrBuilder ShAmt;
2094   if (Signed) {
2095     auto ThirtyOne = B.buildConstant(S32, 31);
2096     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2097     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2098     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2099     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2100                                /*HasSideEffects=*/false)
2101                   .addUse(Unmerge.getReg(1));
2102     auto LS2 = B.buildSub(S32, LS, One);
2103     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2104   } else
2105     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2106   auto Norm = B.buildShl(S64, Src, ShAmt);
2107   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2108   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2109   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2110   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2111   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2112   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2113                    /*HasSideEffects=*/false)
2114       .addUse(FVal.getReg(0))
2115       .addUse(Scale.getReg(0));
2116   MI.eraseFromParent();
2117   return true;
2118 }
2119
2120 // TODO: Copied from DAG implementation. Verify logic and document how this
2121 // actually works.
2122 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2123                                         MachineRegisterInfo &MRI,
2124                                         MachineIRBuilder &B,
2125                                         bool Signed) const {
2126
2127   Register Dst = MI.getOperand(0).getReg();
2128   Register Src = MI.getOperand(1).getReg();
2129
2130   const LLT S64 = LLT::scalar(64);
2131   const LLT S32 = LLT::scalar(32);
2132
2133   const LLT SrcLT = MRI.getType(Src);
2134   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2135
2136   unsigned Flags = MI.getFlags();
2137
2138   // The basic idea of converting a floating point number into a pair of 32-bit
2139   // integers is illustrated as follows:
2140   //
2141   //     tf := trunc(val);
2142   //    hif := floor(tf * 2^-32);
2143   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2144   //     hi := fptoi(hif);
2145   //     lo := fptoi(lof);
2146   //
2147   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2148   MachineInstrBuilder Sign;
2149   if (Signed && SrcLT == S32) {
2150     // However, a 32-bit floating point number has only 23 bits mantissa and
2151     // it's not enough to hold all the significant bits of `lof` if val is
2152     // negative. To avoid the loss of precision, We need to take the absolute
2153     // value after truncating and flip the result back based on the original
2154     // signedness.
2155     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2156     Trunc = B.buildFAbs(S32, Trunc, Flags);
2157   }
2158   MachineInstrBuilder K0, K1;
2159   if (SrcLT == S64) {
2160     K0 = B.buildFConstant(S64,
2161                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2162     K1 = B.buildFConstant(S64,
2163                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2164   } else {
2165     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2166     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2167   }
2168
2169   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2170   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2171   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2172
2173   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2174                                      : B.buildFPTOUI(S32, FloorMul);
2175   auto Lo = B.buildFPTOUI(S32, Fma);
2176
2177   if (Signed && SrcLT == S32) {
2178     // Flip the result based on the signedness, which is either all 0s or 1s.
2179     Sign = B.buildMerge(S64, {Sign, Sign});
2180     // r := xor({lo, hi}, sign) - sign;
2181     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2182   } else
2183     B.buildMerge(Dst, {Lo, Hi});
2184   MI.eraseFromParent();
2185
2186   return true;
2187 }
2188
2189 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2190                                                MachineInstr &MI) const {
2191   MachineFunction &MF = Helper.MIRBuilder.getMF();
2192   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2193
2194   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2195                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2196
2197   // With ieee_mode disabled, the instructions have the correct behavior
2198   // already for G_FMINNUM/G_FMAXNUM
2199   if (!MFI->getMode().IEEE)
2200     return !IsIEEEOp;
2201
2202   if (IsIEEEOp)
2203     return true;
2204
2205   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2206 }
2207
2208 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2209   MachineInstr &MI, MachineRegisterInfo &MRI,
2210   MachineIRBuilder &B) const {
2211   // TODO: Should move some of this into LegalizerHelper.
2212
2213   // TODO: Promote dynamic indexing of s16 to s32
2214
2215   // FIXME: Artifact combiner probably should have replaced the truncated
2216   // constant before this, so we shouldn't need
2217   // getConstantVRegValWithLookThrough.
2218   Optional<ValueAndVReg> MaybeIdxVal =
2219       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2220   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2221     return true;
2222   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2223
2224   Register Dst = MI.getOperand(0).getReg();
2225   Register Vec = MI.getOperand(1).getReg();
2226
2227   LLT VecTy = MRI.getType(Vec);
2228   LLT EltTy = VecTy.getElementType();
2229   assert(EltTy == MRI.getType(Dst));
2230
2231   if (IdxVal < VecTy.getNumElements())
2232     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2233   else
2234     B.buildUndef(Dst);
2235
2236   MI.eraseFromParent();
2237   return true;
2238 }
2239
2240 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2241   MachineInstr &MI, MachineRegisterInfo &MRI,
2242   MachineIRBuilder &B) const {
2243   // TODO: Should move some of this into LegalizerHelper.
2244
2245   // TODO: Promote dynamic indexing of s16 to s32
2246
2247   // FIXME: Artifact combiner probably should have replaced the truncated
2248   // constant before this, so we shouldn't need
2249   // getConstantVRegValWithLookThrough.
2250   Optional<ValueAndVReg> MaybeIdxVal =
2251       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2252   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2253     return true;
2254
2255   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2256   Register Dst = MI.getOperand(0).getReg();
2257   Register Vec = MI.getOperand(1).getReg();
2258   Register Ins = MI.getOperand(2).getReg();
2259
2260   LLT VecTy = MRI.getType(Vec);
2261   LLT EltTy = VecTy.getElementType();
2262   assert(EltTy == MRI.getType(Ins));
2263
2264   if (IdxVal < VecTy.getNumElements())
2265     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2266   else
2267     B.buildUndef(Dst);
2268
2269   MI.eraseFromParent();
2270   return true;
2271 }
2272
2273 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2274   MachineInstr &MI, MachineRegisterInfo &MRI,
2275   MachineIRBuilder &B) const {
2276   const LLT V2S16 = LLT::fixed_vector(2, 16);
2277
2278   Register Dst = MI.getOperand(0).getReg();
2279   Register Src0 = MI.getOperand(1).getReg();
2280   LLT DstTy = MRI.getType(Dst);
2281   LLT SrcTy = MRI.getType(Src0);
2282
2283   if (SrcTy == V2S16 && DstTy == V2S16 &&
2284       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2285     return true;
2286
2287   MachineIRBuilder HelperBuilder(MI);
2288   GISelObserverWrapper DummyObserver;
2289   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2290   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2291 }
2292
2293 bool AMDGPULegalizerInfo::legalizeSinCos(
2294   MachineInstr &MI, MachineRegisterInfo &MRI,
2295   MachineIRBuilder &B) const {
2296
2297   Register DstReg = MI.getOperand(0).getReg();
2298   Register SrcReg = MI.getOperand(1).getReg();
2299   LLT Ty = MRI.getType(DstReg);
2300   unsigned Flags = MI.getFlags();
2301
2302   Register TrigVal;
2303   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2304   if (ST.hasTrigReducedRange()) {
2305     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2306     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2307       .addUse(MulVal.getReg(0))
2308       .setMIFlags(Flags).getReg(0);
2309   } else
2310     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2311
2312   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2313     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2314   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2315     .addUse(TrigVal)
2316     .setMIFlags(Flags);
2317   MI.eraseFromParent();
2318   return true;
2319 }
2320
2321 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2322                                                   MachineIRBuilder &B,
2323                                                   const GlobalValue *GV,
2324                                                   int64_t Offset,
2325                                                   unsigned GAFlags) const {
2326   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2327   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2328   // to the following code sequence:
2329   //
2330   // For constant address space:
2331   //   s_getpc_b64 s[0:1]
2332   //   s_add_u32 s0, s0, $symbol
2333   //   s_addc_u32 s1, s1, 0
2334   //
2335   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2336   //   a fixup or relocation is emitted to replace $symbol with a literal
2337   //   constant, which is a pc-relative offset from the encoding of the $symbol
2338   //   operand to the global variable.
2339   //
2340   // For global address space:
2341   //   s_getpc_b64 s[0:1]
2342   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2343   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2344   //
2345   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2346   //   fixups or relocations are emitted to replace $symbol@*@lo and
2347   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2348   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2349   //   operand to the global variable.
2350   //
2351   // What we want here is an offset from the value returned by s_getpc
2352   // (which is the address of the s_add_u32 instruction) to the global
2353   // variable, but since the encoding of $symbol starts 4 bytes after the start
2354   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2355   // small. This requires us to add 4 to the global variable offset in order to
2356   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2357   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2358   // instruction.
2359
2360   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2361
2362   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2363     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2364
2365   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2366     .addDef(PCReg);
2367
2368   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2369   if (GAFlags == SIInstrInfo::MO_NONE)
2370     MIB.addImm(0);
2371   else
2372     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2373
2374   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2375
2376   if (PtrTy.getSizeInBits() == 32)
2377     B.buildExtract(DstReg, PCReg, 0);
2378   return true;
2379  }
2380
2381 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2382   MachineInstr &MI, MachineRegisterInfo &MRI,
2383   MachineIRBuilder &B) const {
2384   Register DstReg = MI.getOperand(0).getReg();
2385   LLT Ty = MRI.getType(DstReg);
2386   unsigned AS = Ty.getAddressSpace();
2387
2388   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2389   MachineFunction &MF = B.getMF();
2390   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2391
2392   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2393     if (!MFI->isModuleEntryFunction() &&
2394         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2395       const Function &Fn = MF.getFunction();
2396       DiagnosticInfoUnsupported BadLDSDecl(
2397         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2398         DS_Warning);
2399       Fn.getContext().diagnose(BadLDSDecl);
2400
2401       // We currently don't have a way to correctly allocate LDS objects that
2402       // aren't directly associated with a kernel. We do force inlining of
2403       // functions that use local objects. However, if these dead functions are
2404       // not eliminated, we don't want a compile time error. Just emit a warning
2405       // and a trap, since there should be no callable path here.
2406       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2407       B.buildUndef(DstReg);
2408       MI.eraseFromParent();
2409       return true;
2410     }
2411
2412     // TODO: We could emit code to handle the initialization somewhere.
2413     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2414       const SITargetLowering *TLI = ST.getTargetLowering();
2415       if (!TLI->shouldUseLDSConstAddress(GV)) {
2416         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2417         return true; // Leave in place;
2418       }
2419
2420       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2421         Type *Ty = GV->getValueType();
2422         // HIP uses an unsized array `extern __shared__ T s[]` or similar
2423         // zero-sized type in other languages to declare the dynamic shared
2424         // memory which size is not known at the compile time. They will be
2425         // allocated by the runtime and placed directly after the static
2426         // allocated ones. They all share the same offset.
2427         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2428           // Adjust alignment for that dynamic shared memory array.
2429           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2430           LLT S32 = LLT::scalar(32);
2431           auto Sz =
2432               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2433           B.buildIntToPtr(DstReg, Sz);
2434           MI.eraseFromParent();
2435           return true;
2436         }
2437       }
2438
2439       B.buildConstant(
2440           DstReg,
2441           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2442       MI.eraseFromParent();
2443       return true;
2444     }
2445
2446     const Function &Fn = MF.getFunction();
2447     DiagnosticInfoUnsupported BadInit(
2448       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2449     Fn.getContext().diagnose(BadInit);
2450     return true;
2451   }
2452
2453   const SITargetLowering *TLI = ST.getTargetLowering();
2454
2455   if (TLI->shouldEmitFixup(GV)) {
2456     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2457     MI.eraseFromParent();
2458     return true;
2459   }
2460
2461   if (TLI->shouldEmitPCReloc(GV)) {
2462     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2463     MI.eraseFromParent();
2464     return true;
2465   }
2466
2467   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2468   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2469
2470   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2471   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2472       MachinePointerInfo::getGOT(MF),
2473       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2474           MachineMemOperand::MOInvariant,
2475       LoadTy, Align(8));
2476
2477   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2478
2479   if (Ty.getSizeInBits() == 32) {
2480     // Truncate if this is a 32-bit constant adrdess.
2481     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2482     B.buildExtract(DstReg, Load, 0);
2483   } else
2484     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2485
2486   MI.eraseFromParent();
2487   return true;
2488 }
2489
2490 static LLT widenToNextPowerOf2(LLT Ty) {
2491   if (Ty.isVector())
2492     return Ty.changeElementCount(
2493         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2494   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2495 }
2496
2497 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2498                                        MachineInstr &MI) const {
2499   MachineIRBuilder &B = Helper.MIRBuilder;
2500   MachineRegisterInfo &MRI = *B.getMRI();
2501   GISelChangeObserver &Observer = Helper.Observer;
2502
2503   Register PtrReg = MI.getOperand(1).getReg();
2504   LLT PtrTy = MRI.getType(PtrReg);
2505   unsigned AddrSpace = PtrTy.getAddressSpace();
2506
2507   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2508     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2509     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2510     Observer.changingInstr(MI);
2511     MI.getOperand(1).setReg(Cast.getReg(0));
2512     Observer.changedInstr(MI);
2513     return true;
2514   }
2515
2516   if (MI.getOpcode() != AMDGPU::G_LOAD)
2517     return false;
2518
2519   Register ValReg = MI.getOperand(0).getReg();
2520   LLT ValTy = MRI.getType(ValReg);
2521
2522   MachineMemOperand *MMO = *MI.memoperands_begin();
2523   const unsigned ValSize = ValTy.getSizeInBits();
2524   const LLT MemTy = MMO->getMemoryType();
2525   const Align MemAlign = MMO->getAlign();
2526   const unsigned MemSize = MemTy.getSizeInBits();
2527   const unsigned AlignInBits = 8 * MemAlign.value();
2528
2529   // Widen non-power-of-2 loads to the alignment if needed
2530   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2531     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2532
2533     // This was already the correct extending load result type, so just adjust
2534     // the memory type.
2535     if (WideMemSize == ValSize) {
2536       MachineFunction &MF = B.getMF();
2537
2538       MachineMemOperand *WideMMO =
2539           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2540       Observer.changingInstr(MI);
2541       MI.setMemRefs(MF, {WideMMO});
2542       Observer.changedInstr(MI);
2543       return true;
2544     }
2545
2546     // Don't bother handling edge case that should probably never be produced.
2547     if (ValSize > WideMemSize)
2548       return false;
2549
2550     LLT WideTy = widenToNextPowerOf2(ValTy);
2551
2552     Register WideLoad;
2553     if (!WideTy.isVector()) {
2554       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2555       B.buildTrunc(ValReg, WideLoad).getReg(0);
2556     } else {
2557       // Extract the subvector.
2558
2559       if (isRegisterType(ValTy)) {
2560         // If this a case where G_EXTRACT is legal, use it.
2561         // (e.g. <3 x s32> -> <4 x s32>)
2562         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2563         B.buildExtract(ValReg, WideLoad, 0);
2564       } else {
2565         // For cases where the widened type isn't a nice register value, unmerge
2566         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2567         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2568         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2569         B.setInsertPt(B.getMBB(), MI.getIterator());
2570         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2571       }
2572     }
2573
2574     MI.eraseFromParent();
2575     return true;
2576   }
2577
2578   return false;
2579 }
2580
2581 bool AMDGPULegalizerInfo::legalizeFMad(
2582   MachineInstr &MI, MachineRegisterInfo &MRI,
2583   MachineIRBuilder &B) const {
2584   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2585   assert(Ty.isScalar());
2586
2587   MachineFunction &MF = B.getMF();
2588   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2589
2590   // TODO: Always legal with future ftz flag.
2591   // FIXME: Do we need just output?
2592   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2593     return true;
2594   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2595     return true;
2596
2597   MachineIRBuilder HelperBuilder(MI);
2598   GISelObserverWrapper DummyObserver;
2599   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2600   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2601 }
2602
2603 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2604   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2605   Register DstReg = MI.getOperand(0).getReg();
2606   Register PtrReg = MI.getOperand(1).getReg();
2607   Register CmpVal = MI.getOperand(2).getReg();
2608   Register NewVal = MI.getOperand(3).getReg();
2609
2610   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2611          "this should not have been custom lowered");
2612
2613   LLT ValTy = MRI.getType(CmpVal);
2614   LLT VecTy = LLT::fixed_vector(2, ValTy);
2615
2616   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2617
2618   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2619     .addDef(DstReg)
2620     .addUse(PtrReg)
2621     .addUse(PackedVal)
2622     .setMemRefs(MI.memoperands());
2623
2624   MI.eraseFromParent();
2625   return true;
2626 }
2627
2628 bool AMDGPULegalizerInfo::legalizeFlog(
2629   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2630   Register Dst = MI.getOperand(0).getReg();
2631   Register Src = MI.getOperand(1).getReg();
2632   LLT Ty = B.getMRI()->getType(Dst);
2633   unsigned Flags = MI.getFlags();
2634
2635   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2636   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2637
2638   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2639   MI.eraseFromParent();
2640   return true;
2641 }
2642
2643 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2644                                        MachineIRBuilder &B) const {
2645   Register Dst = MI.getOperand(0).getReg();
2646   Register Src = MI.getOperand(1).getReg();
2647   unsigned Flags = MI.getFlags();
2648   LLT Ty = B.getMRI()->getType(Dst);
2649
2650   auto K = B.buildFConstant(Ty, numbers::log2e);
2651   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2652   B.buildFExp2(Dst, Mul, Flags);
2653   MI.eraseFromParent();
2654   return true;
2655 }
2656
2657 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2658                                        MachineIRBuilder &B) const {
2659   Register Dst = MI.getOperand(0).getReg();
2660   Register Src0 = MI.getOperand(1).getReg();
2661   Register Src1 = MI.getOperand(2).getReg();
2662   unsigned Flags = MI.getFlags();
2663   LLT Ty = B.getMRI()->getType(Dst);
2664   const LLT S16 = LLT::scalar(16);
2665   const LLT S32 = LLT::scalar(32);
2666
2667   if (Ty == S32) {
2668     auto Log = B.buildFLog2(S32, Src0, Flags);
2669     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2670       .addUse(Log.getReg(0))
2671       .addUse(Src1)
2672       .setMIFlags(Flags);
2673     B.buildFExp2(Dst, Mul, Flags);
2674   } else if (Ty == S16) {
2675     // There's no f16 fmul_legacy, so we need to convert for it.
2676     auto Log = B.buildFLog2(S16, Src0, Flags);
2677     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2678     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2679     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2680       .addUse(Ext0.getReg(0))
2681       .addUse(Ext1.getReg(0))
2682       .setMIFlags(Flags);
2683
2684     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2685   } else
2686     return false;
2687
2688   MI.eraseFromParent();
2689   return true;
2690 }
2691
2692 // Find a source register, ignoring any possible source modifiers.
2693 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2694   Register ModSrc = OrigSrc;
2695   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2696     ModSrc = SrcFNeg->getOperand(1).getReg();
2697     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2698       ModSrc = SrcFAbs->getOperand(1).getReg();
2699   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2700     ModSrc = SrcFAbs->getOperand(1).getReg();
2701   return ModSrc;
2702 }
2703
2704 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2705                                          MachineRegisterInfo &MRI,
2706                                          MachineIRBuilder &B) const {
2707
2708   const LLT S1 = LLT::scalar(1);
2709   const LLT S64 = LLT::scalar(64);
2710   Register Dst = MI.getOperand(0).getReg();
2711   Register OrigSrc = MI.getOperand(1).getReg();
2712   unsigned Flags = MI.getFlags();
2713   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2714          "this should not have been custom lowered");
2715
2716   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2717   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2718   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2719   // V_FRACT bug is:
2720   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2721   //
2722   // Convert floor(x) to (x - fract(x))
2723
2724   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2725     .addUse(OrigSrc)
2726     .setMIFlags(Flags);
2727
2728   // Give source modifier matching some assistance before obscuring a foldable
2729   // pattern.
2730
2731   // TODO: We can avoid the neg on the fract? The input sign to fract
2732   // shouldn't matter?
2733   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2734
2735   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2736
2737   Register Min = MRI.createGenericVirtualRegister(S64);
2738
2739   // We don't need to concern ourselves with the snan handling difference, so
2740   // use the one which will directly select.
2741   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2742   if (MFI->getMode().IEEE)
2743     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2744   else
2745     B.buildFMinNum(Min, Fract, Const, Flags);
2746
2747   Register CorrectedFract = Min;
2748   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2749     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2750     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2751   }
2752
2753   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2754   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2755
2756   MI.eraseFromParent();
2757   return true;
2758 }
2759
2760 // Turn an illegal packed v2s16 build vector into bit operations.
2761 // TODO: This should probably be a bitcast action in LegalizerHelper.
2762 bool AMDGPULegalizerInfo::legalizeBuildVector(
2763   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2764   Register Dst = MI.getOperand(0).getReg();
2765   const LLT S32 = LLT::scalar(32);
2766   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2767
2768   Register Src0 = MI.getOperand(1).getReg();
2769   Register Src1 = MI.getOperand(2).getReg();
2770   assert(MRI.getType(Src0) == LLT::scalar(16));
2771
2772   auto Merge = B.buildMerge(S32, {Src0, Src1});
2773   B.buildBitcast(Dst, Merge);
2774
2775   MI.eraseFromParent();
2776   return true;
2777 }
2778
2779 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2780 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2781 // case with a single min instruction instead of a compare+select.
2782 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2783                                             MachineRegisterInfo &MRI,
2784                                             MachineIRBuilder &B) const {
2785   Register Dst = MI.getOperand(0).getReg();
2786   Register Src = MI.getOperand(1).getReg();
2787   LLT DstTy = MRI.getType(Dst);
2788   LLT SrcTy = MRI.getType(Src);
2789
2790   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2791                         ? AMDGPU::G_AMDGPU_FFBH_U32
2792                         : AMDGPU::G_AMDGPU_FFBL_B32;
2793   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2794   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2795
2796   MI.eraseFromParent();
2797   return true;
2798 }
2799
2800 // Check that this is a G_XOR x, -1
2801 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2802   if (MI.getOpcode() != TargetOpcode::G_XOR)
2803     return false;
2804   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2805   return ConstVal && *ConstVal == -1;
2806 }
2807
2808 // Return the use branch instruction, otherwise null if the usage is invalid.
2809 static MachineInstr *
2810 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2811                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2812   Register CondDef = MI.getOperand(0).getReg();
2813   if (!MRI.hasOneNonDBGUse(CondDef))
2814     return nullptr;
2815
2816   MachineBasicBlock *Parent = MI.getParent();
2817   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2818
2819   if (isNot(MRI, *UseMI)) {
2820     Register NegatedCond = UseMI->getOperand(0).getReg();
2821     if (!MRI.hasOneNonDBGUse(NegatedCond))
2822       return nullptr;
2823
2824     // We're deleting the def of this value, so we need to remove it.
2825     UseMI->eraseFromParent();
2826
2827     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2828     Negated = true;
2829   }
2830
2831   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2832     return nullptr;
2833
2834   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2835   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2836   if (Next == Parent->end()) {
2837     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2838     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2839       return nullptr;
2840     UncondBrTarget = &*NextMBB;
2841   } else {
2842     if (Next->getOpcode() != AMDGPU::G_BR)
2843       return nullptr;
2844     Br = &*Next;
2845     UncondBrTarget = Br->getOperand(0).getMBB();
2846   }
2847
2848   return UseMI;
2849 }
2850
2851 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2852                                          const ArgDescriptor *Arg,
2853                                          const TargetRegisterClass *ArgRC,
2854                                          LLT ArgTy) const {
2855   MCRegister SrcReg = Arg->getRegister();
2856   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2857   assert(DstReg.isVirtual() && "Virtual register expected");
2858
2859   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2860                                              ArgTy);
2861   if (Arg->isMasked()) {
2862     // TODO: Should we try to emit this once in the entry block?
2863     const LLT S32 = LLT::scalar(32);
2864     const unsigned Mask = Arg->getMask();
2865     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2866
2867     Register AndMaskSrc = LiveIn;
2868
2869     if (Shift != 0) {
2870       auto ShiftAmt = B.buildConstant(S32, Shift);
2871       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2872     }
2873
2874     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2875   } else {
2876     B.buildCopy(DstReg, LiveIn);
2877   }
2878
2879   return true;
2880 }
2881
2882 bool AMDGPULegalizerInfo::loadInputValue(
2883     Register DstReg, MachineIRBuilder &B,
2884     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2885   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2886   const ArgDescriptor *Arg;
2887   const TargetRegisterClass *ArgRC;
2888   LLT ArgTy;
2889   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2890
2891   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2892     return false; // TODO: Handle these
2893   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2894 }
2895
2896 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2897     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2898     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2899   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2900     return false;
2901
2902   MI.eraseFromParent();
2903   return true;
2904 }
2905
2906 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2907                                        MachineRegisterInfo &MRI,
2908                                        MachineIRBuilder &B) const {
2909   Register Dst = MI.getOperand(0).getReg();
2910   LLT DstTy = MRI.getType(Dst);
2911   LLT S16 = LLT::scalar(16);
2912   LLT S32 = LLT::scalar(32);
2913   LLT S64 = LLT::scalar(64);
2914
2915   if (DstTy == S16)
2916     return legalizeFDIV16(MI, MRI, B);
2917   if (DstTy == S32)
2918     return legalizeFDIV32(MI, MRI, B);
2919   if (DstTy == S64)
2920     return legalizeFDIV64(MI, MRI, B);
2921
2922   return false;
2923 }
2924
2925 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2926                                                         Register DstDivReg,
2927                                                         Register DstRemReg,
2928                                                         Register X,
2929                                                         Register Y) const {
2930   const LLT S1 = LLT::scalar(1);
2931   const LLT S32 = LLT::scalar(32);
2932
2933   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2934   // algorithm used here.
2935
2936   // Initial estimate of inv(y).
2937   auto FloatY = B.buildUITOFP(S32, Y);
2938   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2939   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2940   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2941   auto Z = B.buildFPTOUI(S32, ScaledY);
2942
2943   // One round of UNR.
2944   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2945   auto NegYZ = B.buildMul(S32, NegY, Z);
2946   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2947
2948   // Quotient/remainder estimate.
2949   auto Q = B.buildUMulH(S32, X, Z);
2950   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2951
2952   // First quotient/remainder refinement.
2953   auto One = B.buildConstant(S32, 1);
2954   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2955   if (DstDivReg)
2956     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2957   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2958
2959   // Second quotient/remainder refinement.
2960   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2961   if (DstDivReg)
2962     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2963
2964   if (DstRemReg)
2965     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2966 }
2967
2968 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32
2969 //
2970 // Return lo, hi of result
2971 //
2972 // %cvt.lo = G_UITOFP Val.lo
2973 // %cvt.hi = G_UITOFP Val.hi
2974 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2975 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2976 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2977 // %mul2 = G_FMUL %mul1, 2**(-32)
2978 // %trunc = G_INTRINSIC_TRUNC %mul2
2979 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2980 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2981 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2982                                                        Register Val) {
2983   const LLT S32 = LLT::scalar(32);
2984   auto Unmerge = B.buildUnmerge(S32, Val);
2985
2986   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2987   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2988
2989   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2990                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2991
2992   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2993   auto Mul1 =
2994       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2995
2996   // 2**(-32)
2997   auto Mul2 =
2998       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2999   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3000
3001   // -(2**32)
3002   auto Mad2 = B.buildFMAD(S32, Trunc,
3003                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3004
3005   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3006   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3007
3008   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3009 }
3010
3011 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3012                                                         Register DstDivReg,
3013                                                         Register DstRemReg,
3014                                                         Register Numer,
3015                                                         Register Denom) const {
3016   const LLT S32 = LLT::scalar(32);
3017   const LLT S64 = LLT::scalar(64);
3018   const LLT S1 = LLT::scalar(1);
3019   Register RcpLo, RcpHi;
3020
3021   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3022
3023   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3024
3025   auto Zero64 = B.buildConstant(S64, 0);
3026   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3027
3028   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3029   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3030
3031   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3032   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3033   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3034
3035   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3036   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3037   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
3038   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3039
3040   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3041   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3042   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3043   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3044   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3045
3046   auto Zero32 = B.buildConstant(S32, 0);
3047   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3048   auto Add2_HiC =
3049       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
3050   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
3051   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3052
3053   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3054   Register NumerLo = UnmergeNumer.getReg(0);
3055   Register NumerHi = UnmergeNumer.getReg(1);
3056
3057   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3058   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3059   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3060   Register Mul3_Lo = UnmergeMul3.getReg(0);
3061   Register Mul3_Hi = UnmergeMul3.getReg(1);
3062   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3063   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3064   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3065   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3066
3067   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3068   Register DenomLo = UnmergeDenom.getReg(0);
3069   Register DenomHi = UnmergeDenom.getReg(1);
3070
3071   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3072   auto C1 = B.buildSExt(S32, CmpHi);
3073
3074   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3075   auto C2 = B.buildSExt(S32, CmpLo);
3076
3077   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3078   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3079
3080   // TODO: Here and below portions of the code can be enclosed into if/endif.
3081   // Currently control flow is unconditional and we have 4 selects after
3082   // potential endif to substitute PHIs.
3083
3084   // if C3 != 0 ...
3085   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3086   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3087   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3088   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3089
3090   auto One64 = B.buildConstant(S64, 1);
3091   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3092
3093   auto C4 =
3094       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3095   auto C5 =
3096       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3097   auto C6 = B.buildSelect(
3098       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3099
3100   // if (C6 != 0)
3101   auto Add4 = B.buildAdd(S64, Add3, One64);
3102   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3103
3104   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3105   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3106   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3107
3108   // endif C6
3109   // endif C3
3110
3111   if (DstDivReg) {
3112     auto Sel1 = B.buildSelect(
3113         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3114     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3115                   Sel1, MulHi3);
3116   }
3117
3118   if (DstRemReg) {
3119     auto Sel2 = B.buildSelect(
3120         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3121     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3122                   Sel2, Sub1);
3123   }
3124 }
3125
3126 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3127                                                   MachineRegisterInfo &MRI,
3128                                                   MachineIRBuilder &B) const {
3129   Register DstDivReg, DstRemReg;
3130   switch (MI.getOpcode()) {
3131   default:
3132     llvm_unreachable("Unexpected opcode!");
3133   case AMDGPU::G_UDIV: {
3134     DstDivReg = MI.getOperand(0).getReg();
3135     break;
3136   }
3137   case AMDGPU::G_UREM: {
3138     DstRemReg = MI.getOperand(0).getReg();
3139     break;
3140   }
3141   case AMDGPU::G_UDIVREM: {
3142     DstDivReg = MI.getOperand(0).getReg();
3143     DstRemReg = MI.getOperand(1).getReg();
3144     break;
3145   }
3146   }
3147
3148   const LLT S64 = LLT::scalar(64);
3149   const LLT S32 = LLT::scalar(32);
3150   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3151   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3152   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3153   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3154
3155   if (Ty == S32)
3156     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3157   else if (Ty == S64)
3158     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3159   else
3160     return false;
3161
3162   MI.eraseFromParent();
3163   return true;
3164 }
3165
3166 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3167                                                 MachineRegisterInfo &MRI,
3168                                                 MachineIRBuilder &B) const {
3169   const LLT S64 = LLT::scalar(64);
3170   const LLT S32 = LLT::scalar(32);
3171
3172   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3173   if (Ty != S32 && Ty != S64)
3174     return false;
3175
3176   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3177   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3178   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3179
3180   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3181   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3182   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3183
3184   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3185   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3186
3187   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3188   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3189
3190   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3191   switch (MI.getOpcode()) {
3192   default:
3193     llvm_unreachable("Unexpected opcode!");
3194   case AMDGPU::G_SDIV: {
3195     DstDivReg = MI.getOperand(0).getReg();
3196     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3197     break;
3198   }
3199   case AMDGPU::G_SREM: {
3200     DstRemReg = MI.getOperand(0).getReg();
3201     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3202     break;
3203   }
3204   case AMDGPU::G_SDIVREM: {
3205     DstDivReg = MI.getOperand(0).getReg();
3206     DstRemReg = MI.getOperand(1).getReg();
3207     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3208     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3209     break;
3210   }
3211   }
3212
3213   if (Ty == S32)
3214     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3215   else
3216     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3217
3218   if (DstDivReg) {
3219     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3220     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3221     B.buildSub(DstDivReg, SignXor, Sign);
3222   }
3223
3224   if (DstRemReg) {
3225     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3226     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3227     B.buildSub(DstRemReg, SignXor, Sign);
3228   }
3229
3230   MI.eraseFromParent();
3231   return true;
3232 }
3233
3234 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3235                                                  MachineRegisterInfo &MRI,
3236                                                  MachineIRBuilder &B) const {
3237   Register Res = MI.getOperand(0).getReg();
3238   Register LHS = MI.getOperand(1).getReg();
3239   Register RHS = MI.getOperand(2).getReg();
3240   uint16_t Flags = MI.getFlags();
3241   LLT ResTy = MRI.getType(Res);
3242
3243   const MachineFunction &MF = B.getMF();
3244   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3245                             MI.getFlag(MachineInstr::FmAfn);
3246
3247   if (!AllowInaccurateRcp)
3248     return false;
3249
3250   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3251     // 1 / x -> RCP(x)
3252     if (CLHS->isExactlyValue(1.0)) {
3253       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3254         .addUse(RHS)
3255         .setMIFlags(Flags);
3256
3257       MI.eraseFromParent();
3258       return true;
3259     }
3260
3261     // -1 / x -> RCP( FNEG(x) )
3262     if (CLHS->isExactlyValue(-1.0)) {
3263       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3264       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3265         .addUse(FNeg.getReg(0))
3266         .setMIFlags(Flags);
3267
3268       MI.eraseFromParent();
3269       return true;
3270     }
3271   }
3272
3273   // x / y -> x * (1.0 / y)
3274   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3275     .addUse(RHS)
3276     .setMIFlags(Flags);
3277   B.buildFMul(Res, LHS, RCP, Flags);
3278
3279   MI.eraseFromParent();
3280   return true;
3281 }
3282
3283 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3284                                                    MachineRegisterInfo &MRI,
3285                                                    MachineIRBuilder &B) const {
3286   Register Res = MI.getOperand(0).getReg();
3287   Register X = MI.getOperand(1).getReg();
3288   Register Y = MI.getOperand(2).getReg();
3289   uint16_t Flags = MI.getFlags();
3290   LLT ResTy = MRI.getType(Res);
3291
3292   const MachineFunction &MF = B.getMF();
3293   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3294                             MI.getFlag(MachineInstr::FmAfn);
3295
3296   if (!AllowInaccurateRcp)
3297     return false;
3298
3299   auto NegY = B.buildFNeg(ResTy, Y);
3300   auto One = B.buildFConstant(ResTy, 1.0);
3301
3302   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3303     .addUse(Y)
3304     .setMIFlags(Flags);
3305
3306   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3307   R = B.buildFMA(ResTy, Tmp0, R, R);
3308
3309   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3310   R = B.buildFMA(ResTy, Tmp1, R, R);
3311
3312   auto Ret = B.buildFMul(ResTy, X, R);
3313   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3314
3315   B.buildFMA(Res, Tmp2, R, Ret);
3316   MI.eraseFromParent();
3317   return true;
3318 }
3319
3320 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3321                                          MachineRegisterInfo &MRI,
3322                                          MachineIRBuilder &B) const {
3323   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3324     return true;
3325
3326   Register Res = MI.getOperand(0).getReg();
3327   Register LHS = MI.getOperand(1).getReg();
3328   Register RHS = MI.getOperand(2).getReg();
3329
3330   uint16_t Flags = MI.getFlags();
3331
3332   LLT S16 = LLT::scalar(16);
3333   LLT S32 = LLT::scalar(32);
3334
3335   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3336   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3337
3338   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3339     .addUse(RHSExt.getReg(0))
3340     .setMIFlags(Flags);
3341
3342   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3343   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3344
3345   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3346     .addUse(RDst.getReg(0))
3347     .addUse(RHS)
3348     .addUse(LHS)
3349     .setMIFlags(Flags);
3350
3351   MI.eraseFromParent();
3352   return true;
3353 }
3354
3355 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3356 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3357 static void toggleSPDenormMode(bool Enable,
3358                                MachineIRBuilder &B,
3359                                const GCNSubtarget &ST,
3360                                AMDGPU::SIModeRegisterDefaults Mode) {
3361   // Set SP denorm mode to this value.
3362   unsigned SPDenormMode =
3363     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3364
3365   if (ST.hasDenormModeInst()) {
3366     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3367     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3368
3369     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3370     B.buildInstr(AMDGPU::S_DENORM_MODE)
3371       .addImm(NewDenormModeValue);
3372
3373   } else {
3374     // Select FP32 bit field in mode register.
3375     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3376                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3377                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3378
3379     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3380       .addImm(SPDenormMode)
3381       .addImm(SPDenormModeBitField);
3382   }
3383 }
3384
3385 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3386                                          MachineRegisterInfo &MRI,
3387                                          MachineIRBuilder &B) const {
3388   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3389     return true;
3390
3391   Register Res = MI.getOperand(0).getReg();
3392   Register LHS = MI.getOperand(1).getReg();
3393   Register RHS = MI.getOperand(2).getReg();
3394   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3395   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3396
3397   uint16_t Flags = MI.getFlags();
3398
3399   LLT S32 = LLT::scalar(32);
3400   LLT S1 = LLT::scalar(1);
3401
3402   auto One = B.buildFConstant(S32, 1.0f);
3403
3404   auto DenominatorScaled =
3405     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3406       .addUse(LHS)
3407       .addUse(RHS)
3408       .addImm(0)
3409       .setMIFlags(Flags);
3410   auto NumeratorScaled =
3411     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3412       .addUse(LHS)
3413       .addUse(RHS)
3414       .addImm(1)
3415       .setMIFlags(Flags);
3416
3417   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3418     .addUse(DenominatorScaled.getReg(0))
3419     .setMIFlags(Flags);
3420   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3421
3422   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3423   // aren't modeled as reading it.
3424   if (!Mode.allFP32Denormals())
3425     toggleSPDenormMode(true, B, ST, Mode);
3426
3427   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3428   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3429   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3430   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3431   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3432   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3433
3434   if (!Mode.allFP32Denormals())
3435     toggleSPDenormMode(false, B, ST, Mode);
3436
3437   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3438     .addUse(Fma4.getReg(0))
3439     .addUse(Fma1.getReg(0))
3440     .addUse(Fma3.getReg(0))
3441     .addUse(NumeratorScaled.getReg(1))
3442     .setMIFlags(Flags);
3443
3444   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3445     .addUse(Fmas.getReg(0))
3446     .addUse(RHS)
3447     .addUse(LHS)
3448     .setMIFlags(Flags);
3449
3450   MI.eraseFromParent();
3451   return true;
3452 }
3453
3454 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3455                                          MachineRegisterInfo &MRI,
3456                                          MachineIRBuilder &B) const {
3457   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3458     return true;
3459
3460   Register Res = MI.getOperand(0).getReg();
3461   Register LHS = MI.getOperand(1).getReg();
3462   Register RHS = MI.getOperand(2).getReg();
3463
3464   uint16_t Flags = MI.getFlags();
3465
3466   LLT S64 = LLT::scalar(64);
3467   LLT S1 = LLT::scalar(1);
3468
3469   auto One = B.buildFConstant(S64, 1.0);
3470
3471   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3472     .addUse(LHS)
3473     .addUse(RHS)
3474     .addImm(0)
3475     .setMIFlags(Flags);
3476
3477   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3478
3479   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3480     .addUse(DivScale0.getReg(0))
3481     .setMIFlags(Flags);
3482
3483   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3484   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3485   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3486
3487   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3488     .addUse(LHS)
3489     .addUse(RHS)
3490     .addImm(1)
3491     .setMIFlags(Flags);
3492
3493   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3494   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3495   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3496
3497   Register Scale;
3498   if (!ST.hasUsableDivScaleConditionOutput()) {
3499     // Workaround a hardware bug on SI where the condition output from div_scale
3500     // is not usable.
3501
3502     LLT S32 = LLT::scalar(32);
3503
3504     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3505     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3506     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3507     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3508
3509     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3510                               Scale1Unmerge.getReg(1));
3511     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3512                               Scale0Unmerge.getReg(1));
3513     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3514   } else {
3515     Scale = DivScale1.getReg(1);
3516   }
3517
3518   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3519     .addUse(Fma4.getReg(0))
3520     .addUse(Fma3.getReg(0))
3521     .addUse(Mul.getReg(0))
3522     .addUse(Scale)
3523     .setMIFlags(Flags);
3524
3525   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3526     .addUse(Fmas.getReg(0))
3527     .addUse(RHS)
3528     .addUse(LHS)
3529     .setMIFlags(Flags);
3530
3531   MI.eraseFromParent();
3532   return true;
3533 }
3534
3535 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3536                                                  MachineRegisterInfo &MRI,
3537                                                  MachineIRBuilder &B) const {
3538   Register Res = MI.getOperand(0).getReg();
3539   Register LHS = MI.getOperand(2).getReg();
3540   Register RHS = MI.getOperand(3).getReg();
3541   uint16_t Flags = MI.getFlags();
3542
3543   LLT S32 = LLT::scalar(32);
3544   LLT S1 = LLT::scalar(1);
3545
3546   auto Abs = B.buildFAbs(S32, RHS, Flags);
3547   const APFloat C0Val(1.0f);
3548
3549   auto C0 = B.buildConstant(S32, 0x6f800000);
3550   auto C1 = B.buildConstant(S32, 0x2f800000);
3551   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3552
3553   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3554   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3555
3556   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3557
3558   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3559     .addUse(Mul0.getReg(0))
3560     .setMIFlags(Flags);
3561
3562   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3563
3564   B.buildFMul(Res, Sel, Mul1, Flags);
3565
3566   MI.eraseFromParent();
3567   return true;
3568 }
3569
3570 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3571 // FIXME: Why do we handle this one but not other removed instructions?
3572 //
3573 // Reciprocal square root.  The clamp prevents infinite results, clamping
3574 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3575 // +-max_float.
3576 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3577                                                     MachineRegisterInfo &MRI,
3578                                                     MachineIRBuilder &B) const {
3579   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3580     return true;
3581
3582   Register Dst = MI.getOperand(0).getReg();
3583   Register Src = MI.getOperand(2).getReg();
3584   auto Flags = MI.getFlags();
3585
3586   LLT Ty = MRI.getType(Dst);
3587
3588   const fltSemantics *FltSemantics;
3589   if (Ty == LLT::scalar(32))
3590     FltSemantics = &APFloat::IEEEsingle();
3591   else if (Ty == LLT::scalar(64))
3592     FltSemantics = &APFloat::IEEEdouble();
3593   else
3594     return false;
3595
3596   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3597     .addUse(Src)
3598     .setMIFlags(Flags);
3599
3600   // We don't need to concern ourselves with the snan handling difference, since
3601   // the rsq quieted (or not) so use the one which will directly select.
3602   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3603   const bool UseIEEE = MFI->getMode().IEEE;
3604
3605   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3606   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3607                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3608
3609   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3610
3611   if (UseIEEE)
3612     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3613   else
3614     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3615   MI.eraseFromParent();
3616   return true;
3617 }
3618
3619 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3620   switch (IID) {
3621   case Intrinsic::amdgcn_ds_fadd:
3622     return AMDGPU::G_ATOMICRMW_FADD;
3623   case Intrinsic::amdgcn_ds_fmin:
3624     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3625   case Intrinsic::amdgcn_ds_fmax:
3626     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3627   default:
3628     llvm_unreachable("not a DS FP intrinsic");
3629   }
3630 }
3631
3632 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3633                                                       MachineInstr &MI,
3634                                                       Intrinsic::ID IID) const {
3635   GISelChangeObserver &Observer = Helper.Observer;
3636   Observer.changingInstr(MI);
3637
3638   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3639
3640   // The remaining operands were used to set fields in the MemOperand on
3641   // construction.
3642   for (int I = 6; I > 3; --I)
3643     MI.RemoveOperand(I);
3644
3645   MI.RemoveOperand(1); // Remove the intrinsic ID.
3646   Observer.changedInstr(MI);
3647   return true;
3648 }
3649
3650 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3651                                             MachineRegisterInfo &MRI,
3652                                             MachineIRBuilder &B) const {
3653   uint64_t Offset =
3654     ST.getTargetLowering()->getImplicitParameterOffset(
3655       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3656   LLT DstTy = MRI.getType(DstReg);
3657   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3658
3659   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3660   if (!loadInputValue(KernargPtrReg, B,
3661                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3662     return false;
3663
3664   // FIXME: This should be nuw
3665   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3666   return true;
3667 }
3668
3669 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3670                                                  MachineRegisterInfo &MRI,
3671                                                  MachineIRBuilder &B) const {
3672   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3673   if (!MFI->isEntryFunction()) {
3674     return legalizePreloadedArgIntrin(MI, MRI, B,
3675                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3676   }
3677
3678   Register DstReg = MI.getOperand(0).getReg();
3679   if (!getImplicitArgPtr(DstReg, MRI, B))
3680     return false;
3681
3682   MI.eraseFromParent();
3683   return true;
3684 }
3685
3686 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3687                                               MachineRegisterInfo &MRI,
3688                                               MachineIRBuilder &B,
3689                                               unsigned AddrSpace) const {
3690   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3691   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3692   Register Hi32 = Unmerge.getReg(1);
3693
3694   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3695   MI.eraseFromParent();
3696   return true;
3697 }
3698
3699 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3700 // offset (the offset that is included in bounds checking and swizzling, to be
3701 // split between the instruction's voffset and immoffset fields) and soffset
3702 // (the offset that is excluded from bounds checking and swizzling, to go in
3703 // the instruction's soffset field).  This function takes the first kind of
3704 // offset and figures out how to split it between voffset and immoffset.
3705 std::pair<Register, unsigned>
3706 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3707                                         Register OrigOffset) const {
3708   const unsigned MaxImm = 4095;
3709   Register BaseReg;
3710   unsigned ImmOffset;
3711   const LLT S32 = LLT::scalar(32);
3712   MachineRegisterInfo &MRI = *B.getMRI();
3713
3714   std::tie(BaseReg, ImmOffset) =
3715       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3716
3717   // If BaseReg is a pointer, convert it to int.
3718   if (MRI.getType(BaseReg).isPointer())
3719     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3720
3721   // If the immediate value is too big for the immoffset field, put the value
3722   // and -4096 into the immoffset field so that the value that is copied/added
3723   // for the voffset field is a multiple of 4096, and it stands more chance
3724   // of being CSEd with the copy/add for another similar load/store.
3725   // However, do not do that rounding down to a multiple of 4096 if that is a
3726   // negative number, as it appears to be illegal to have a negative offset
3727   // in the vgpr, even if adding the immediate offset makes it positive.
3728   unsigned Overflow = ImmOffset & ~MaxImm;
3729   ImmOffset -= Overflow;
3730   if ((int32_t)Overflow < 0) {
3731     Overflow += ImmOffset;
3732     ImmOffset = 0;
3733   }
3734
3735   if (Overflow != 0) {
3736     if (!BaseReg) {
3737       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3738     } else {
3739       auto OverflowVal = B.buildConstant(S32, Overflow);
3740       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3741     }
3742   }
3743
3744   if (!BaseReg)
3745     BaseReg = B.buildConstant(S32, 0).getReg(0);
3746
3747   return std::make_pair(BaseReg, ImmOffset);
3748 }
3749
3750 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3751 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3752                                           Register VOffset, Register SOffset,
3753                                           unsigned ImmOffset, Register VIndex,
3754                                           MachineRegisterInfo &MRI) const {
3755   Optional<ValueAndVReg> MaybeVOffsetVal =
3756       getConstantVRegValWithLookThrough(VOffset, MRI);
3757   Optional<ValueAndVReg> MaybeSOffsetVal =
3758       getConstantVRegValWithLookThrough(SOffset, MRI);
3759   Optional<ValueAndVReg> MaybeVIndexVal =
3760       getConstantVRegValWithLookThrough(VIndex, MRI);
3761   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3762   // update the MMO with that offset. The stride is unknown so we can only do
3763   // this if VIndex is constant 0.
3764   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3765       MaybeVIndexVal->Value == 0) {
3766     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3767                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3768     MMO->setOffset(TotalOffset);
3769   } else {
3770     // We don't have a constant combined offset to use in the MMO. Give up.
3771     MMO->setValue((Value *)nullptr);
3772   }
3773 }
3774
3775 /// Handle register layout difference for f16 images for some subtargets.
3776 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3777                                              MachineRegisterInfo &MRI,
3778                                              Register Reg,
3779                                              bool ImageStore) const {
3780   const LLT S16 = LLT::scalar(16);
3781   const LLT S32 = LLT::scalar(32);
3782   LLT StoreVT = MRI.getType(Reg);
3783   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3784
3785   if (ST.hasUnpackedD16VMem()) {
3786     auto Unmerge = B.buildUnmerge(S16, Reg);
3787
3788     SmallVector<Register, 4> WideRegs;
3789     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3790       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3791
3792     int NumElts = StoreVT.getNumElements();
3793
3794     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3795         .getReg(0);
3796   }
3797
3798   if (ImageStore && ST.hasImageStoreD16Bug()) {
3799     if (StoreVT.getNumElements() == 2) {
3800       SmallVector<Register, 4> PackedRegs;
3801       Reg = B.buildBitcast(S32, Reg).getReg(0);
3802       PackedRegs.push_back(Reg);
3803       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3804       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3805           .getReg(0);
3806     }
3807
3808     if (StoreVT.getNumElements() == 3) {
3809       SmallVector<Register, 4> PackedRegs;
3810       auto Unmerge = B.buildUnmerge(S16, Reg);
3811       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3812         PackedRegs.push_back(Unmerge.getReg(I));
3813       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3814       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3815       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3816     }
3817
3818     if (StoreVT.getNumElements() == 4) {
3819       SmallVector<Register, 4> PackedRegs;
3820       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3821       auto Unmerge = B.buildUnmerge(S32, Reg);
3822       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3823         PackedRegs.push_back(Unmerge.getReg(I));
3824       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3825       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3826           .getReg(0);
3827     }
3828
3829     llvm_unreachable("invalid data type");
3830   }
3831
3832   return Reg;
3833 }
3834
3835 Register AMDGPULegalizerInfo::fixStoreSourceType(
3836   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3837   MachineRegisterInfo *MRI = B.getMRI();
3838   LLT Ty = MRI->getType(VData);
3839
3840   const LLT S16 = LLT::scalar(16);
3841
3842   // Fixup illegal register types for i8 stores.
3843   if (Ty == LLT::scalar(8) || Ty == S16) {
3844     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3845     return AnyExt;
3846   }
3847
3848   if (Ty.isVector()) {
3849     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3850       if (IsFormat)
3851         return handleD16VData(B, *MRI, VData);
3852     }
3853   }
3854
3855   return VData;
3856 }
3857
3858 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3859                                               MachineRegisterInfo &MRI,
3860                                               MachineIRBuilder &B,
3861                                               bool IsTyped,
3862                                               bool IsFormat) const {
3863   Register VData = MI.getOperand(1).getReg();
3864   LLT Ty = MRI.getType(VData);
3865   LLT EltTy = Ty.getScalarType();
3866   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3867   const LLT S32 = LLT::scalar(32);
3868
3869   VData = fixStoreSourceType(B, VData, IsFormat);
3870   Register RSrc = MI.getOperand(2).getReg();
3871
3872   MachineMemOperand *MMO = *MI.memoperands_begin();
3873   const int MemSize = MMO->getSize();
3874
3875   unsigned ImmOffset;
3876
3877   // The typed intrinsics add an immediate after the registers.
3878   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3879
3880   // The struct intrinsic variants add one additional operand over raw.
3881   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3882   Register VIndex;
3883   int OpOffset = 0;
3884   if (HasVIndex) {
3885     VIndex = MI.getOperand(3).getReg();
3886     OpOffset = 1;
3887   } else {
3888     VIndex = B.buildConstant(S32, 0).getReg(0);
3889   }
3890
3891   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3892   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3893
3894   unsigned Format = 0;
3895   if (IsTyped) {
3896     Format = MI.getOperand(5 + OpOffset).getImm();
3897     ++OpOffset;
3898   }
3899
3900   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3901
3902   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3903   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3904
3905   unsigned Opc;
3906   if (IsTyped) {
3907     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3908                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3909   } else if (IsFormat) {
3910     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3911                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3912   } else {
3913     switch (MemSize) {
3914     case 1:
3915       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3916       break;
3917     case 2:
3918       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3919       break;
3920     default:
3921       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3922       break;
3923     }
3924   }
3925
3926   auto MIB = B.buildInstr(Opc)
3927     .addUse(VData)              // vdata
3928     .addUse(RSrc)               // rsrc
3929     .addUse(VIndex)             // vindex
3930     .addUse(VOffset)            // voffset
3931     .addUse(SOffset)            // soffset
3932     .addImm(ImmOffset);         // offset(imm)
3933
3934   if (IsTyped)
3935     MIB.addImm(Format);
3936
3937   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3938      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3939      .addMemOperand(MMO);
3940
3941   MI.eraseFromParent();
3942   return true;
3943 }
3944
3945 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3946                                              MachineRegisterInfo &MRI,
3947                                              MachineIRBuilder &B,
3948                                              bool IsFormat,
3949                                              bool IsTyped) const {
3950   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3951   MachineMemOperand *MMO = *MI.memoperands_begin();
3952   const LLT MemTy = MMO->getMemoryType();
3953   const LLT S32 = LLT::scalar(32);
3954
3955   Register Dst = MI.getOperand(0).getReg();
3956   Register RSrc = MI.getOperand(2).getReg();
3957
3958   // The typed intrinsics add an immediate after the registers.
3959   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3960
3961   // The struct intrinsic variants add one additional operand over raw.
3962   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3963   Register VIndex;
3964   int OpOffset = 0;
3965   if (HasVIndex) {
3966     VIndex = MI.getOperand(3).getReg();
3967     OpOffset = 1;
3968   } else {
3969     VIndex = B.buildConstant(S32, 0).getReg(0);
3970   }
3971
3972   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3973   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3974
3975   unsigned Format = 0;
3976   if (IsTyped) {
3977     Format = MI.getOperand(5 + OpOffset).getImm();
3978     ++OpOffset;
3979   }
3980
3981   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3982   unsigned ImmOffset;
3983
3984   LLT Ty = MRI.getType(Dst);
3985   LLT EltTy = Ty.getScalarType();
3986   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3987   const bool Unpacked = ST.hasUnpackedD16VMem();
3988
3989   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3990   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3991
3992   unsigned Opc;
3993
3994   if (IsTyped) {
3995     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3996                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3997   } else if (IsFormat) {
3998     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3999                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4000   } else {
4001     switch (MemTy.getSizeInBits()) {
4002     case 8:
4003       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4004       break;
4005     case 16:
4006       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4007       break;
4008     default:
4009       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4010       break;
4011     }
4012   }
4013
4014   Register LoadDstReg;
4015
4016   bool IsExtLoad =
4017       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4018   LLT UnpackedTy = Ty.changeElementSize(32);
4019
4020   if (IsExtLoad)
4021     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4022   else if (Unpacked && IsD16 && Ty.isVector())
4023     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4024   else
4025     LoadDstReg = Dst;
4026
4027   auto MIB = B.buildInstr(Opc)
4028     .addDef(LoadDstReg)         // vdata
4029     .addUse(RSrc)               // rsrc
4030     .addUse(VIndex)             // vindex
4031     .addUse(VOffset)            // voffset
4032     .addUse(SOffset)            // soffset
4033     .addImm(ImmOffset);         // offset(imm)
4034
4035   if (IsTyped)
4036     MIB.addImm(Format);
4037
4038   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4039      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4040      .addMemOperand(MMO);
4041
4042   if (LoadDstReg != Dst) {
4043     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4044
4045     // Widen result for extending loads was widened.
4046     if (IsExtLoad)
4047       B.buildTrunc(Dst, LoadDstReg);
4048     else {
4049       // Repack to original 16-bit vector result
4050       // FIXME: G_TRUNC should work, but legalization currently fails
4051       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4052       SmallVector<Register, 4> Repack;
4053       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4054         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4055       B.buildMerge(Dst, Repack);
4056     }
4057   }
4058
4059   MI.eraseFromParent();
4060   return true;
4061 }
4062
4063 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4064                                                MachineIRBuilder &B,
4065                                                bool IsInc) const {
4066   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4067                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4068   B.buildInstr(Opc)
4069     .addDef(MI.getOperand(0).getReg())
4070     .addUse(MI.getOperand(2).getReg())
4071     .addUse(MI.getOperand(3).getReg())
4072     .cloneMemRefs(MI);
4073   MI.eraseFromParent();
4074   return true;
4075 }
4076
4077 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4078   switch (IntrID) {
4079   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4080   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4081     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4082   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4083   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4084     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4085   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4086   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4087     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4088   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4089   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4090     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4091   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4092   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4093     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4094   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4095   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4096     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4097   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4098   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4099     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4100   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4101   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4102     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4103   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4104   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4105     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4106   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4107   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4108     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4109   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4110   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4111     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4112   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4113   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4114     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4115   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4116   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4117     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4118   case Intrinsic::amdgcn_buffer_atomic_fadd:
4119   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4120   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4121     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4122   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4123   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4124     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4125   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4126   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4127     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4128   default:
4129     llvm_unreachable("unhandled atomic opcode");
4130   }
4131 }
4132
4133 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4134                                                MachineIRBuilder &B,
4135                                                Intrinsic::ID IID) const {
4136   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4137                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4138   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4139
4140   Register Dst;
4141
4142   int OpOffset = 0;
4143   if (HasReturn) {
4144     // A few FP atomics do not support return values.
4145     Dst = MI.getOperand(0).getReg();
4146   } else {
4147     OpOffset = -1;
4148   }
4149
4150   Register VData = MI.getOperand(2 + OpOffset).getReg();
4151   Register CmpVal;
4152
4153   if (IsCmpSwap) {
4154     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4155     ++OpOffset;
4156   }
4157
4158   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4159   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4160
4161   // The struct intrinsic variants add one additional operand over raw.
4162   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4163   Register VIndex;
4164   if (HasVIndex) {
4165     VIndex = MI.getOperand(4 + OpOffset).getReg();
4166     ++OpOffset;
4167   } else {
4168     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4169   }
4170
4171   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4172   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4173   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4174
4175   MachineMemOperand *MMO = *MI.memoperands_begin();
4176
4177   unsigned ImmOffset;
4178   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4179   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4180
4181   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4182
4183   if (HasReturn)
4184     MIB.addDef(Dst);
4185
4186   MIB.addUse(VData); // vdata
4187
4188   if (IsCmpSwap)
4189     MIB.addReg(CmpVal);
4190
4191   MIB.addUse(RSrc)               // rsrc
4192      .addUse(VIndex)             // vindex
4193      .addUse(VOffset)            // voffset
4194      .addUse(SOffset)            // soffset
4195      .addImm(ImmOffset)          // offset(imm)
4196      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4197      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4198      .addMemOperand(MMO);
4199
4200   MI.eraseFromParent();
4201   return true;
4202 }
4203
4204 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4205 /// vector with s16 typed elements.
4206 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4207                                       SmallVectorImpl<Register> &PackedAddrs,
4208                                       unsigned ArgOffset,
4209                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4210                                       bool IsA16, bool IsG16) {
4211   const LLT S16 = LLT::scalar(16);
4212   const LLT V2S16 = LLT::fixed_vector(2, 16);
4213   auto EndIdx = Intr->VAddrEnd;
4214
4215   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4216     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4217     if (!SrcOp.isReg())
4218       continue; // _L to _LZ may have eliminated this.
4219
4220     Register AddrReg = SrcOp.getReg();
4221
4222     if ((I < Intr->GradientStart) ||
4223         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4224         (I >= Intr->CoordStart && !IsA16)) {
4225       // Handle any gradient or coordinate operands that should not be packed
4226       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4227       PackedAddrs.push_back(AddrReg);
4228     } else {
4229       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4230       // derivatives dx/dh and dx/dv are packed with undef.
4231       if (((I + 1) >= EndIdx) ||
4232           ((Intr->NumGradients / 2) % 2 == 1 &&
4233            (I == static_cast<unsigned>(Intr->GradientStart +
4234                                        (Intr->NumGradients / 2) - 1) ||
4235             I == static_cast<unsigned>(Intr->GradientStart +
4236                                        Intr->NumGradients - 1))) ||
4237           // Check for _L to _LZ optimization
4238           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4239         PackedAddrs.push_back(
4240             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4241                 .getReg(0));
4242       } else {
4243         PackedAddrs.push_back(
4244             B.buildBuildVector(
4245                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4246                 .getReg(0));
4247         ++I;
4248       }
4249     }
4250   }
4251 }
4252
4253 /// Convert from separate vaddr components to a single vector address register,
4254 /// and replace the remaining operands with $noreg.
4255 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4256                                      int DimIdx, int NumVAddrs) {
4257   const LLT S32 = LLT::scalar(32);
4258
4259   SmallVector<Register, 8> AddrRegs;
4260   for (int I = 0; I != NumVAddrs; ++I) {
4261     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4262     if (SrcOp.isReg()) {
4263       AddrRegs.push_back(SrcOp.getReg());
4264       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4265     }
4266   }
4267
4268   int NumAddrRegs = AddrRegs.size();
4269   if (NumAddrRegs != 1) {
4270     // Above 8 elements round up to next power of 2 (i.e. 16).
4271     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4272       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4273       auto Undef = B.buildUndef(S32);
4274       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4275       NumAddrRegs = RoundedNumRegs;
4276     }
4277
4278     auto VAddr =
4279         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4280     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4281   }
4282
4283   for (int I = 1; I != NumVAddrs; ++I) {
4284     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4285     if (SrcOp.isReg())
4286       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4287   }
4288 }
4289
4290 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4291 ///
4292 /// Depending on the subtarget, load/store with 16-bit element data need to be
4293 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4294 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4295 /// registers.
4296 ///
4297 /// We don't want to directly select image instructions just yet, but also want
4298 /// to exposes all register repacking to the legalizer/combiners. We also don't
4299 /// want a selected instrution entering RegBankSelect. In order to avoid
4300 /// defining a multitude of intermediate image instructions, directly hack on
4301 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding
4302 /// now unnecessary arguments with $noreg.
4303 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4304     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4305     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4306
4307   const unsigned NumDefs = MI.getNumExplicitDefs();
4308   const unsigned ArgOffset = NumDefs + 1;
4309   bool IsTFE = NumDefs == 2;
4310   // We are only processing the operands of d16 image operations on subtargets
4311   // that use the unpacked register layout, or need to repack the TFE result.
4312
4313   // TODO: Do we need to guard against already legalized intrinsics?
4314   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4315       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4316
4317   MachineRegisterInfo *MRI = B.getMRI();
4318   const LLT S32 = LLT::scalar(32);
4319   const LLT S16 = LLT::scalar(16);
4320   const LLT V2S16 = LLT::fixed_vector(2, 16);
4321
4322   unsigned DMask = 0;
4323
4324   // Check for 16 bit addresses and pack if true.
4325   LLT GradTy =
4326       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4327   LLT AddrTy =
4328       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4329   const bool IsG16 = GradTy == S16;
4330   const bool IsA16 = AddrTy == S16;
4331
4332   int DMaskLanes = 0;
4333   if (!BaseOpcode->Atomic) {
4334     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4335     if (BaseOpcode->Gather4) {
4336       DMaskLanes = 4;
4337     } else if (DMask != 0) {
4338       DMaskLanes = countPopulation(DMask);
4339     } else if (!IsTFE && !BaseOpcode->Store) {
4340       // If dmask is 0, this is a no-op load. This can be eliminated.
4341       B.buildUndef(MI.getOperand(0));
4342       MI.eraseFromParent();
4343       return true;
4344     }
4345   }
4346
4347   Observer.changingInstr(MI);
4348   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4349
4350   unsigned NewOpcode = NumDefs == 0 ?
4351     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4352
4353   // Track that we legalized this
4354   MI.setDesc(B.getTII().get(NewOpcode));
4355
4356   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
4357   // dmask to be at least 1 otherwise the instruction will fail
4358   if (IsTFE && DMask == 0) {
4359     DMask = 0x1;
4360     DMaskLanes = 1;
4361     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4362   }
4363
4364   if (BaseOpcode->Atomic) {
4365     Register VData0 = MI.getOperand(2).getReg();
4366     LLT Ty = MRI->getType(VData0);
4367
4368     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4369     if (Ty.isVector())
4370       return false;
4371
4372     if (BaseOpcode->AtomicX2) {
4373       Register VData1 = MI.getOperand(3).getReg();
4374       // The two values are packed in one register.
4375       LLT PackedTy = LLT::fixed_vector(2, Ty);
4376       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4377       MI.getOperand(2).setReg(Concat.getReg(0));
4378       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4379     }
4380   }
4381
4382   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4383
4384   // Optimize _L to _LZ when _L is zero
4385   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4386           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4387     const ConstantFP *ConstantLod;
4388
4389     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4390                  m_GFCst(ConstantLod))) {
4391       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4392         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4393         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4394             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4395                                                       Intr->Dim);
4396
4397         // The starting indexes should remain in the same place.
4398         --CorrectedNumVAddrs;
4399
4400         MI.getOperand(MI.getNumExplicitDefs())
4401             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4402         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4403         Intr = NewImageDimIntr;
4404       }
4405     }
4406   }
4407
4408   // Optimize _mip away, when 'lod' is zero
4409   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4410     int64_t ConstantLod;
4411     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4412                  m_ICst(ConstantLod))) {
4413       if (ConstantLod == 0) {
4414         // TODO: Change intrinsic opcode and remove operand instead or replacing
4415         // it with 0, as the _L to _LZ handling is done above.
4416         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4417         --CorrectedNumVAddrs;
4418       }
4419     }
4420   }
4421
4422   // Rewrite the addressing register layout before doing anything else.
4423   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4424     // 16 bit gradients are supported, but are tied to the A16 control
4425     // so both gradients and addresses must be 16 bit
4426     return false;
4427   }
4428
4429   if (IsA16 && !ST.hasA16()) {
4430     // A16 not supported
4431     return false;
4432   }
4433
4434   if (IsA16 || IsG16) {
4435     if (Intr->NumVAddrs > 1) {
4436       SmallVector<Register, 4> PackedRegs;
4437
4438       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4439                                 IsG16);
4440
4441       // See also below in the non-a16 branch
4442       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4443                           PackedRegs.size() <= ST.getNSAMaxSize();
4444
4445       if (!UseNSA && PackedRegs.size() > 1) {
4446         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4447         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4448         PackedRegs[0] = Concat.getReg(0);
4449         PackedRegs.resize(1);
4450       }
4451
4452       const unsigned NumPacked = PackedRegs.size();
4453       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4454         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4455         if (!SrcOp.isReg()) {
4456           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4457           continue;
4458         }
4459
4460         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4461
4462         if (I - Intr->VAddrStart < NumPacked)
4463           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4464         else
4465           SrcOp.setReg(AMDGPU::NoRegister);
4466       }
4467     }
4468   } else {
4469     // If the register allocator cannot place the address registers contiguously
4470     // without introducing moves, then using the non-sequential address encoding
4471     // is always preferable, since it saves VALU instructions and is usually a
4472     // wash in terms of code size or even better.
4473     //
4474     // However, we currently have no way of hinting to the register allocator
4475     // that MIMG addresses should be placed contiguously when it is possible to
4476     // do so, so force non-NSA for the common 2-address case as a heuristic.
4477     //
4478     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4479     // allocation when possible.
4480     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4481                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4482
4483     if (!UseNSA && Intr->NumVAddrs > 1)
4484       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4485                                Intr->NumVAddrs);
4486   }
4487
4488   int Flags = 0;
4489   if (IsA16)
4490     Flags |= 1;
4491   if (IsG16)
4492     Flags |= 2;
4493   MI.addOperand(MachineOperand::CreateImm(Flags));
4494
4495   if (BaseOpcode->Store) { // No TFE for stores?
4496     // TODO: Handle dmask trim
4497     Register VData = MI.getOperand(1).getReg();
4498     LLT Ty = MRI->getType(VData);
4499     if (!Ty.isVector() || Ty.getElementType() != S16)
4500       return true;
4501
4502     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4503     if (RepackedReg != VData) {
4504       MI.getOperand(1).setReg(RepackedReg);
4505     }
4506
4507     return true;
4508   }
4509
4510   Register DstReg = MI.getOperand(0).getReg();
4511   LLT Ty = MRI->getType(DstReg);
4512   const LLT EltTy = Ty.getScalarType();
4513   const bool IsD16 = Ty.getScalarType() == S16;
4514   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4515
4516   // Confirm that the return type is large enough for the dmask specified
4517   if (NumElts < DMaskLanes)
4518     return false;
4519
4520   if (NumElts > 4 || DMaskLanes > 4)
4521     return false;
4522
4523   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4524   const LLT AdjustedTy =
4525       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4526
4527   // The raw dword aligned data component of the load. The only legal cases
4528   // where this matters should be when using the packed D16 format, for
4529   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
4530   LLT RoundedTy;
4531
4532   // S32 vector to to cover all data, plus TFE result element.
4533   LLT TFETy;
4534
4535   // Register type to use for each loaded component. Will be S32 or V2S16.
4536   LLT RegTy;
4537
4538   if (IsD16 && ST.hasUnpackedD16VMem()) {
4539     RoundedTy =
4540         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4541     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4542     RegTy = S32;
4543   } else {
4544     unsigned EltSize = EltTy.getSizeInBits();
4545     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4546     unsigned RoundedSize = 32 * RoundedElts;
4547     RoundedTy = LLT::scalarOrVector(
4548         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4549     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4550     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4551   }
4552
4553   // The return type does not need adjustment.
4554   // TODO: Should we change s16 case to s32 or <2 x s16>?
4555   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4556     return true;
4557
4558   Register Dst1Reg;
4559
4560   // Insert after the instruction.
4561   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4562
4563   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4564   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4565   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4566   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4567
4568   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4569
4570   MI.getOperand(0).setReg(NewResultReg);
4571
4572   // In the IR, TFE is supposed to be used with a 2 element struct return
4573   // type. The intruction really returns these two values in one contiguous
4574   // register, with one additional dword beyond the loaded data. Rewrite the
4575   // return type to use a single register result.
4576
4577   if (IsTFE) {
4578     Dst1Reg = MI.getOperand(1).getReg();
4579     if (MRI->getType(Dst1Reg) != S32)
4580       return false;
4581
4582     // TODO: Make sure the TFE operand bit is set.
4583     MI.RemoveOperand(1);
4584
4585     // Handle the easy case that requires no repack instructions.
4586     if (Ty == S32) {
4587       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4588       return true;
4589     }
4590   }
4591
4592   // Now figure out how to copy the new result register back into the old
4593   // result.
4594   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4595
4596   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4597
4598   if (ResultNumRegs == 1) {
4599     assert(!IsTFE);
4600     ResultRegs[0] = NewResultReg;
4601   } else {
4602     // We have to repack into a new vector of some kind.
4603     for (int I = 0; I != NumDataRegs; ++I)
4604       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4605     B.buildUnmerge(ResultRegs, NewResultReg);
4606
4607     // Drop the final TFE element to get the data part. The TFE result is
4608     // directly written to the right place already.
4609     if (IsTFE)
4610       ResultRegs.resize(NumDataRegs);
4611   }
4612
4613   // For an s16 scalar result, we form an s32 result with a truncate regardless
4614   // of packed vs. unpacked.
4615   if (IsD16 && !Ty.isVector()) {
4616     B.buildTrunc(DstReg, ResultRegs[0]);
4617     return true;
4618   }
4619
4620   // Avoid a build/concat_vector of 1 entry.
4621   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4622     B.buildBitcast(DstReg, ResultRegs[0]);
4623     return true;
4624   }
4625
4626   assert(Ty.isVector());
4627
4628   if (IsD16) {
4629     // For packed D16 results with TFE enabled, all the data components are
4630     // S32. Cast back to the expected type.
4631     //
4632     // TODO: We don't really need to use load s32 elements. We would only need one
4633     // cast for the TFE result if a multiple of v2s16 was used.
4634     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4635       for (Register &Reg : ResultRegs)
4636         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4637     } else if (ST.hasUnpackedD16VMem()) {
4638       for (Register &Reg : ResultRegs)
4639         Reg = B.buildTrunc(S16, Reg).getReg(0);
4640     }
4641   }
4642
4643   auto padWithUndef = [&](LLT Ty, int NumElts) {
4644     if (NumElts == 0)
4645       return;
4646     Register Undef = B.buildUndef(Ty).getReg(0);
4647     for (int I = 0; I != NumElts; ++I)
4648       ResultRegs.push_back(Undef);
4649   };
4650
4651   // Pad out any elements eliminated due to the dmask.
4652   LLT ResTy = MRI->getType(ResultRegs[0]);
4653   if (!ResTy.isVector()) {
4654     padWithUndef(ResTy, NumElts - ResultRegs.size());
4655     B.buildBuildVector(DstReg, ResultRegs);
4656     return true;
4657   }
4658
4659   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4660   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4661
4662   // Deal with the one annoying legal case.
4663   const LLT V3S16 = LLT::fixed_vector(3, 16);
4664   if (Ty == V3S16) {
4665     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4666     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4667     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4668     return true;
4669   }
4670
4671   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4672   B.buildConcatVectors(DstReg, ResultRegs);
4673   return true;
4674 }
4675
4676 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4677   LegalizerHelper &Helper, MachineInstr &MI) const {
4678   MachineIRBuilder &B = Helper.MIRBuilder;
4679   GISelChangeObserver &Observer = Helper.Observer;
4680
4681   Register Dst = MI.getOperand(0).getReg();
4682   LLT Ty = B.getMRI()->getType(Dst);
4683   unsigned Size = Ty.getSizeInBits();
4684   MachineFunction &MF = B.getMF();
4685
4686   Observer.changingInstr(MI);
4687
4688   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4689     Ty = getBitcastRegisterType(Ty);
4690     Helper.bitcastDst(MI, Ty, 0);
4691     Dst = MI.getOperand(0).getReg();
4692     B.setInsertPt(B.getMBB(), MI);
4693   }
4694
4695   // FIXME: We don't really need this intermediate instruction. The intrinsic
4696   // should be fixed to have a memory operand. Since it's readnone, we're not
4697   // allowed to add one.
4698   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4699   MI.RemoveOperand(1); // Remove intrinsic ID
4700
4701   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4702   // TODO: Should this use datalayout alignment?
4703   const unsigned MemSize = (Size + 7) / 8;
4704   const Align MemAlign(4);
4705   MachineMemOperand *MMO = MF.getMachineMemOperand(
4706       MachinePointerInfo(),
4707       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4708           MachineMemOperand::MOInvariant,
4709       MemSize, MemAlign);
4710   MI.addMemOperand(MF, MMO);
4711
4712   // There are no 96-bit result scalar loads, but widening to 128-bit should
4713   // always be legal. We may need to restore this to a 96-bit result if it turns
4714   // out this needs to be converted to a vector load during RegBankSelect.
4715   if (!isPowerOf2_32(Size)) {
4716     if (Ty.isVector())
4717       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4718     else
4719       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4720   }
4721
4722   Observer.changedInstr(MI);
4723   return true;
4724 }
4725
4726 // TODO: Move to selection
4727 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4728                                                 MachineRegisterInfo &MRI,
4729                                                 MachineIRBuilder &B) const {
4730   if (!ST.isTrapHandlerEnabled() ||
4731       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4732     return legalizeTrapEndpgm(MI, MRI, B);
4733
4734   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4735     switch (*HsaAbiVer) {
4736     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4737     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4738       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4739     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4740       return ST.supportsGetDoorbellID() ?
4741           legalizeTrapHsa(MI, MRI, B) :
4742           legalizeTrapHsaQueuePtr(MI, MRI, B);
4743     }
4744   }
4745
4746   llvm_unreachable("Unknown trap handler");
4747 }
4748
4749 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4750     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4751   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4752   MI.eraseFromParent();
4753   return true;
4754 }
4755
4756 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4757     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4758   // Pass queue pointer to trap handler as input, and insert trap instruction
4759   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4760   Register LiveIn =
4761     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4762   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4763     return false;
4764
4765   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4766   B.buildCopy(SGPR01, LiveIn);
4767   B.buildInstr(AMDGPU::S_TRAP)
4768       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4769       .addReg(SGPR01, RegState::Implicit);
4770
4771   MI.eraseFromParent();
4772   return true;
4773 }
4774
4775 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4776     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4777   B.buildInstr(AMDGPU::S_TRAP)
4778       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4779   MI.eraseFromParent();
4780   return true;
4781 }
4782
4783 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4784     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4785   // Is non-HSA path or trap-handler disabled? then, report a warning
4786   // accordingly
4787   if (!ST.isTrapHandlerEnabled() ||
4788       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4789     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4790                                      "debugtrap handler not supported",
4791                                      MI.getDebugLoc(), DS_Warning);
4792     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4793     Ctx.diagnose(NoTrap);
4794   } else {
4795     // Insert debug-trap instruction
4796     B.buildInstr(AMDGPU::S_TRAP)
4797         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4798   }
4799
4800   MI.eraseFromParent();
4801   return true;
4802 }
4803
4804 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4805                                                MachineIRBuilder &B) const {
4806   MachineRegisterInfo &MRI = *B.getMRI();
4807   const LLT S16 = LLT::scalar(16);
4808   const LLT S32 = LLT::scalar(32);
4809
4810   Register DstReg = MI.getOperand(0).getReg();
4811   Register NodePtr = MI.getOperand(2).getReg();
4812   Register RayExtent = MI.getOperand(3).getReg();
4813   Register RayOrigin = MI.getOperand(4).getReg();
4814   Register RayDir = MI.getOperand(5).getReg();
4815   Register RayInvDir = MI.getOperand(6).getReg();
4816   Register TDescr = MI.getOperand(7).getReg();
4817
4818   if (!ST.hasGFX10_AEncoding()) {
4819     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4820                                         "intrinsic not supported on subtarget",
4821                                         MI.getDebugLoc());
4822     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4823     return false;
4824   }
4825
4826   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4827   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4828   const unsigned NumVDataDwords = 4;
4829   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4830   const bool UseNSA =
4831       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4832   const unsigned BaseOpcodes[2][2] = {
4833       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4834       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4835        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4836   int Opcode;
4837   if (UseNSA) {
4838     Opcode =
4839         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4840                               NumVDataDwords, NumVAddrDwords);
4841   } else {
4842     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4843                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4844                                    PowerOf2Ceil(NumVAddrDwords));
4845   }
4846   assert(Opcode != -1);
4847
4848   SmallVector<Register, 12> Ops;
4849   if (Is64) {
4850     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4851     Ops.push_back(Unmerge.getReg(0));
4852     Ops.push_back(Unmerge.getReg(1));
4853   } else {
4854     Ops.push_back(NodePtr);
4855   }
4856   Ops.push_back(RayExtent);
4857
4858   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4859     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4860     Ops.push_back(Unmerge.getReg(0));
4861     Ops.push_back(Unmerge.getReg(1));
4862     Ops.push_back(Unmerge.getReg(2));
4863   };
4864
4865   packLanes(RayOrigin);
4866   if (IsA16) {
4867     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4868     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4869     Register R1 = MRI.createGenericVirtualRegister(S32);
4870     Register R2 = MRI.createGenericVirtualRegister(S32);
4871     Register R3 = MRI.createGenericVirtualRegister(S32);
4872     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4873     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4874     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4875     Ops.push_back(R1);
4876     Ops.push_back(R2);
4877     Ops.push_back(R3);
4878   } else {
4879     packLanes(RayDir);
4880     packLanes(RayInvDir);
4881   }
4882
4883   if (!UseNSA) {
4884     // Build a single vector containing all the operands so far prepared.
4885     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4886     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4887     Ops.clear();
4888     Ops.push_back(MergedOps);
4889   }
4890
4891   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4892     .addDef(DstReg)
4893     .addImm(Opcode);
4894
4895   for (Register R : Ops) {
4896     MIB.addUse(R);
4897   }
4898
4899   MIB.addUse(TDescr)
4900      .addImm(IsA16 ? 1 : 0)
4901      .cloneMemRefs(MI);
4902
4903   MI.eraseFromParent();
4904   return true;
4905 }
4906
4907 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4908                                             MachineInstr &MI) const {
4909   MachineIRBuilder &B = Helper.MIRBuilder;
4910   MachineRegisterInfo &MRI = *B.getMRI();
4911
4912   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4913   auto IntrID = MI.getIntrinsicID();
4914   switch (IntrID) {
4915   case Intrinsic::amdgcn_if:
4916   case Intrinsic::amdgcn_else: {
4917     MachineInstr *Br = nullptr;
4918     MachineBasicBlock *UncondBrTarget = nullptr;
4919     bool Negated = false;
4920     if (MachineInstr *BrCond =
4921             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4922       const SIRegisterInfo *TRI
4923         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4924
4925       Register Def = MI.getOperand(1).getReg();
4926       Register Use = MI.getOperand(3).getReg();
4927
4928       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4929
4930       if (Negated)
4931         std::swap(CondBrTarget, UncondBrTarget);
4932
4933       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4934       if (IntrID == Intrinsic::amdgcn_if) {
4935         B.buildInstr(AMDGPU::SI_IF)
4936           .addDef(Def)
4937           .addUse(Use)
4938           .addMBB(UncondBrTarget);
4939       } else {
4940         B.buildInstr(AMDGPU::SI_ELSE)
4941             .addDef(Def)
4942             .addUse(Use)
4943             .addMBB(UncondBrTarget);
4944       }
4945
4946       if (Br) {
4947         Br->getOperand(0).setMBB(CondBrTarget);
4948       } else {
4949         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4950         // since we're swapping branch targets it needs to be reinserted.
4951         // FIXME: IRTranslator should probably not do this
4952         B.buildBr(*CondBrTarget);
4953       }
4954
4955       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4956       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4957       MI.eraseFromParent();
4958       BrCond->eraseFromParent();
4959       return true;
4960     }
4961
4962     return false;
4963   }
4964   case Intrinsic::amdgcn_loop: {
4965     MachineInstr *Br = nullptr;
4966     MachineBasicBlock *UncondBrTarget = nullptr;
4967     bool Negated = false;
4968     if (MachineInstr *BrCond =
4969             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4970       const SIRegisterInfo *TRI
4971         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4972
4973       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4974       Register Reg = MI.getOperand(2).getReg();
4975
4976       if (Negated)
4977         std::swap(CondBrTarget, UncondBrTarget);
4978
4979       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4980       B.buildInstr(AMDGPU::SI_LOOP)
4981         .addUse(Reg)
4982         .addMBB(UncondBrTarget);
4983
4984       if (Br)
4985         Br->getOperand(0).setMBB(CondBrTarget);
4986       else
4987         B.buildBr(*CondBrTarget);
4988
4989       MI.eraseFromParent();
4990       BrCond->eraseFromParent();
4991       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4992       return true;
4993     }
4994
4995     return false;
4996   }
4997   case Intrinsic::amdgcn_kernarg_segment_ptr:
4998     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4999       // This only makes sense to call in a kernel, so just lower to null.
5000       B.buildConstant(MI.getOperand(0).getReg(), 0);
5001       MI.eraseFromParent();
5002       return true;
5003     }
5004
5005     return legalizePreloadedArgIntrin(
5006       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5007   case Intrinsic::amdgcn_implicitarg_ptr:
5008     return legalizeImplicitArgPtr(MI, MRI, B);
5009   case Intrinsic::amdgcn_workitem_id_x:
5010     return legalizePreloadedArgIntrin(MI, MRI, B,
5011                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5012   case Intrinsic::amdgcn_workitem_id_y:
5013     return legalizePreloadedArgIntrin(MI, MRI, B,
5014                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5015   case Intrinsic::amdgcn_workitem_id_z:
5016     return legalizePreloadedArgIntrin(MI, MRI, B,
5017                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5018   case Intrinsic::amdgcn_workgroup_id_x:
5019     return legalizePreloadedArgIntrin(MI, MRI, B,
5020                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5021   case Intrinsic::amdgcn_workgroup_id_y:
5022     return legalizePreloadedArgIntrin(MI, MRI, B,
5023                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5024   case Intrinsic::amdgcn_workgroup_id_z:
5025     return legalizePreloadedArgIntrin(MI, MRI, B,
5026                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5027   case Intrinsic::amdgcn_dispatch_ptr:
5028     return legalizePreloadedArgIntrin(MI, MRI, B,
5029                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5030   case Intrinsic::amdgcn_queue_ptr:
5031     return legalizePreloadedArgIntrin(MI, MRI, B,
5032                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5033   case Intrinsic::amdgcn_implicit_buffer_ptr:
5034     return legalizePreloadedArgIntrin(
5035       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5036   case Intrinsic::amdgcn_dispatch_id:
5037     return legalizePreloadedArgIntrin(MI, MRI, B,
5038                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5039   case Intrinsic::amdgcn_fdiv_fast:
5040     return legalizeFDIVFastIntrin(MI, MRI, B);
5041   case Intrinsic::amdgcn_is_shared:
5042     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5043   case Intrinsic::amdgcn_is_private:
5044     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5045   case Intrinsic::amdgcn_wavefrontsize: {
5046     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5047     MI.eraseFromParent();
5048     return true;
5049   }
5050   case Intrinsic::amdgcn_s_buffer_load:
5051     return legalizeSBufferLoad(Helper, MI);
5052   case Intrinsic::amdgcn_raw_buffer_store:
5053   case Intrinsic::amdgcn_struct_buffer_store:
5054     return legalizeBufferStore(MI, MRI, B, false, false);
5055   case Intrinsic::amdgcn_raw_buffer_store_format:
5056   case Intrinsic::amdgcn_struct_buffer_store_format:
5057     return legalizeBufferStore(MI, MRI, B, false, true);
5058   case Intrinsic::amdgcn_raw_tbuffer_store:
5059   case Intrinsic::amdgcn_struct_tbuffer_store:
5060     return legalizeBufferStore(MI, MRI, B, true, true);
5061   case Intrinsic::amdgcn_raw_buffer_load:
5062   case Intrinsic::amdgcn_struct_buffer_load:
5063     return legalizeBufferLoad(MI, MRI, B, false, false);
5064   case Intrinsic::amdgcn_raw_buffer_load_format:
5065   case Intrinsic::amdgcn_struct_buffer_load_format:
5066     return legalizeBufferLoad(MI, MRI, B, true, false);
5067   case Intrinsic::amdgcn_raw_tbuffer_load:
5068   case Intrinsic::amdgcn_struct_tbuffer_load:
5069     return legalizeBufferLoad(MI, MRI, B, true, true);
5070   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5071   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5072   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5073   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5074   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5075   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5076   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5077   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5078   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5079   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5080   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5081   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5082   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5083   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5084   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5085   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5086   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5087   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5088   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5089   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5090   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5091   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5092   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5093   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5094   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5095   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5096   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5097   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5098   case Intrinsic::amdgcn_buffer_atomic_fadd:
5099   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5100   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5101   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5102   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5103     return legalizeBufferAtomic(MI, B, IntrID);
5104   case Intrinsic::amdgcn_atomic_inc:
5105     return legalizeAtomicIncDec(MI, B, true);
5106   case Intrinsic::amdgcn_atomic_dec:
5107     return legalizeAtomicIncDec(MI, B, false);
5108   case Intrinsic::trap:
5109     return legalizeTrapIntrinsic(MI, MRI, B);
5110   case Intrinsic::debugtrap:
5111     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5112   case Intrinsic::amdgcn_rsq_clamp:
5113     return legalizeRsqClampIntrinsic(MI, MRI, B);
5114   case Intrinsic::amdgcn_ds_fadd:
5115   case Intrinsic::amdgcn_ds_fmin:
5116   case Intrinsic::amdgcn_ds_fmax:
5117     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5118   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5119     return legalizeBVHIntrinsic(MI, B);
5120   default: {
5121     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5122             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5123       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5124     return true;
5125   }
5126   }
5127
5128   return true;
5129 }