llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

   1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 /// \file
   9 /// This file implements the targeting of the Machinelegalizer class for
  10 /// AMDGPU.
  11 /// \todo This should be generated by TableGen.
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "AMDGPULegalizerInfo.h"
  15
  16 #include "AMDGPU.h"
  17 #include "AMDGPUGlobalISelUtils.h"
  18 #include "AMDGPUInstrInfo.h"
  19 #include "AMDGPUMemoryUtils.h"
  20 #include "AMDGPUTargetMachine.h"
  21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  22 #include "SIInstrInfo.h"
  23 #include "SIMachineFunctionInfo.h"
  24 #include "SIRegisterInfo.h"
  25 #include "Utils/AMDGPUBaseInfo.h"
  26 #include "llvm/ADT/ScopeExit.h"
  27 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
  28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
  29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
  30 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
  31 #include "llvm/CodeGen/GlobalISel/Utils.h"
  32 #include "llvm/CodeGen/TargetOpcodes.h"
  33 #include "llvm/IR/DiagnosticInfo.h"
  34 #include "llvm/IR/IntrinsicsAMDGPU.h"
  35 #include "llvm/IR/IntrinsicsR600.h"
  36
  37 #define DEBUG_TYPE "amdgpu-legalinfo"
  38
  39 using namespace llvm;
  40 using namespace LegalizeActions;
  41 using namespace LegalizeMutations;
  42 using namespace LegalityPredicates;
  43 using namespace MIPatternMatch;
  44
  45 // Hack until load/store selection patterns support any tuple of legal types.
  46 static cl::opt<bool> EnableNewLegality(
  47   "amdgpu-global-isel-new-legality",
  48   cl::desc("Use GlobalISel desired legality, rather than try to use"
  49            "rules compatible with selection patterns"),
  50   cl::init(false),
  51   cl::ReallyHidden);
  52
  53 static constexpr unsigned MaxRegisterSize = 1024;
  54
  55 // Round the number of elements to the next power of two elements
  56 static LLT getPow2VectorType(LLT Ty) {
  57   unsigned NElts = Ty.getNumElements();
  58   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
  59   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
  60 }
  61
  62 // Round the number of bits to the next power of two bits
  63 static LLT getPow2ScalarType(LLT Ty) {
  64   unsigned Bits = Ty.getSizeInBits();
  65   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
  66   return LLT::scalar(Pow2Bits);
  67 }
  68
  69 /// \returns true if this is an odd sized vector which should widen by adding an
  70 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
  71 /// excludes s1 vectors, which should always be scalarized.
  72 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  73   return [=](const LegalityQuery &Query) {
  74     const LLT Ty = Query.Types[TypeIdx];
  75     if (!Ty.isVector())
  76       return false;
  77
  78     const LLT EltTy = Ty.getElementType();
  79     const unsigned EltSize = EltTy.getSizeInBits();
  80     return Ty.getNumElements() % 2 != 0 &&
  81            EltSize > 1 && EltSize < 32 &&
  82            Ty.getSizeInBits() % 32 != 0;
  83   };
  84 }
  85
  86 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  87   return [=](const LegalityQuery &Query) {
  88     const LLT Ty = Query.Types[TypeIdx];
  89     return Ty.getSizeInBits() % 32 == 0;
  90   };
  91 }
  92
  93 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  94   return [=](const LegalityQuery &Query) {
  95     const LLT Ty = Query.Types[TypeIdx];
  96     const LLT EltTy = Ty.getScalarType();
  97     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  98   };
  99 }
 100
 101 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
 102   return [=](const LegalityQuery &Query) {
 103     const LLT Ty = Query.Types[TypeIdx];
 104     const LLT EltTy = Ty.getElementType();
 105     return std::pair(TypeIdx,
 106                      LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
 107   };
 108 }
 109
 110 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
 111   return [=](const LegalityQuery &Query) {
 112     const LLT Ty = Query.Types[TypeIdx];
 113     const LLT EltTy = Ty.getElementType();
 114     unsigned Size = Ty.getSizeInBits();
 115     unsigned Pieces = (Size + 63) / 64;
 116     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
 117     return std::pair(TypeIdx, LLT::scalarOrVector(
 118                                   ElementCount::getFixed(NewNumElts), EltTy));
 119   };
 120 }
 121
 122 // Increase the number of vector elements to reach the next multiple of 32-bit
 123 // type.
 124 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
 125   return [=](const LegalityQuery &Query) {
 126     const LLT Ty = Query.Types[TypeIdx];
 127
 128     const LLT EltTy = Ty.getElementType();
 129     const int Size = Ty.getSizeInBits();
 130     const int EltSize = EltTy.getSizeInBits();
 131     const int NextMul32 = (Size + 31) / 32;
 132
 133     assert(EltSize < 32);
 134
 135     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
 136     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
 137   };
 138 }
 139
 140 // Increase the number of vector elements to reach the next legal RegClass.
 141 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
 142   return [=](const LegalityQuery &Query) {
 143     const LLT Ty = Query.Types[TypeIdx];
 144     const unsigned NumElts = Ty.getNumElements();
 145     const unsigned EltSize = Ty.getElementType().getSizeInBits();
 146     const unsigned MaxNumElts = MaxRegisterSize / EltSize;
 147
 148     assert(EltSize == 32 || EltSize == 64);
 149     assert(Ty.getSizeInBits() < MaxRegisterSize);
 150
 151     unsigned NewNumElts;
 152     // Find the nearest legal RegClass that is larger than the current type.
 153     for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
 154       if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
 155         break;
 156     }
 157
 158     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
 159   };
 160 }
 161
 162 static LLT getBufferRsrcScalarType(const LLT Ty) {
 163   if (!Ty.isVector())
 164     return LLT::scalar(128);
 165   const ElementCount NumElems = Ty.getElementCount();
 166   return LLT::vector(NumElems, LLT::scalar(128));
 167 }
 168
 169 static LLT getBufferRsrcRegisterType(const LLT Ty) {
 170   if (!Ty.isVector())
 171     return LLT::fixed_vector(4, LLT::scalar(32));
 172   const unsigned NumElems = Ty.getElementCount().getFixedValue();
 173   return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
 174 }
 175
 176 static LLT getBitcastRegisterType(const LLT Ty) {
 177   const unsigned Size = Ty.getSizeInBits();
 178
 179   if (Size <= 32) {
 180     // <2 x s8> -> s16
 181     // <4 x s8> -> s32
 182     return LLT::scalar(Size);
 183   }
 184
 185   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
 186 }
 187
 188 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
 189   return [=](const LegalityQuery &Query) {
 190     const LLT Ty = Query.Types[TypeIdx];
 191     return std::pair(TypeIdx, getBitcastRegisterType(Ty));
 192   };
 193 }
 194
 195 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
 196   return [=](const LegalityQuery &Query) {
 197     const LLT Ty = Query.Types[TypeIdx];
 198     unsigned Size = Ty.getSizeInBits();
 199     assert(Size % 32 == 0);
 200     return std::pair(
 201         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
 202   };
 203 }
 204
 205 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
 206   return [=](const LegalityQuery &Query) {
 207     const LLT QueryTy = Query.Types[TypeIdx];
 208     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
 209   };
 210 }
 211
 212 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
 213   return [=](const LegalityQuery &Query) {
 214     const LLT QueryTy = Query.Types[TypeIdx];
 215     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
 216   };
 217 }
 218
 219 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
 220   return [=](const LegalityQuery &Query) {
 221     const LLT QueryTy = Query.Types[TypeIdx];
 222     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
 223   };
 224 }
 225
 226 static bool isRegisterSize(unsigned Size) {
 227   return Size % 32 == 0 && Size <= MaxRegisterSize;
 228 }
 229
 230 static bool isRegisterVectorElementType(LLT EltTy) {
 231   const int EltSize = EltTy.getSizeInBits();
 232   return EltSize == 16 || EltSize % 32 == 0;
 233 }
 234
 235 static bool isRegisterVectorType(LLT Ty) {
 236   const int EltSize = Ty.getElementType().getSizeInBits();
 237   return EltSize == 32 || EltSize == 64 ||
 238          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
 239          EltSize == 128 || EltSize == 256;
 240 }
 241
 242 // TODO: replace all uses of isRegisterType with isRegisterClassType
 243 static bool isRegisterType(LLT Ty) {
 244   if (!isRegisterSize(Ty.getSizeInBits()))
 245     return false;
 246
 247   if (Ty.isVector())
 248     return isRegisterVectorType(Ty);
 249
 250   return true;
 251 }
 252
 253 // Any combination of 32 or 64-bit elements up the maximum register size, and
 254 // multiples of v2s16.
 255 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
 256   return [=](const LegalityQuery &Query) {
 257     return isRegisterType(Query.Types[TypeIdx]);
 258   };
 259 }
 260
 261 // RegisterType that doesn't have a corresponding RegClass.
 262 // TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
 263 // should be removed.
 264 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
 265   return [=](const LegalityQuery &Query) {
 266     LLT Ty = Query.Types[TypeIdx];
 267     return isRegisterType(Ty) &&
 268            !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
 269   };
 270 }
 271
 272 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
 273   return [=](const LegalityQuery &Query) {
 274     const LLT QueryTy = Query.Types[TypeIdx];
 275     if (!QueryTy.isVector())
 276       return false;
 277     const LLT EltTy = QueryTy.getElementType();
 278     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
 279   };
 280 }
 281
 282 static const LLT S1 = LLT::scalar(1);
 283 static const LLT S8 = LLT::scalar(8);
 284 static const LLT S16 = LLT::scalar(16);
 285 static const LLT S32 = LLT::scalar(32);
 286 static const LLT F32 = LLT::float32();
 287 static const LLT S64 = LLT::scalar(64);
 288 static const LLT F64 = LLT::float64();
 289 static const LLT S96 = LLT::scalar(96);
 290 static const LLT S128 = LLT::scalar(128);
 291 static const LLT S160 = LLT::scalar(160);
 292 static const LLT S192 = LLT::scalar(192);
 293 static const LLT S224 = LLT::scalar(224);
 294 static const LLT S256 = LLT::scalar(256);
 295 static const LLT S512 = LLT::scalar(512);
 296 static const LLT S1024 = LLT::scalar(1024);
 297 static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
 298
 299 static const LLT V2S8 = LLT::fixed_vector(2, 8);
 300 static const LLT V2S16 = LLT::fixed_vector(2, 16);
 301 static const LLT V4S16 = LLT::fixed_vector(4, 16);
 302 static const LLT V6S16 = LLT::fixed_vector(6, 16);
 303 static const LLT V8S16 = LLT::fixed_vector(8, 16);
 304 static const LLT V10S16 = LLT::fixed_vector(10, 16);
 305 static const LLT V12S16 = LLT::fixed_vector(12, 16);
 306 static const LLT V16S16 = LLT::fixed_vector(16, 16);
 307
 308 static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
 309 static const LLT V2BF16 = V2F16; // FIXME
 310
 311 static const LLT V2S32 = LLT::fixed_vector(2, 32);
 312 static const LLT V3S32 = LLT::fixed_vector(3, 32);
 313 static const LLT V4S32 = LLT::fixed_vector(4, 32);
 314 static const LLT V5S32 = LLT::fixed_vector(5, 32);
 315 static const LLT V6S32 = LLT::fixed_vector(6, 32);
 316 static const LLT V7S32 = LLT::fixed_vector(7, 32);
 317 static const LLT V8S32 = LLT::fixed_vector(8, 32);
 318 static const LLT V9S32 = LLT::fixed_vector(9, 32);
 319 static const LLT V10S32 = LLT::fixed_vector(10, 32);
 320 static const LLT V11S32 = LLT::fixed_vector(11, 32);
 321 static const LLT V12S32 = LLT::fixed_vector(12, 32);
 322 static const LLT V16S32 = LLT::fixed_vector(16, 32);
 323 static const LLT V32S32 = LLT::fixed_vector(32, 32);
 324
 325 static const LLT V2S64 = LLT::fixed_vector(2, 64);
 326 static const LLT V3S64 = LLT::fixed_vector(3, 64);
 327 static const LLT V4S64 = LLT::fixed_vector(4, 64);
 328 static const LLT V5S64 = LLT::fixed_vector(5, 64);
 329 static const LLT V6S64 = LLT::fixed_vector(6, 64);
 330 static const LLT V7S64 = LLT::fixed_vector(7, 64);
 331 static const LLT V8S64 = LLT::fixed_vector(8, 64);
 332 static const LLT V16S64 = LLT::fixed_vector(16, 64);
 333
 334 static const LLT V2S128 = LLT::fixed_vector(2, 128);
 335 static const LLT V4S128 = LLT::fixed_vector(4, 128);
 336
 337 static std::initializer_list<LLT> AllScalarTypes = {
 338     S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};
 339
 340 static std::initializer_list<LLT> AllS16Vectors{
 341     V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
 342
 343 static std::initializer_list<LLT> AllS32Vectors = {
 344     V2S32, V3S32,  V4S32,  V5S32,  V6S32,  V7S32, V8S32,
 345     V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
 346
 347 static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
 348                                                    V6S64, V7S64, V8S64, V16S64};
 349
 350 // Checks whether a type is in the list of legal register types.
 351 static bool isRegisterClassType(LLT Ty) {
 352   if (Ty.isPointerOrPointerVector())
 353     Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
 354
 355   return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
 356          is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
 357 }
 358
 359 static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
 360   return [TypeIdx](const LegalityQuery &Query) {
 361     return isRegisterClassType(Query.Types[TypeIdx]);
 362   };
 363 }
 364
 365 // If we have a truncating store or an extending load with a data size larger
 366 // than 32-bits, we need to reduce to a 32-bit type.
 367 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
 368   return [=](const LegalityQuery &Query) {
 369     const LLT Ty = Query.Types[TypeIdx];
 370     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
 371            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
 372   };
 373 }
 374
 375 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
 376 // handle some operations by just promoting the register during
 377 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
 378 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
 379                                     bool IsLoad, bool IsAtomic) {
 380   switch (AS) {
 381   case AMDGPUAS::PRIVATE_ADDRESS:
 382     // FIXME: Private element size.
 383     return ST.enableFlatScratch() ? 128 : 32;
 384   case AMDGPUAS::LOCAL_ADDRESS:
 385     return ST.useDS128() ? 128 : 64;
 386   case AMDGPUAS::GLOBAL_ADDRESS:
 387   case AMDGPUAS::CONSTANT_ADDRESS:
 388   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
 389   case AMDGPUAS::BUFFER_RESOURCE:
 390     // Treat constant and global as identical. SMRD loads are sometimes usable for
 391     // global loads (ideally constant address space should be eliminated)
 392     // depending on the context. Legality cannot be context dependent, but
 393     // RegBankSelect can split the load as necessary depending on the pointer
 394     // register bank/uniformity and if the memory is invariant or not written in a
 395     // kernel.
 396     return IsLoad ? 512 : 128;
 397   default:
 398     // FIXME: Flat addresses may contextually need to be split to 32-bit parts
 399     // if they may alias scratch depending on the subtarget.  This needs to be
 400     // moved to custom handling to use addressMayBeAccessedAsPrivate
 401     return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
 402   }
 403 }
 404
 405 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
 406                                  const LegalityQuery &Query) {
 407   const LLT Ty = Query.Types[0];
 408
 409   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
 410   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
 411
 412   unsigned RegSize = Ty.getSizeInBits();
 413   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
 414   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
 415   unsigned AS = Query.Types[1].getAddressSpace();
 416
 417   // All of these need to be custom lowered to cast the pointer operand.
 418   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
 419     return false;
 420
 421   // Do not handle extending vector loads.
 422   if (Ty.isVector() && MemSize != RegSize)
 423     return false;
 424
 425   // TODO: We should be able to widen loads if the alignment is high enough, but
 426   // we also need to modify the memory access size.
 427 #if 0
 428   // Accept widening loads based on alignment.
 429   if (IsLoad && MemSize < Size)
 430     MemSize = std::max(MemSize, Align);
 431 #endif
 432
 433   // Only 1-byte and 2-byte to 32-bit extloads are valid.
 434   if (MemSize != RegSize && RegSize != 32)
 435     return false;
 436
 437   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
 438                                     Query.MMODescrs[0].Ordering !=
 439                                         AtomicOrdering::NotAtomic))
 440     return false;
 441
 442   switch (MemSize) {
 443   case 8:
 444   case 16:
 445   case 32:
 446   case 64:
 447   case 128:
 448     break;
 449   case 96:
 450     if (!ST.hasDwordx3LoadStores())
 451       return false;
 452     break;
 453   case 256:
 454   case 512:
 455     // These may contextually need to be broken down.
 456     break;
 457   default:
 458     return false;
 459   }
 460
 461   assert(RegSize >= MemSize);
 462
 463   if (AlignBits < MemSize) {
 464     const SITargetLowering *TLI = ST.getTargetLowering();
 465     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
 466                                                  Align(AlignBits / 8)))
 467       return false;
 468   }
 469
 470   return true;
 471 }
 472
 473 // The newer buffer intrinsic forms take their resource arguments as
 474 // pointers in address space 8, aka s128 values. However, in order to not break
 475 // SelectionDAG, the underlying operations have to continue to take v4i32
 476 // arguments. Therefore, we convert resource pointers - or vectors of them
 477 // to integer values here.
 478 static bool hasBufferRsrcWorkaround(const LLT Ty) {
 479   if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
 480     return true;
 481   if (Ty.isVector()) {
 482     const LLT ElemTy = Ty.getElementType();
 483     return hasBufferRsrcWorkaround(ElemTy);
 484   }
 485   return false;
 486 }
 487
 488 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
 489 // workaround this. Eventually it should ignore the type for loads and only care
 490 // about the size. Return true in cases where we will workaround this for now by
 491 // bitcasting.
 492 static bool loadStoreBitcastWorkaround(const LLT Ty) {
 493   if (EnableNewLegality)
 494     return false;
 495
 496   const unsigned Size = Ty.getSizeInBits();
 497   if (Ty.isPointerVector())
 498     return true;
 499   if (Size <= 64)
 500     return false;
 501   // Address space 8 pointers get their own workaround.
 502   if (hasBufferRsrcWorkaround(Ty))
 503     return false;
 504   if (!Ty.isVector())
 505     return true;
 506
 507   unsigned EltSize = Ty.getScalarSizeInBits();
 508   return EltSize != 32 && EltSize != 64;
 509 }
 510
 511 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
 512   const LLT Ty = Query.Types[0];
 513   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
 514          !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
 515 }
 516
 517 /// Return true if a load or store of the type should be lowered with a bitcast
 518 /// to a different type.
 519 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
 520                                        const LLT MemTy) {
 521   const unsigned MemSizeInBits = MemTy.getSizeInBits();
 522   const unsigned Size = Ty.getSizeInBits();
 523   if (Size != MemSizeInBits)
 524     return Size <= 32 && Ty.isVector();
 525
 526   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
 527     return true;
 528
 529   // Don't try to handle bitcasting vector ext loads for now.
 530   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
 531          (Size <= 32 || isRegisterSize(Size)) &&
 532          !isRegisterVectorElementType(Ty.getElementType());
 533 }
 534
 535 /// Return true if we should legalize a load by widening an odd sized memory
 536 /// access up to the alignment. Note this case when the memory access itself
 537 /// changes, not the size of the result register.
 538 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
 539                             uint64_t AlignInBits, unsigned AddrSpace,
 540                             unsigned Opcode) {
 541   unsigned SizeInBits = MemoryTy.getSizeInBits();
 542   // We don't want to widen cases that are naturally legal.
 543   if (isPowerOf2_32(SizeInBits))
 544     return false;
 545
 546   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
 547   // end up widening these for a scalar load during RegBankSelect, if we don't
 548   // have 96-bit scalar loads.
 549   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
 550     return false;
 551
 552   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
 553     return false;
 554
 555   // A load is known dereferenceable up to the alignment, so it's legal to widen
 556   // to it.
 557   //
 558   // TODO: Could check dereferenceable for less aligned cases.
 559   unsigned RoundedSize = NextPowerOf2(SizeInBits);
 560   if (AlignInBits < RoundedSize)
 561     return false;
 562
 563   // Do not widen if it would introduce a slow unaligned load.
 564   const SITargetLowering *TLI = ST.getTargetLowering();
 565   unsigned Fast = 0;
 566   return TLI->allowsMisalignedMemoryAccessesImpl(
 567              RoundedSize, AddrSpace, Align(AlignInBits / 8),
 568              MachineMemOperand::MOLoad, &Fast) &&
 569          Fast;
 570 }
 571
 572 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
 573                             unsigned Opcode) {
 574   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
 575     return false;
 576
 577   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
 578                          Query.MMODescrs[0].AlignInBits,
 579                          Query.Types[1].getAddressSpace(), Opcode);
 580 }
 581
 582 /// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
 583 /// type of the operand `idx` and then to transform it to a `p8` via bitcasts
 584 /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
 585 static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
 586                                    MachineRegisterInfo &MRI, unsigned Idx) {
 587   MachineOperand &MO = MI.getOperand(Idx);
 588
 589   const LLT PointerTy = MRI.getType(MO.getReg());
 590
 591   // Paranoidly prevent us from doing this multiple times.
 592   if (!hasBufferRsrcWorkaround(PointerTy))
 593     return PointerTy;
 594
 595   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
 596   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
 597   if (!PointerTy.isVector()) {
 598     // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
 599     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
 600     const LLT S32 = LLT::scalar(32);
 601
 602     Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
 603     std::array<Register, 4> VectorElems;
 604     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
 605     for (unsigned I = 0; I < NumParts; ++I)
 606       VectorElems[I] =
 607           B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
 608     B.buildMergeValues(MO, VectorElems);
 609     MO.setReg(VectorReg);
 610     return VectorTy;
 611   }
 612   Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
 613   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
 614   auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
 615   B.buildIntToPtr(MO, Scalar);
 616   MO.setReg(BitcastReg);
 617
 618   return VectorTy;
 619 }
 620
 621 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
 622 /// the form in which the value must be in order to be passed to the low-level
 623 /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
 624 /// needed in order to account for the fact that we can't define a register
 625 /// class for s128 without breaking SelectionDAG.
 626 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
 627   MachineRegisterInfo &MRI = *B.getMRI();
 628   const LLT PointerTy = MRI.getType(Pointer);
 629   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
 630   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
 631
 632   if (!PointerTy.isVector()) {
 633     // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
 634     SmallVector<Register, 4> PointerParts;
 635     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
 636     auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
 637     for (unsigned I = 0; I < NumParts; ++I)
 638       PointerParts.push_back(Unmerged.getReg(I));
 639     return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
 640   }
 641   Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
 642   return B.buildBitcast(VectorTy, Scalar).getReg(0);
 643 }
 644
 645 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
 646                                      unsigned Idx) {
 647   MachineOperand &MO = MI.getOperand(Idx);
 648
 649   const LLT PointerTy = B.getMRI()->getType(MO.getReg());
 650   // Paranoidly prevent us from doing this multiple times.
 651   if (!hasBufferRsrcWorkaround(PointerTy))
 652     return;
 653   MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
 654 }
 655
 656 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
 657                                          const GCNTargetMachine &TM)
 658   :  ST(ST_) {
 659   using namespace TargetOpcode;
 660
 661   auto GetAddrSpacePtr = [&TM](unsigned AS) {
 662     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
 663   };
 664
 665   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
 666   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
 667   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
 668   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
 669   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
 670   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
 671   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
 672   const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
 673   const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
 674   const LLT BufferStridedPtr =
 675       GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
 676
 677   const LLT CodePtr = FlatPtr;
 678
 679   const std::initializer_list<LLT> AddrSpaces64 = {
 680     GlobalPtr, ConstantPtr, FlatPtr
 681   };
 682
 683   const std::initializer_list<LLT> AddrSpaces32 = {
 684     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
 685   };
 686
 687   const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
 688
 689   const std::initializer_list<LLT> FPTypesBase = {
 690     S32, S64
 691   };
 692
 693   const std::initializer_list<LLT> FPTypes16 = {
 694     S32, S64, S16
 695   };
 696
 697   const std::initializer_list<LLT> FPTypesPK16 = {
 698     S32, S64, S16, V2S16
 699   };
 700
 701   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
 702
 703   // s1 for VCC branches, s32 for SCC branches.
 704   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
 705
 706   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
 707   // elements for v3s16
 708   getActionDefinitionsBuilder(G_PHI)
 709       .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
 710       .legalFor(AllS32Vectors)
 711       .legalFor(AllS64Vectors)
 712       .legalFor(AddrSpaces64)
 713       .legalFor(AddrSpaces32)
 714       .legalFor(AddrSpaces128)
 715       .legalIf(isPointer(0))
 716       .clampScalar(0, S16, S256)
 717       .widenScalarToNextPow2(0, 32)
 718       .clampMaxNumElements(0, S32, 16)
 719       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 720       .scalarize(0);
 721
 722   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
 723     // Full set of gfx9 features.
 724     if (ST.hasScalarAddSub64()) {
 725       getActionDefinitionsBuilder({G_ADD, G_SUB})
 726           .legalFor({S64, S32, S16, V2S16})
 727           .clampMaxNumElementsStrict(0, S16, 2)
 728           .scalarize(0)
 729           .minScalar(0, S16)
 730           .widenScalarToNextMultipleOf(0, 32)
 731           .maxScalar(0, S32);
 732     } else {
 733       getActionDefinitionsBuilder({G_ADD, G_SUB})
 734           .legalFor({S32, S16, V2S16})
 735           .clampMaxNumElementsStrict(0, S16, 2)
 736           .scalarize(0)
 737           .minScalar(0, S16)
 738           .widenScalarToNextMultipleOf(0, 32)
 739           .maxScalar(0, S32);
 740     }
 741
 742     if (ST.hasScalarSMulU64()) {
 743       getActionDefinitionsBuilder(G_MUL)
 744           .legalFor({S64, S32, S16, V2S16})
 745           .clampMaxNumElementsStrict(0, S16, 2)
 746           .scalarize(0)
 747           .minScalar(0, S16)
 748           .widenScalarToNextMultipleOf(0, 32)
 749           .custom();
 750     } else {
 751       getActionDefinitionsBuilder(G_MUL)
 752           .legalFor({S32, S16, V2S16})
 753           .clampMaxNumElementsStrict(0, S16, 2)
 754           .scalarize(0)
 755           .minScalar(0, S16)
 756           .widenScalarToNextMultipleOf(0, 32)
 757           .custom();
 758     }
 759     assert(ST.hasMad64_32());
 760
 761     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
 762       .legalFor({S32, S16, V2S16}) // Clamp modifier
 763       .minScalarOrElt(0, S16)
 764       .clampMaxNumElementsStrict(0, S16, 2)
 765       .scalarize(0)
 766       .widenScalarToNextPow2(0, 32)
 767       .lower();
 768   } else if (ST.has16BitInsts()) {
 769     getActionDefinitionsBuilder({G_ADD, G_SUB})
 770       .legalFor({S32, S16})
 771       .minScalar(0, S16)
 772       .widenScalarToNextMultipleOf(0, 32)
 773       .maxScalar(0, S32)
 774       .scalarize(0);
 775
 776     getActionDefinitionsBuilder(G_MUL)
 777       .legalFor({S32, S16})
 778       .scalarize(0)
 779       .minScalar(0, S16)
 780       .widenScalarToNextMultipleOf(0, 32)
 781       .custom();
 782     assert(ST.hasMad64_32());
 783
 784     // Technically the saturating operations require clamp bit support, but this
 785     // was introduced at the same time as 16-bit operations.
 786     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
 787       .legalFor({S32, S16}) // Clamp modifier
 788       .minScalar(0, S16)
 789       .scalarize(0)
 790       .widenScalarToNextPow2(0, 16)
 791       .lower();
 792
 793     // We're just lowering this, but it helps get a better result to try to
 794     // coerce to the desired type first.
 795     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
 796       .minScalar(0, S16)
 797       .scalarize(0)
 798       .lower();
 799   } else {
 800     getActionDefinitionsBuilder({G_ADD, G_SUB})
 801       .legalFor({S32})
 802       .widenScalarToNextMultipleOf(0, 32)
 803       .clampScalar(0, S32, S32)
 804       .scalarize(0);
 805
 806     auto &Mul = getActionDefinitionsBuilder(G_MUL)
 807       .legalFor({S32})
 808       .scalarize(0)
 809       .minScalar(0, S32)
 810       .widenScalarToNextMultipleOf(0, 32);
 811
 812     if (ST.hasMad64_32())
 813       Mul.custom();
 814     else
 815       Mul.maxScalar(0, S32);
 816
 817     if (ST.hasIntClamp()) {
 818       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
 819         .legalFor({S32}) // Clamp modifier.
 820         .scalarize(0)
 821         .minScalarOrElt(0, S32)
 822         .lower();
 823     } else {
 824       // Clamp bit support was added in VI, along with 16-bit operations.
 825       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
 826         .minScalar(0, S32)
 827         .scalarize(0)
 828         .lower();
 829     }
 830
 831     // FIXME: DAG expansion gets better results. The widening uses the smaller
 832     // range values and goes for the min/max lowering directly.
 833     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
 834       .minScalar(0, S32)
 835       .scalarize(0)
 836       .lower();
 837   }
 838
 839   getActionDefinitionsBuilder(
 840       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
 841       .customFor({S32, S64})
 842       .clampScalar(0, S32, S64)
 843       .widenScalarToNextPow2(0, 32)
 844       .scalarize(0);
 845
 846   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
 847                    .legalFor({S32})
 848                    .maxScalar(0, S32);
 849
 850   if (ST.hasVOP3PInsts()) {
 851     Mulh
 852       .clampMaxNumElements(0, S8, 2)
 853       .lowerFor({V2S8});
 854   }
 855
 856   Mulh
 857     .scalarize(0)
 858     .lower();
 859
 860   // Report legal for any types we can handle anywhere. For the cases only legal
 861   // on the SALU, RegBankSelect will be able to re-legalize.
 862   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
 863     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
 864     .clampScalar(0, S32, S64)
 865     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 866     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
 867     .widenScalarToNextPow2(0)
 868     .scalarize(0);
 869
 870   getActionDefinitionsBuilder(
 871       {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
 872       .legalFor({{S32, S1}, {S32, S32}})
 873       .clampScalar(0, S32, S32)
 874       .scalarize(0);
 875
 876   getActionDefinitionsBuilder(G_BITCAST)
 877       // Don't worry about the size constraint.
 878       .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
 879       .lower();
 880
 881   getActionDefinitionsBuilder(G_CONSTANT)
 882     .legalFor({S1, S32, S64, S16, GlobalPtr,
 883                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
 884     .legalIf(isPointer(0))
 885     .clampScalar(0, S32, S64)
 886     .widenScalarToNextPow2(0);
 887
 888   getActionDefinitionsBuilder(G_FCONSTANT)
 889     .legalFor({S32, S64, S16})
 890     .clampScalar(0, S16, S64);
 891
 892   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
 893       .legalIf(isRegisterClassType(0))
 894       // s1 and s16 are special cases because they have legal operations on
 895       // them, but don't really occupy registers in the normal way.
 896       .legalFor({S1, S16})
 897       .clampNumElements(0, V16S32, V32S32)
 898       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 899       .clampScalarOrElt(0, S32, MaxScalar)
 900       .widenScalarToNextPow2(0, 32)
 901       .clampMaxNumElements(0, S32, 16);
 902
 903   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
 904
 905   // If the amount is divergent, we have to do a wave reduction to get the
 906   // maximum value, so this is expanded during RegBankSelect.
 907   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
 908     .legalFor({{PrivatePtr, S32}});
 909
 910   getActionDefinitionsBuilder(G_STACKSAVE)
 911     .customFor({PrivatePtr});
 912   getActionDefinitionsBuilder(G_STACKRESTORE)
 913     .legalFor({PrivatePtr});
 914
 915   getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
 916
 917   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
 918     .customIf(typeIsNot(0, PrivatePtr));
 919
 920   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
 921
 922   auto &FPOpActions = getActionDefinitionsBuilder(
 923     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
 924       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
 925     .legalFor({S32, S64});
 926   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
 927     .customFor({S32, S64});
 928   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
 929     .customFor({S32, S64});
 930
 931   if (ST.has16BitInsts()) {
 932     if (ST.hasVOP3PInsts())
 933       FPOpActions.legalFor({S16, V2S16});
 934     else
 935       FPOpActions.legalFor({S16});
 936
 937     TrigActions.customFor({S16});
 938     FDIVActions.customFor({S16});
 939   }
 940
 941   if (ST.hasPackedFP32Ops()) {
 942     FPOpActions.legalFor({V2S32});
 943     FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
 944   }
 945
 946   auto &MinNumMaxNum = getActionDefinitionsBuilder({
 947       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
 948
 949   if (ST.hasVOP3PInsts()) {
 950     MinNumMaxNum.customFor(FPTypesPK16)
 951       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 952       .clampMaxNumElements(0, S16, 2)
 953       .clampScalar(0, S16, S64)
 954       .scalarize(0);
 955   } else if (ST.has16BitInsts()) {
 956     MinNumMaxNum.customFor(FPTypes16)
 957       .clampScalar(0, S16, S64)
 958       .scalarize(0);
 959   } else {
 960     MinNumMaxNum.customFor(FPTypesBase)
 961       .clampScalar(0, S32, S64)
 962       .scalarize(0);
 963   }
 964
 965   if (ST.hasVOP3PInsts())
 966     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
 967
 968   FPOpActions
 969     .scalarize(0)
 970     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
 971
 972   TrigActions
 973     .scalarize(0)
 974     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
 975
 976   FDIVActions
 977     .scalarize(0)
 978     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
 979
 980   getActionDefinitionsBuilder({G_FNEG, G_FABS})
 981     .legalFor(FPTypesPK16)
 982     .clampMaxNumElementsStrict(0, S16, 2)
 983     .scalarize(0)
 984     .clampScalar(0, S16, S64);
 985
 986   if (ST.has16BitInsts()) {
 987     getActionDefinitionsBuilder(G_FSQRT)
 988       .legalFor({S16})
 989       .customFor({S32, S64})
 990       .scalarize(0)
 991       .unsupported();
 992     getActionDefinitionsBuilder(G_FFLOOR)
 993       .legalFor({S32, S64, S16})
 994       .scalarize(0)
 995       .clampScalar(0, S16, S64);
 996
 997     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
 998       .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
 999       .scalarize(0)
1000       .maxScalarIf(typeIs(0, S16), 1, S16)
1001       .clampScalar(1, S32, S32)
1002       .lower();
1003
1004     getActionDefinitionsBuilder(G_FFREXP)
1005       .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1006       .scalarize(0)
1007       .lower();
1008   } else {
1009     getActionDefinitionsBuilder(G_FSQRT)
1010       .customFor({S32, S64, S16})
1011       .scalarize(0)
1012       .unsupported();
1013
1014
1015     if (ST.hasFractBug()) {
1016       getActionDefinitionsBuilder(G_FFLOOR)
1017         .customFor({S64})
1018         .legalFor({S32, S64})
1019         .scalarize(0)
1020         .clampScalar(0, S32, S64);
1021     } else {
1022       getActionDefinitionsBuilder(G_FFLOOR)
1023         .legalFor({S32, S64})
1024         .scalarize(0)
1025         .clampScalar(0, S32, S64);
1026     }
1027
1028     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1029       .legalFor({{S32, S32}, {S64, S32}})
1030       .scalarize(0)
1031       .clampScalar(0, S32, S64)
1032       .clampScalar(1, S32, S32)
1033       .lower();
1034
1035     getActionDefinitionsBuilder(G_FFREXP)
1036       .customFor({{S32, S32}, {S64, S32}})
1037       .scalarize(0)
1038       .minScalar(0, S32)
1039       .clampScalar(1, S32, S32)
1040       .lower();
1041   }
1042
1043   getActionDefinitionsBuilder(G_FPTRUNC)
1044     .legalFor({{S32, S64}, {S16, S32}})
1045     .scalarize(0)
1046     .lower();
1047
1048   getActionDefinitionsBuilder(G_FPEXT)
1049     .legalFor({{S64, S32}, {S32, S16}})
1050     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1051     .scalarize(0);
1052
1053   auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1054   if (ST.has16BitInsts()) {
1055     FSubActions
1056       // Use actual fsub instruction
1057       .legalFor({S32, S16})
1058       // Must use fadd + fneg
1059       .lowerFor({S64, V2S16});
1060   } else {
1061     FSubActions
1062       // Use actual fsub instruction
1063       .legalFor({S32})
1064       // Must use fadd + fneg
1065       .lowerFor({S64, S16, V2S16});
1066   }
1067
1068   FSubActions
1069     .scalarize(0)
1070     .clampScalar(0, S32, S64);
1071
1072   // Whether this is legal depends on the floating point mode for the function.
1073   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1074   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1075     FMad.customFor({S32, S16});
1076   else if (ST.hasMadMacF32Insts())
1077     FMad.customFor({S32});
1078   else if (ST.hasMadF16())
1079     FMad.customFor({S16});
1080   FMad.scalarize(0)
1081       .lower();
1082
1083   auto &FRem = getActionDefinitionsBuilder(G_FREM);
1084   if (ST.has16BitInsts()) {
1085     FRem.customFor({S16, S32, S64});
1086   } else {
1087     FRem.minScalar(0, S32)
1088         .customFor({S32, S64});
1089   }
1090   FRem.scalarize(0);
1091
1092   // TODO: Do we need to clamp maximum bitwidth?
1093   getActionDefinitionsBuilder(G_TRUNC)
1094     .legalIf(isScalar(0))
1095     .legalFor({{V2S16, V2S32}})
1096     .clampMaxNumElements(0, S16, 2)
1097     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1098     // situations (like an invalid implicit use), we don't want to infinite loop
1099     // in the legalizer.
1100     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1101     .alwaysLegal();
1102
1103   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1104     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1105                {S32, S1}, {S64, S1}, {S16, S1}})
1106     .scalarize(0)
1107     .clampScalar(0, S32, S64)
1108     .widenScalarToNextPow2(1, 32);
1109
1110   // TODO: Split s1->s64 during regbankselect for VALU.
1111   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1112                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1113                     .lowerIf(typeIs(1, S1))
1114                     .customFor({{S32, S64}, {S64, S64}});
1115   if (ST.has16BitInsts())
1116     IToFP.legalFor({{S16, S16}});
1117   IToFP.clampScalar(1, S32, S64)
1118        .minScalar(0, S32)
1119        .scalarize(0)
1120        .widenScalarToNextPow2(1);
1121
1122   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1123     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1124     .customFor({{S64, S32}, {S64, S64}})
1125     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1126   if (ST.has16BitInsts())
1127     FPToI.legalFor({{S16, S16}});
1128   else
1129     FPToI.minScalar(1, S32);
1130
1131   FPToI.minScalar(0, S32)
1132        .widenScalarToNextPow2(0, 32)
1133        .scalarize(0)
1134        .lower();
1135
1136   getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1137       .clampScalar(0, S16, S64)
1138       .scalarize(0)
1139       .lower();
1140
1141   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1142       .legalFor({S16, S32})
1143       .scalarize(0)
1144       .lower();
1145
1146   // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1147   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1148       .scalarize(0)
1149       .lower();
1150
1151   getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1152       .clampScalar(0, S16, S64)
1153       .scalarize(0)
1154       .lower();
1155
1156   if (ST.has16BitInsts()) {
1157     getActionDefinitionsBuilder(
1158         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1159         .legalFor({S16, S32, S64})
1160         .clampScalar(0, S16, S64)
1161         .scalarize(0);
1162   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1163     getActionDefinitionsBuilder(
1164         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1165         .legalFor({S32, S64})
1166         .clampScalar(0, S32, S64)
1167         .scalarize(0);
1168   } else {
1169     getActionDefinitionsBuilder(
1170         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1171         .legalFor({S32})
1172         .customFor({S64})
1173         .clampScalar(0, S32, S64)
1174         .scalarize(0);
1175   }
1176
1177   getActionDefinitionsBuilder(G_PTR_ADD)
1178       .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1179       .legalIf(all(isPointer(0), sameSize(0, 1)))
1180       .scalarize(0)
1181       .scalarSameSizeAs(1, 0);
1182
1183   getActionDefinitionsBuilder(G_PTRMASK)
1184     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1185     .scalarSameSizeAs(1, 0)
1186     .scalarize(0);
1187
1188   auto &CmpBuilder =
1189     getActionDefinitionsBuilder(G_ICMP)
1190     // The compare output type differs based on the register bank of the output,
1191     // so make both s1 and s32 legal.
1192     //
1193     // Scalar compares producing output in scc will be promoted to s32, as that
1194     // is the allocatable register type that will be needed for the copy from
1195     // scc. This will be promoted during RegBankSelect, and we assume something
1196     // before that won't try to use s32 result types.
1197     //
1198     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1199     // bank.
1200     .legalForCartesianProduct(
1201       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1202     .legalForCartesianProduct(
1203       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1204   if (ST.has16BitInsts()) {
1205     CmpBuilder.legalFor({{S1, S16}});
1206   }
1207
1208   CmpBuilder
1209     .widenScalarToNextPow2(1)
1210     .clampScalar(1, S32, S64)
1211     .scalarize(0)
1212     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1213
1214   auto &FCmpBuilder =
1215       getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1216           {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1217
1218   if (ST.hasSALUFloatInsts())
1219     FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1220
1221   FCmpBuilder
1222     .widenScalarToNextPow2(1)
1223     .clampScalar(1, S32, S64)
1224     .scalarize(0);
1225
1226   // FIXME: fpow has a selection pattern that should move to custom lowering.
1227   auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1228   if (ST.has16BitInsts())
1229     ExpOps.customFor({{S32}, {S16}});
1230   else
1231     ExpOps.customFor({S32});
1232   ExpOps.clampScalar(0, MinScalarFPTy, S32)
1233         .scalarize(0);
1234
1235   getActionDefinitionsBuilder(G_FPOWI)
1236     .clampScalar(0, MinScalarFPTy, S32)
1237     .lower();
1238
1239   auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1240   Log2Ops.customFor({S32});
1241   if (ST.has16BitInsts())
1242     Log2Ops.legalFor({S16});
1243   else
1244     Log2Ops.customFor({S16});
1245   Log2Ops.scalarize(0)
1246     .lower();
1247
1248   auto &LogOps =
1249       getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1250   LogOps.customFor({S32, S16});
1251   LogOps.clampScalar(0, MinScalarFPTy, S32)
1252         .scalarize(0);
1253
1254   // The 64-bit versions produce 32-bit results, but only on the SALU.
1255   getActionDefinitionsBuilder(G_CTPOP)
1256     .legalFor({{S32, S32}, {S32, S64}})
1257     .clampScalar(0, S32, S32)
1258     .widenScalarToNextPow2(1, 32)
1259     .clampScalar(1, S32, S64)
1260     .scalarize(0)
1261     .widenScalarToNextPow2(0, 32);
1262
1263   // If no 16 bit instr is available, lower into different instructions.
1264   if (ST.has16BitInsts())
1265     getActionDefinitionsBuilder(G_IS_FPCLASS)
1266         .legalForCartesianProduct({S1}, FPTypes16)
1267         .widenScalarToNextPow2(1)
1268         .scalarize(0)
1269         .lower();
1270   else
1271     getActionDefinitionsBuilder(G_IS_FPCLASS)
1272         .legalForCartesianProduct({S1}, FPTypesBase)
1273         .lowerFor({S1, S16})
1274         .widenScalarToNextPow2(1)
1275         .scalarize(0)
1276         .lower();
1277
1278   // The hardware instructions return a different result on 0 than the generic
1279   // instructions expect. The hardware produces -1, but these produce the
1280   // bitwidth.
1281   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1282     .scalarize(0)
1283     .clampScalar(0, S32, S32)
1284     .clampScalar(1, S32, S64)
1285     .widenScalarToNextPow2(0, 32)
1286     .widenScalarToNextPow2(1, 32)
1287     .custom();
1288
1289   // The 64-bit versions produce 32-bit results, but only on the SALU.
1290   getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1291       .legalFor({{S32, S32}, {S32, S64}})
1292       .customIf(scalarNarrowerThan(1, 32))
1293       .clampScalar(0, S32, S32)
1294       .clampScalar(1, S32, S64)
1295       .scalarize(0)
1296       .widenScalarToNextPow2(0, 32)
1297       .widenScalarToNextPow2(1, 32);
1298
1299   getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1300       .legalFor({{S32, S32}, {S32, S64}})
1301       .clampScalar(0, S32, S32)
1302       .clampScalar(1, S32, S64)
1303       .scalarize(0)
1304       .widenScalarToNextPow2(0, 32)
1305       .widenScalarToNextPow2(1, 32);
1306
1307   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1308   // RegBankSelect.
1309   getActionDefinitionsBuilder(G_BITREVERSE)
1310     .legalFor({S32, S64})
1311     .clampScalar(0, S32, S64)
1312     .scalarize(0)
1313     .widenScalarToNextPow2(0);
1314
1315   if (ST.has16BitInsts()) {
1316     getActionDefinitionsBuilder(G_BSWAP)
1317       .legalFor({S16, S32, V2S16})
1318       .clampMaxNumElementsStrict(0, S16, 2)
1319       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1320       // narrowScalar limitation.
1321       .widenScalarToNextPow2(0)
1322       .clampScalar(0, S16, S32)
1323       .scalarize(0);
1324
1325     if (ST.hasVOP3PInsts()) {
1326       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1327         .legalFor({S32, S16, V2S16})
1328         .clampMaxNumElements(0, S16, 2)
1329         .minScalar(0, S16)
1330         .widenScalarToNextPow2(0)
1331         .scalarize(0)
1332         .lower();
1333     } else {
1334       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1335         .legalFor({S32, S16})
1336         .widenScalarToNextPow2(0)
1337         .minScalar(0, S16)
1338         .scalarize(0)
1339         .lower();
1340     }
1341   } else {
1342     // TODO: Should have same legality without v_perm_b32
1343     getActionDefinitionsBuilder(G_BSWAP)
1344       .legalFor({S32})
1345       .lowerIf(scalarNarrowerThan(0, 32))
1346       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1347       // narrowScalar limitation.
1348       .widenScalarToNextPow2(0)
1349       .maxScalar(0, S32)
1350       .scalarize(0)
1351       .lower();
1352
1353     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1354       .legalFor({S32})
1355       .minScalar(0, S32)
1356       .widenScalarToNextPow2(0)
1357       .scalarize(0)
1358       .lower();
1359   }
1360
1361   getActionDefinitionsBuilder(G_INTTOPTR)
1362       // List the common cases
1363       .legalForCartesianProduct(AddrSpaces64, {S64})
1364       .legalForCartesianProduct(AddrSpaces32, {S32})
1365       .scalarize(0)
1366       // Accept any address space as long as the size matches
1367       .legalIf(sameSize(0, 1))
1368       .widenScalarIf(smallerThan(1, 0),
1369                      [](const LegalityQuery &Query) {
1370                        return std::pair(
1371                            1, LLT::scalar(Query.Types[0].getSizeInBits()));
1372                      })
1373       .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1374         return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1375       });
1376
1377   getActionDefinitionsBuilder(G_PTRTOINT)
1378       // List the common cases
1379       .legalForCartesianProduct(AddrSpaces64, {S64})
1380       .legalForCartesianProduct(AddrSpaces32, {S32})
1381       .scalarize(0)
1382       // Accept any address space as long as the size matches
1383       .legalIf(sameSize(0, 1))
1384       .widenScalarIf(smallerThan(0, 1),
1385                      [](const LegalityQuery &Query) {
1386                        return std::pair(
1387                            0, LLT::scalar(Query.Types[1].getSizeInBits()));
1388                      })
1389       .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1390         return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1391       });
1392
1393   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1394     .scalarize(0)
1395     .custom();
1396
1397   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1398                                     bool IsLoad) -> bool {
1399     const LLT DstTy = Query.Types[0];
1400
1401     // Split vector extloads.
1402     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1403
1404     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1405       return true;
1406
1407     const LLT PtrTy = Query.Types[1];
1408     unsigned AS = PtrTy.getAddressSpace();
1409     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1410                                       Query.MMODescrs[0].Ordering !=
1411                                           AtomicOrdering::NotAtomic))
1412       return true;
1413
1414     // Catch weird sized loads that don't evenly divide into the access sizes
1415     // TODO: May be able to widen depending on alignment etc.
1416     unsigned NumRegs = (MemSize + 31) / 32;
1417     if (NumRegs == 3) {
1418       if (!ST.hasDwordx3LoadStores())
1419         return true;
1420     } else {
1421       // If the alignment allows, these should have been widened.
1422       if (!isPowerOf2_32(NumRegs))
1423         return true;
1424     }
1425
1426     return false;
1427   };
1428
1429   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1430   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1431   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1432
1433   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1434   // LDS
1435   // TODO: Unsupported flat for SI.
1436
1437   for (unsigned Op : {G_LOAD, G_STORE}) {
1438     const bool IsStore = Op == G_STORE;
1439
1440     auto &Actions = getActionDefinitionsBuilder(Op);
1441     // Explicitly list some common cases.
1442     // TODO: Does this help compile time at all?
1443     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1444                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1445                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1446                                       {S64, GlobalPtr, S64, GlobalAlign32},
1447                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1448                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1449                                       {S32, GlobalPtr, S8, GlobalAlign8},
1450                                       {S32, GlobalPtr, S16, GlobalAlign16},
1451
1452                                       {S32, LocalPtr, S32, 32},
1453                                       {S64, LocalPtr, S64, 32},
1454                                       {V2S32, LocalPtr, V2S32, 32},
1455                                       {S32, LocalPtr, S8, 8},
1456                                       {S32, LocalPtr, S16, 16},
1457                                       {V2S16, LocalPtr, S32, 32},
1458
1459                                       {S32, PrivatePtr, S32, 32},
1460                                       {S32, PrivatePtr, S8, 8},
1461                                       {S32, PrivatePtr, S16, 16},
1462                                       {V2S16, PrivatePtr, S32, 32},
1463
1464                                       {S32, ConstantPtr, S32, GlobalAlign32},
1465                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1466                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1467                                       {S64, ConstantPtr, S64, GlobalAlign32},
1468                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1469     Actions.legalIf(
1470       [=](const LegalityQuery &Query) -> bool {
1471         return isLoadStoreLegal(ST, Query);
1472       });
1473
1474     // The custom pointers (fat pointers, buffer resources) don't work with load
1475     // and store at this level. Fat pointers should have been lowered to
1476     // intrinsics before the translation to MIR.
1477     Actions.unsupportedIf(
1478         typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1479
1480     // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1481     // ptrtoint. This is needed to account for the fact that we can't have i128
1482     // as a register class for SelectionDAG reasons.
1483     Actions.customIf([=](const LegalityQuery &Query) -> bool {
1484       return hasBufferRsrcWorkaround(Query.Types[0]);
1485     });
1486
1487     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1488     // 64-bits.
1489     //
1490     // TODO: Should generalize bitcast action into coerce, which will also cover
1491     // inserting addrspacecasts.
1492     Actions.customIf(typeIs(1, Constant32Ptr));
1493
1494     // Turn any illegal element vectors into something easier to deal
1495     // with. These will ultimately produce 32-bit scalar shifts to extract the
1496     // parts anyway.
1497     //
1498     // For odd 16-bit element vectors, prefer to split those into pieces with
1499     // 16-bit vector parts.
1500     Actions.bitcastIf(
1501       [=](const LegalityQuery &Query) -> bool {
1502         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1503                                           Query.MMODescrs[0].MemoryTy);
1504       }, bitcastToRegisterType(0));
1505
1506     if (!IsStore) {
1507       // Widen suitably aligned loads by loading extra bytes. The standard
1508       // legalization actions can't properly express widening memory operands.
1509       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1510         return shouldWidenLoad(ST, Query, G_LOAD);
1511       });
1512     }
1513
1514     // FIXME: load/store narrowing should be moved to lower action
1515     Actions
1516         .narrowScalarIf(
1517             [=](const LegalityQuery &Query) -> bool {
1518               return !Query.Types[0].isVector() &&
1519                      needToSplitMemOp(Query, Op == G_LOAD);
1520             },
1521             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1522               const LLT DstTy = Query.Types[0];
1523               const LLT PtrTy = Query.Types[1];
1524
1525               const unsigned DstSize = DstTy.getSizeInBits();
1526               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1527
1528               // Split extloads.
1529               if (DstSize > MemSize)
1530                 return std::pair(0, LLT::scalar(MemSize));
1531
1532               unsigned MaxSize = maxSizeForAddrSpace(
1533                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1534                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1535               if (MemSize > MaxSize)
1536                 return std::pair(0, LLT::scalar(MaxSize));
1537
1538               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1539               return std::pair(0, LLT::scalar(Align));
1540             })
1541         .fewerElementsIf(
1542             [=](const LegalityQuery &Query) -> bool {
1543               return Query.Types[0].isVector() &&
1544                      needToSplitMemOp(Query, Op == G_LOAD);
1545             },
1546             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1547               const LLT DstTy = Query.Types[0];
1548               const LLT PtrTy = Query.Types[1];
1549
1550               LLT EltTy = DstTy.getElementType();
1551               unsigned MaxSize = maxSizeForAddrSpace(
1552                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1553                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1554
1555               // FIXME: Handle widened to power of 2 results better. This ends
1556               // up scalarizing.
1557               // FIXME: 3 element stores scalarized on SI
1558
1559               // Split if it's too large for the address space.
1560               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1561               if (MemSize > MaxSize) {
1562                 unsigned NumElts = DstTy.getNumElements();
1563                 unsigned EltSize = EltTy.getSizeInBits();
1564
1565                 if (MaxSize % EltSize == 0) {
1566                   return std::pair(
1567                       0, LLT::scalarOrVector(
1568                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1569                 }
1570
1571                 unsigned NumPieces = MemSize / MaxSize;
1572
1573                 // FIXME: Refine when odd breakdowns handled
1574                 // The scalars will need to be re-legalized.
1575                 if (NumPieces == 1 || NumPieces >= NumElts ||
1576                     NumElts % NumPieces != 0)
1577                   return std::pair(0, EltTy);
1578
1579                 return std::pair(0,
1580                                  LLT::fixed_vector(NumElts / NumPieces, EltTy));
1581               }
1582
1583               // FIXME: We could probably handle weird extending loads better.
1584               if (DstTy.getSizeInBits() > MemSize)
1585                 return std::pair(0, EltTy);
1586
1587               unsigned EltSize = EltTy.getSizeInBits();
1588               unsigned DstSize = DstTy.getSizeInBits();
1589               if (!isPowerOf2_32(DstSize)) {
1590                 // We're probably decomposing an odd sized store. Try to split
1591                 // to the widest type. TODO: Account for alignment. As-is it
1592                 // should be OK, since the new parts will be further legalized.
1593                 unsigned FloorSize = llvm::bit_floor(DstSize);
1594                 return std::pair(
1595                     0, LLT::scalarOrVector(
1596                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1597               }
1598
1599               // May need relegalization for the scalars.
1600               return std::pair(0, EltTy);
1601             })
1602     .minScalar(0, S32)
1603     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1604     .widenScalarToNextPow2(0)
1605     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1606     .lower();
1607   }
1608
1609   // FIXME: Unaligned accesses not lowered.
1610   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1611                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1612                                                   {S32, GlobalPtr, S16, 2 * 8},
1613                                                   {S32, LocalPtr, S8, 8},
1614                                                   {S32, LocalPtr, S16, 16},
1615                                                   {S32, PrivatePtr, S8, 8},
1616                                                   {S32, PrivatePtr, S16, 16},
1617                                                   {S32, ConstantPtr, S8, 8},
1618                                                   {S32, ConstantPtr, S16, 2 * 8}})
1619                        .legalIf(
1620                          [=](const LegalityQuery &Query) -> bool {
1621                            return isLoadStoreLegal(ST, Query);
1622                          });
1623
1624   if (ST.hasFlatAddressSpace()) {
1625     ExtLoads.legalForTypesWithMemDesc(
1626         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1627   }
1628
1629   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1630   // 64-bits.
1631   //
1632   // TODO: Should generalize bitcast action into coerce, which will also cover
1633   // inserting addrspacecasts.
1634   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1635
1636   ExtLoads.clampScalar(0, S32, S32)
1637           .widenScalarToNextPow2(0)
1638           .lower();
1639
1640   auto &Atomics = getActionDefinitionsBuilder(
1641     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1642      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1643      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1644      G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1645     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1646                {S64, GlobalPtr}, {S64, LocalPtr},
1647                {S32, RegionPtr}, {S64, RegionPtr}});
1648   if (ST.hasFlatAddressSpace()) {
1649     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1650   }
1651
1652   // TODO: v2bf16 operations, and fat buffer pointer support.
1653   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1654   if (ST.hasLDSFPAtomicAddF32()) {
1655     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1656     if (ST.hasLdsAtomicAddF64())
1657       Atomic.legalFor({{S64, LocalPtr}});
1658     if (ST.hasAtomicDsPkAdd16Insts())
1659       Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1660   }
1661   if (ST.hasAtomicFaddInsts())
1662     Atomic.legalFor({{S32, GlobalPtr}});
1663   if (ST.hasFlatAtomicFaddF32Inst())
1664     Atomic.legalFor({{S32, FlatPtr}});
1665
1666   if (ST.hasGFX90AInsts()) {
1667     // These are legal with some caveats, and should have undergone expansion in
1668     // the IR in most situations
1669     // TODO: Move atomic expansion into legalizer
1670     Atomic.legalFor({
1671         {S32, GlobalPtr},
1672         {S64, GlobalPtr},
1673         {S64, FlatPtr}
1674       });
1675   }
1676
1677   if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1678       ST.hasAtomicBufferGlobalPkAddF16Insts())
1679     Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1680   if (ST.hasAtomicGlobalPkAddBF16Inst())
1681     Atomic.legalFor({{V2BF16, GlobalPtr}});
1682   if (ST.hasAtomicFlatPkAdd16Insts())
1683     Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1684
1685
1686   // Most of the legalization work here is done by AtomicExpand. We could
1687   // probably use a simpler legality rule that just assumes anything is OK.
1688   auto &AtomicFMinFMax =
1689     getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1690     .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1691
1692   if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1693     AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1694   if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1695     AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1696   if (ST.hasAtomicFMinFMaxF32FlatInsts())
1697     AtomicFMinFMax.legalFor({F32, FlatPtr});
1698   if (ST.hasAtomicFMinFMaxF64FlatInsts())
1699     AtomicFMinFMax.legalFor({F64, FlatPtr});
1700
1701   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1702   // demarshalling
1703   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1704     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1705                 {S32, FlatPtr}, {S64, FlatPtr}})
1706     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1707                {S32, RegionPtr}, {S64, RegionPtr}});
1708   // TODO: Pointer types, any 32-bit or 64-bit vector
1709
1710   // Condition should be s32 for scalar, s1 for vector.
1711   getActionDefinitionsBuilder(G_SELECT)
1712       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1713                                  LocalPtr, FlatPtr, PrivatePtr,
1714                                  LLT::fixed_vector(2, LocalPtr),
1715                                  LLT::fixed_vector(2, PrivatePtr)},
1716                                 {S1, S32})
1717       .clampScalar(0, S16, S64)
1718       .scalarize(1)
1719       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1720       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1721       .clampMaxNumElements(0, S32, 2)
1722       .clampMaxNumElements(0, LocalPtr, 2)
1723       .clampMaxNumElements(0, PrivatePtr, 2)
1724       .scalarize(0)
1725       .widenScalarToNextPow2(0)
1726       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1727
1728   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1729   // be more flexible with the shift amount type.
1730   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1731     .legalFor({{S32, S32}, {S64, S32}});
1732   if (ST.has16BitInsts()) {
1733     if (ST.hasVOP3PInsts()) {
1734       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1735             .clampMaxNumElements(0, S16, 2);
1736     } else
1737       Shifts.legalFor({{S16, S16}});
1738
1739     // TODO: Support 16-bit shift amounts for all types
1740     Shifts.widenScalarIf(
1741       [=](const LegalityQuery &Query) {
1742         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1743         // 32-bit amount.
1744         const LLT ValTy = Query.Types[0];
1745         const LLT AmountTy = Query.Types[1];
1746         return ValTy.getSizeInBits() <= 16 &&
1747                AmountTy.getSizeInBits() < 16;
1748       }, changeTo(1, S16));
1749     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1750     Shifts.clampScalar(1, S32, S32);
1751     Shifts.widenScalarToNextPow2(0, 16);
1752     Shifts.clampScalar(0, S16, S64);
1753
1754     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1755       .minScalar(0, S16)
1756       .scalarize(0)
1757       .lower();
1758   } else {
1759     // Make sure we legalize the shift amount type first, as the general
1760     // expansion for the shifted type will produce much worse code if it hasn't
1761     // been truncated already.
1762     Shifts.clampScalar(1, S32, S32);
1763     Shifts.widenScalarToNextPow2(0, 32);
1764     Shifts.clampScalar(0, S32, S64);
1765
1766     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1767       .minScalar(0, S32)
1768       .scalarize(0)
1769       .lower();
1770   }
1771   Shifts.scalarize(0);
1772
1773   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1774     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1775     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1776     unsigned IdxTypeIdx = 2;
1777
1778     getActionDefinitionsBuilder(Op)
1779       .customIf([=](const LegalityQuery &Query) {
1780           const LLT EltTy = Query.Types[EltTypeIdx];
1781           const LLT VecTy = Query.Types[VecTypeIdx];
1782           const LLT IdxTy = Query.Types[IdxTypeIdx];
1783           const unsigned EltSize = EltTy.getSizeInBits();
1784           const bool isLegalVecType =
1785               !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1786           // Address space 8 pointers are 128-bit wide values, but the logic
1787           // below will try to bitcast them to 2N x s64, which will fail.
1788           // Therefore, as an intermediate step, wrap extracts/insertions from a
1789           // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1790           // extraction result) in order to produce a vector operation that can
1791           // be handled by the logic below.
1792           if (EltTy.isPointer() && EltSize > 64)
1793             return true;
1794           return (EltSize == 32 || EltSize == 64) &&
1795                   VecTy.getSizeInBits() % 32 == 0 &&
1796                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1797                   IdxTy.getSizeInBits() == 32 &&
1798                   isLegalVecType;
1799         })
1800       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1801                  bitcastToVectorElement32(VecTypeIdx))
1802       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1803       .bitcastIf(
1804         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1805         [=](const LegalityQuery &Query) {
1806           // For > 64-bit element types, try to turn this into a 64-bit
1807           // element vector since we may be able to do better indexing
1808           // if this is scalar. If not, fall back to 32.
1809           const LLT EltTy = Query.Types[EltTypeIdx];
1810           const LLT VecTy = Query.Types[VecTypeIdx];
1811           const unsigned DstEltSize = EltTy.getSizeInBits();
1812           const unsigned VecSize = VecTy.getSizeInBits();
1813
1814           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1815           return std::pair(
1816               VecTypeIdx,
1817               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1818         })
1819       .clampScalar(EltTypeIdx, S32, S64)
1820       .clampScalar(VecTypeIdx, S32, S64)
1821       .clampScalar(IdxTypeIdx, S32, S32)
1822       .clampMaxNumElements(VecTypeIdx, S32, 32)
1823       // TODO: Clamp elements for 64-bit vectors?
1824       .moreElementsIf(
1825         isIllegalRegisterType(VecTypeIdx),
1826         moreElementsToNextExistingRegClass(VecTypeIdx))
1827       // It should only be necessary with variable indexes.
1828       // As a last resort, lower to the stack
1829       .lower();
1830   }
1831
1832   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1833     .unsupportedIf([=](const LegalityQuery &Query) {
1834         const LLT &EltTy = Query.Types[1].getElementType();
1835         return Query.Types[0] != EltTy;
1836       });
1837
1838   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1839     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1840     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1841
1842     // FIXME: Doesn't handle extract of illegal sizes.
1843     getActionDefinitionsBuilder(Op)
1844       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1845       .lowerIf([=](const LegalityQuery &Query) {
1846           // Sub-vector(or single element) insert and extract.
1847           // TODO: verify immediate offset here since lower only works with
1848           // whole elements.
1849           const LLT BigTy = Query.Types[BigTyIdx];
1850           return BigTy.isVector();
1851         })
1852       // FIXME: Multiples of 16 should not be legal.
1853       .legalIf([=](const LegalityQuery &Query) {
1854           const LLT BigTy = Query.Types[BigTyIdx];
1855           const LLT LitTy = Query.Types[LitTyIdx];
1856           return (BigTy.getSizeInBits() % 32 == 0) &&
1857                  (LitTy.getSizeInBits() % 16 == 0);
1858         })
1859       .widenScalarIf(
1860         [=](const LegalityQuery &Query) {
1861           const LLT BigTy = Query.Types[BigTyIdx];
1862           return (BigTy.getScalarSizeInBits() < 16);
1863         },
1864         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1865       .widenScalarIf(
1866         [=](const LegalityQuery &Query) {
1867           const LLT LitTy = Query.Types[LitTyIdx];
1868           return (LitTy.getScalarSizeInBits() < 16);
1869         },
1870         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1871       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1872       .widenScalarToNextPow2(BigTyIdx, 32);
1873
1874   }
1875
1876   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1877     .legalForCartesianProduct(AllS32Vectors, {S32})
1878     .legalForCartesianProduct(AllS64Vectors, {S64})
1879     .clampNumElements(0, V16S32, V32S32)
1880     .clampNumElements(0, V2S64, V16S64)
1881     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1882     .moreElementsIf(
1883       isIllegalRegisterType(0),
1884       moreElementsToNextExistingRegClass(0));
1885
1886   if (ST.hasScalarPackInsts()) {
1887     BuildVector
1888       // FIXME: Should probably widen s1 vectors straight to s32
1889       .minScalarOrElt(0, S16)
1890       .minScalar(1, S16);
1891
1892     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1893       .legalFor({V2S16, S32})
1894       .lower();
1895   } else {
1896     BuildVector.customFor({V2S16, S16});
1897     BuildVector.minScalarOrElt(0, S32);
1898
1899     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1900       .customFor({V2S16, S32})
1901       .lower();
1902   }
1903
1904   BuildVector.legalIf(isRegisterType(0));
1905
1906   // FIXME: Clamp maximum size
1907   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1908     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1909     .clampMaxNumElements(0, S32, 32)
1910     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1911     .clampMaxNumElements(0, S16, 64);
1912
1913   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1914
1915   // Merge/Unmerge
1916   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1917     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1918     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1919
1920     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1921       const LLT Ty = Query.Types[TypeIdx];
1922       if (Ty.isVector()) {
1923         const LLT &EltTy = Ty.getElementType();
1924         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1925           return true;
1926         if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1927           return true;
1928       }
1929       return false;
1930     };
1931
1932     auto &Builder = getActionDefinitionsBuilder(Op)
1933       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1934       .lowerFor({{S16, V2S16}})
1935       .lowerIf([=](const LegalityQuery &Query) {
1936           const LLT BigTy = Query.Types[BigTyIdx];
1937           return BigTy.getSizeInBits() == 32;
1938         })
1939       // Try to widen to s16 first for small types.
1940       // TODO: Only do this on targets with legal s16 shifts
1941       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1942       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1943       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1944       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1945                            elementTypeIs(1, S16)),
1946                        changeTo(1, V2S16))
1947       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1948       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1949       // valid.
1950       .clampScalar(LitTyIdx, S32, S512)
1951       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1952       // Break up vectors with weird elements into scalars
1953       .fewerElementsIf(
1954         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1955         scalarize(0))
1956       .fewerElementsIf(
1957         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1958         scalarize(1))
1959       .clampScalar(BigTyIdx, S32, MaxScalar);
1960
1961     if (Op == G_MERGE_VALUES) {
1962       Builder.widenScalarIf(
1963         // TODO: Use 16-bit shifts if legal for 8-bit values?
1964         [=](const LegalityQuery &Query) {
1965           const LLT Ty = Query.Types[LitTyIdx];
1966           return Ty.getSizeInBits() < 32;
1967         },
1968         changeTo(LitTyIdx, S32));
1969     }
1970
1971     Builder.widenScalarIf(
1972       [=](const LegalityQuery &Query) {
1973         const LLT Ty = Query.Types[BigTyIdx];
1974         return Ty.getSizeInBits() % 16 != 0;
1975       },
1976       [=](const LegalityQuery &Query) {
1977         // Pick the next power of 2, or a multiple of 64 over 128.
1978         // Whichever is smaller.
1979         const LLT &Ty = Query.Types[BigTyIdx];
1980         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1981         if (NewSizeInBits >= 256) {
1982           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1983           if (RoundedTo < NewSizeInBits)
1984             NewSizeInBits = RoundedTo;
1985         }
1986         return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1987       })
1988       // Any vectors left are the wrong size. Scalarize them.
1989       .scalarize(0)
1990       .scalarize(1);
1991   }
1992
1993   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1994   // RegBankSelect.
1995   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1996     .legalFor({{S32}, {S64}});
1997
1998   if (ST.hasVOP3PInsts()) {
1999     SextInReg.lowerFor({{V2S16}})
2000       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2001       // get more vector shift opportunities, since we'll get those when
2002       // expanded.
2003       .clampMaxNumElementsStrict(0, S16, 2);
2004   } else if (ST.has16BitInsts()) {
2005     SextInReg.lowerFor({{S32}, {S64}, {S16}});
2006   } else {
2007     // Prefer to promote to s32 before lowering if we don't have 16-bit
2008     // shifts. This avoid a lot of intermediate truncate and extend operations.
2009     SextInReg.lowerFor({{S32}, {S64}});
2010   }
2011
2012   SextInReg
2013     .scalarize(0)
2014     .clampScalar(0, S32, S64)
2015     .lower();
2016
2017   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2018     .scalarize(0)
2019     .lower();
2020
2021   // TODO: Only Try to form v2s16 with legal packed instructions.
2022   getActionDefinitionsBuilder(G_FSHR)
2023     .legalFor({{S32, S32}})
2024     .lowerFor({{V2S16, V2S16}})
2025     .clampMaxNumElementsStrict(0, S16, 2)
2026     .scalarize(0)
2027     .lower();
2028
2029   if (ST.hasVOP3PInsts()) {
2030     getActionDefinitionsBuilder(G_FSHL)
2031       .lowerFor({{V2S16, V2S16}})
2032       .clampMaxNumElementsStrict(0, S16, 2)
2033       .scalarize(0)
2034       .lower();
2035   } else {
2036     getActionDefinitionsBuilder(G_FSHL)
2037       .scalarize(0)
2038       .lower();
2039   }
2040
2041   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2042     .legalFor({S64});
2043
2044   getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2045
2046   getActionDefinitionsBuilder(G_FENCE)
2047     .alwaysLegal();
2048
2049   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2050       .scalarize(0)
2051       .minScalar(0, S32)
2052       .lower();
2053
2054   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2055       .legalFor({{S32, S32}, {S64, S32}})
2056       .clampScalar(1, S32, S32)
2057       .clampScalar(0, S32, S64)
2058       .widenScalarToNextPow2(0)
2059       .scalarize(0);
2060
2061   getActionDefinitionsBuilder(
2062       {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2063        G_FCOPYSIGN,
2064
2065        G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2066        G_READ_REGISTER, G_WRITE_REGISTER,
2067
2068        G_SADDO, G_SSUBO})
2069       .lower();
2070
2071   if (ST.hasIEEEMinMax()) {
2072     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2073         .legalFor(FPTypesPK16)
2074         .clampMaxNumElements(0, S16, 2)
2075         .scalarize(0);
2076   } else {
2077     // TODO: Implement
2078     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2079   }
2080
2081   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2082       .lower();
2083
2084   getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2085
2086   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2087         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2088         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2089     .unsupported();
2090
2091   getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2092
2093   getLegacyLegalizerInfo().computeTables();
2094   verify(*ST.getInstrInfo());
2095 }
2096
2097 bool AMDGPULegalizerInfo::legalizeCustom(
2098     LegalizerHelper &Helper, MachineInstr &MI,
2099     LostDebugLocObserver &LocObserver) const {
2100   MachineIRBuilder &B = Helper.MIRBuilder;
2101   MachineRegisterInfo &MRI = *B.getMRI();
2102
2103   switch (MI.getOpcode()) {
2104   case TargetOpcode::G_ADDRSPACE_CAST:
2105     return legalizeAddrSpaceCast(MI, MRI, B);
2106   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2107     return legalizeFroundeven(MI, MRI, B);
2108   case TargetOpcode::G_FCEIL:
2109     return legalizeFceil(MI, MRI, B);
2110   case TargetOpcode::G_FREM:
2111     return legalizeFrem(MI, MRI, B);
2112   case TargetOpcode::G_INTRINSIC_TRUNC:
2113     return legalizeIntrinsicTrunc(MI, MRI, B);
2114   case TargetOpcode::G_SITOFP:
2115     return legalizeITOFP(MI, MRI, B, true);
2116   case TargetOpcode::G_UITOFP:
2117     return legalizeITOFP(MI, MRI, B, false);
2118   case TargetOpcode::G_FPTOSI:
2119     return legalizeFPTOI(MI, MRI, B, true);
2120   case TargetOpcode::G_FPTOUI:
2121     return legalizeFPTOI(MI, MRI, B, false);
2122   case TargetOpcode::G_FMINNUM:
2123   case TargetOpcode::G_FMAXNUM:
2124   case TargetOpcode::G_FMINNUM_IEEE:
2125   case TargetOpcode::G_FMAXNUM_IEEE:
2126     return legalizeMinNumMaxNum(Helper, MI);
2127   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2128     return legalizeExtractVectorElt(MI, MRI, B);
2129   case TargetOpcode::G_INSERT_VECTOR_ELT:
2130     return legalizeInsertVectorElt(MI, MRI, B);
2131   case TargetOpcode::G_FSIN:
2132   case TargetOpcode::G_FCOS:
2133     return legalizeSinCos(MI, MRI, B);
2134   case TargetOpcode::G_GLOBAL_VALUE:
2135     return legalizeGlobalValue(MI, MRI, B);
2136   case TargetOpcode::G_LOAD:
2137   case TargetOpcode::G_SEXTLOAD:
2138   case TargetOpcode::G_ZEXTLOAD:
2139     return legalizeLoad(Helper, MI);
2140   case TargetOpcode::G_STORE:
2141     return legalizeStore(Helper, MI);
2142   case TargetOpcode::G_FMAD:
2143     return legalizeFMad(MI, MRI, B);
2144   case TargetOpcode::G_FDIV:
2145     return legalizeFDIV(MI, MRI, B);
2146   case TargetOpcode::G_FFREXP:
2147     return legalizeFFREXP(MI, MRI, B);
2148   case TargetOpcode::G_FSQRT:
2149     return legalizeFSQRT(MI, MRI, B);
2150   case TargetOpcode::G_UDIV:
2151   case TargetOpcode::G_UREM:
2152   case TargetOpcode::G_UDIVREM:
2153     return legalizeUnsignedDIV_REM(MI, MRI, B);
2154   case TargetOpcode::G_SDIV:
2155   case TargetOpcode::G_SREM:
2156   case TargetOpcode::G_SDIVREM:
2157     return legalizeSignedDIV_REM(MI, MRI, B);
2158   case TargetOpcode::G_ATOMIC_CMPXCHG:
2159     return legalizeAtomicCmpXChg(MI, MRI, B);
2160   case TargetOpcode::G_FLOG2:
2161     return legalizeFlog2(MI, B);
2162   case TargetOpcode::G_FLOG:
2163   case TargetOpcode::G_FLOG10:
2164     return legalizeFlogCommon(MI, B);
2165   case TargetOpcode::G_FEXP2:
2166     return legalizeFExp2(MI, B);
2167   case TargetOpcode::G_FEXP:
2168   case TargetOpcode::G_FEXP10:
2169     return legalizeFExp(MI, B);
2170   case TargetOpcode::G_FPOW:
2171     return legalizeFPow(MI, B);
2172   case TargetOpcode::G_FFLOOR:
2173     return legalizeFFloor(MI, MRI, B);
2174   case TargetOpcode::G_BUILD_VECTOR:
2175   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2176     return legalizeBuildVector(MI, MRI, B);
2177   case TargetOpcode::G_MUL:
2178     return legalizeMul(Helper, MI);
2179   case TargetOpcode::G_CTLZ:
2180   case TargetOpcode::G_CTTZ:
2181     return legalizeCTLZ_CTTZ(MI, MRI, B);
2182   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2183     return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2184   case TargetOpcode::G_STACKSAVE:
2185     return legalizeStackSave(MI, B);
2186   case TargetOpcode::G_GET_FPENV:
2187     return legalizeGetFPEnv(MI, MRI, B);
2188   case TargetOpcode::G_SET_FPENV:
2189     return legalizeSetFPEnv(MI, MRI, B);
2190   case TargetOpcode::G_TRAP:
2191     return legalizeTrap(MI, MRI, B);
2192   case TargetOpcode::G_DEBUGTRAP:
2193     return legalizeDebugTrap(MI, MRI, B);
2194   default:
2195     return false;
2196   }
2197
2198   llvm_unreachable("expected switch to return");
2199 }
2200
2201 Register AMDGPULegalizerInfo::getSegmentAperture(
2202   unsigned AS,
2203   MachineRegisterInfo &MRI,
2204   MachineIRBuilder &B) const {
2205   MachineFunction &MF = B.getMF();
2206   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2207   const LLT S32 = LLT::scalar(32);
2208   const LLT S64 = LLT::scalar(64);
2209
2210   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2211
2212   if (ST.hasApertureRegs()) {
2213     // Note: this register is somewhat broken. When used as a 32-bit operand,
2214     // it only returns zeroes. The real value is in the upper 32 bits.
2215     // Thus, we must emit extract the high 32 bits.
2216     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2217                                        ? AMDGPU::SRC_SHARED_BASE
2218                                        : AMDGPU::SRC_PRIVATE_BASE;
2219     // FIXME: It would be more natural to emit a COPY here, but then copy
2220     // coalescing would kick in and it would think it's okay to use the "HI"
2221     // subregister (instead of extracting the HI 32 bits) which is an artificial
2222     // (unusable) register.
2223     //  Register TableGen definitions would need an overhaul to get rid of the
2224     //  artificial "HI" aperture registers and prevent this kind of issue from
2225     //  happening.
2226     Register Dst = MRI.createGenericVirtualRegister(S64);
2227     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2228     B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2229     return B.buildUnmerge(S32, Dst).getReg(1);
2230   }
2231
2232   // TODO: can we be smarter about machine pointer info?
2233   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2234   Register LoadAddr = MRI.createGenericVirtualRegister(
2235     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2236   // For code object version 5, private_base and shared_base are passed through
2237   // implicit kernargs.
2238   if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
2239       AMDGPU::AMDHSA_COV5) {
2240     AMDGPUTargetLowering::ImplicitParameter Param =
2241         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2242                                       : AMDGPUTargetLowering::PRIVATE_BASE;
2243     uint64_t Offset =
2244         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2245
2246     Register KernargPtrReg = MRI.createGenericVirtualRegister(
2247         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2248
2249     if (!loadInputValue(KernargPtrReg, B,
2250                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2251       return Register();
2252
2253     MachineMemOperand *MMO = MF.getMachineMemOperand(
2254         PtrInfo,
2255         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2256             MachineMemOperand::MOInvariant,
2257         LLT::scalar(32), commonAlignment(Align(64), Offset));
2258
2259     // Pointer address
2260     B.buildPtrAdd(LoadAddr, KernargPtrReg,
2261                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2262     // Load address
2263     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2264   }
2265
2266   Register QueuePtr = MRI.createGenericVirtualRegister(
2267     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2268
2269   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
2270     return Register();
2271
2272   // Offset into amd_queue_t for group_segment_aperture_base_hi /
2273   // private_segment_aperture_base_hi.
2274   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2275
2276   MachineMemOperand *MMO = MF.getMachineMemOperand(
2277       PtrInfo,
2278       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2279           MachineMemOperand::MOInvariant,
2280       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2281
2282   B.buildPtrAdd(LoadAddr, QueuePtr,
2283                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2284   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2285 }
2286
2287 /// Return true if the value is a known valid address, such that a null check is
2288 /// not necessary.
2289 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2290                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2291   MachineInstr *Def = MRI.getVRegDef(Val);
2292   switch (Def->getOpcode()) {
2293   case AMDGPU::G_FRAME_INDEX:
2294   case AMDGPU::G_GLOBAL_VALUE:
2295   case AMDGPU::G_BLOCK_ADDR:
2296     return true;
2297   case AMDGPU::G_CONSTANT: {
2298     const ConstantInt *CI = Def->getOperand(1).getCImm();
2299     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2300   }
2301   default:
2302     return false;
2303   }
2304
2305   return false;
2306 }
2307
2308 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2309   MachineInstr &MI, MachineRegisterInfo &MRI,
2310   MachineIRBuilder &B) const {
2311   MachineFunction &MF = B.getMF();
2312
2313   // MI can either be a G_ADDRSPACE_CAST or a
2314   // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2315   assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2316          (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2317                                      Intrinsic::amdgcn_addrspacecast_nonnull));
2318
2319   const LLT S32 = LLT::scalar(32);
2320   Register Dst = MI.getOperand(0).getReg();
2321   Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2322                                      : MI.getOperand(1).getReg();
2323   LLT DstTy = MRI.getType(Dst);
2324   LLT SrcTy = MRI.getType(Src);
2325   unsigned DestAS = DstTy.getAddressSpace();
2326   unsigned SrcAS = SrcTy.getAddressSpace();
2327
2328   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2329   // vector element.
2330   assert(!DstTy.isVector());
2331
2332   const AMDGPUTargetMachine &TM
2333     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2334
2335   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2336     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2337     return true;
2338   }
2339
2340   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2341       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2342        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2343     // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2344     // G_ADDRSPACE_CAST we need to guess.
2345     if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2346       // Extract low 32-bits of the pointer.
2347       B.buildExtract(Dst, Src, 0);
2348       MI.eraseFromParent();
2349       return true;
2350     }
2351
2352     unsigned NullVal = TM.getNullPointerValue(DestAS);
2353
2354     auto SegmentNull = B.buildConstant(DstTy, NullVal);
2355     auto FlatNull = B.buildConstant(SrcTy, 0);
2356
2357     // Extract low 32-bits of the pointer.
2358     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2359
2360     auto CmpRes =
2361         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2362     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2363
2364     MI.eraseFromParent();
2365     return true;
2366   }
2367
2368   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2369       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2370        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2371     auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2372       Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2373       if (!ApertureReg.isValid())
2374         return false;
2375
2376       // Coerce the type of the low half of the result so we can use
2377       // merge_values.
2378       Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2379
2380       // TODO: Should we allow mismatched types but matching sizes in merges to
2381       // avoid the ptrtoint?
2382       return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2383     };
2384
2385     // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2386     // G_ADDRSPACE_CAST we need to guess.
2387     if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2388       castLocalOrPrivateToFlat(Dst);
2389       MI.eraseFromParent();
2390       return true;
2391     }
2392
2393     Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2394
2395     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2396     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2397
2398     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2399                               SegmentNull.getReg(0));
2400
2401     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2402
2403     MI.eraseFromParent();
2404     return true;
2405   }
2406
2407   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2408       SrcTy.getSizeInBits() == 64) {
2409     // Truncate.
2410     B.buildExtract(Dst, Src, 0);
2411     MI.eraseFromParent();
2412     return true;
2413   }
2414
2415   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2416       DstTy.getSizeInBits() == 64) {
2417     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2418     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2419     auto PtrLo = B.buildPtrToInt(S32, Src);
2420     auto HighAddr = B.buildConstant(S32, AddrHiVal);
2421     B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2422     MI.eraseFromParent();
2423     return true;
2424   }
2425
2426   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2427       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2428
2429   LLVMContext &Ctx = MF.getFunction().getContext();
2430   Ctx.diagnose(InvalidAddrSpaceCast);
2431   B.buildUndef(Dst);
2432   MI.eraseFromParent();
2433   return true;
2434 }
2435
2436 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2437                                              MachineRegisterInfo &MRI,
2438                                              MachineIRBuilder &B) const {
2439   Register Src = MI.getOperand(1).getReg();
2440   LLT Ty = MRI.getType(Src);
2441   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2442
2443   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2444   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2445
2446   auto C1 = B.buildFConstant(Ty, C1Val);
2447   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2448
2449   // TODO: Should this propagate fast-math-flags?
2450   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2451   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2452
2453   auto C2 = B.buildFConstant(Ty, C2Val);
2454   auto Fabs = B.buildFAbs(Ty, Src);
2455
2456   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2457   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2458   MI.eraseFromParent();
2459   return true;
2460 }
2461
2462 bool AMDGPULegalizerInfo::legalizeFceil(
2463   MachineInstr &MI, MachineRegisterInfo &MRI,
2464   MachineIRBuilder &B) const {
2465
2466   const LLT S1 = LLT::scalar(1);
2467   const LLT S64 = LLT::scalar(64);
2468
2469   Register Src = MI.getOperand(1).getReg();
2470   assert(MRI.getType(Src) == S64);
2471
2472   // result = trunc(src)
2473   // if (src > 0.0 && src != result)
2474   //   result += 1.0
2475
2476   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2477
2478   const auto Zero = B.buildFConstant(S64, 0.0);
2479   const auto One = B.buildFConstant(S64, 1.0);
2480   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2481   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2482   auto And = B.buildAnd(S1, Lt0, NeTrunc);
2483   auto Add = B.buildSelect(S64, And, One, Zero);
2484
2485   // TODO: Should this propagate fast-math-flags?
2486   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2487   MI.eraseFromParent();
2488   return true;
2489 }
2490
2491 bool AMDGPULegalizerInfo::legalizeFrem(
2492   MachineInstr &MI, MachineRegisterInfo &MRI,
2493   MachineIRBuilder &B) const {
2494     Register DstReg = MI.getOperand(0).getReg();
2495     Register Src0Reg = MI.getOperand(1).getReg();
2496     Register Src1Reg = MI.getOperand(2).getReg();
2497     auto Flags = MI.getFlags();
2498     LLT Ty = MRI.getType(DstReg);
2499
2500     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2501     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2502     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2503     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2504     MI.eraseFromParent();
2505     return true;
2506 }
2507
2508 static MachineInstrBuilder extractF64Exponent(Register Hi,
2509                                               MachineIRBuilder &B) {
2510   const unsigned FractBits = 52;
2511   const unsigned ExpBits = 11;
2512   LLT S32 = LLT::scalar(32);
2513
2514   auto Const0 = B.buildConstant(S32, FractBits - 32);
2515   auto Const1 = B.buildConstant(S32, ExpBits);
2516
2517   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2518                      .addUse(Hi)
2519                      .addUse(Const0.getReg(0))
2520                      .addUse(Const1.getReg(0));
2521
2522   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2523 }
2524
2525 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2526   MachineInstr &MI, MachineRegisterInfo &MRI,
2527   MachineIRBuilder &B) const {
2528   const LLT S1 = LLT::scalar(1);
2529   const LLT S32 = LLT::scalar(32);
2530   const LLT S64 = LLT::scalar(64);
2531
2532   Register Src = MI.getOperand(1).getReg();
2533   assert(MRI.getType(Src) == S64);
2534
2535   // TODO: Should this use extract since the low half is unused?
2536   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2537   Register Hi = Unmerge.getReg(1);
2538
2539   // Extract the upper half, since this is where we will find the sign and
2540   // exponent.
2541   auto Exp = extractF64Exponent(Hi, B);
2542
2543   const unsigned FractBits = 52;
2544
2545   // Extract the sign bit.
2546   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2547   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2548
2549   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2550
2551   const auto Zero32 = B.buildConstant(S32, 0);
2552
2553   // Extend back to 64-bits.
2554   auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2555
2556   auto Shr = B.buildAShr(S64, FractMask, Exp);
2557   auto Not = B.buildNot(S64, Shr);
2558   auto Tmp0 = B.buildAnd(S64, Src, Not);
2559   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2560
2561   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2562   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2563
2564   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2565   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2566   MI.eraseFromParent();
2567   return true;
2568 }
2569
2570 bool AMDGPULegalizerInfo::legalizeITOFP(
2571   MachineInstr &MI, MachineRegisterInfo &MRI,
2572   MachineIRBuilder &B, bool Signed) const {
2573
2574   Register Dst = MI.getOperand(0).getReg();
2575   Register Src = MI.getOperand(1).getReg();
2576
2577   const LLT S64 = LLT::scalar(64);
2578   const LLT S32 = LLT::scalar(32);
2579
2580   assert(MRI.getType(Src) == S64);
2581
2582   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2583   auto ThirtyTwo = B.buildConstant(S32, 32);
2584
2585   if (MRI.getType(Dst) == S64) {
2586     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2587                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2588
2589     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2590     auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2591
2592     // TODO: Should this propagate fast-math-flags?
2593     B.buildFAdd(Dst, LdExp, CvtLo);
2594     MI.eraseFromParent();
2595     return true;
2596   }
2597
2598   assert(MRI.getType(Dst) == S32);
2599
2600   auto One = B.buildConstant(S32, 1);
2601
2602   MachineInstrBuilder ShAmt;
2603   if (Signed) {
2604     auto ThirtyOne = B.buildConstant(S32, 31);
2605     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2606     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2607     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2608     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2609                   .addUse(Unmerge.getReg(1));
2610     auto LS2 = B.buildSub(S32, LS, One);
2611     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2612   } else
2613     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2614   auto Norm = B.buildShl(S64, Src, ShAmt);
2615   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2616   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2617   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2618   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2619   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2620   B.buildFLdexp(Dst, FVal, Scale);
2621   MI.eraseFromParent();
2622   return true;
2623 }
2624
2625 // TODO: Copied from DAG implementation. Verify logic and document how this
2626 // actually works.
2627 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2628                                         MachineRegisterInfo &MRI,
2629                                         MachineIRBuilder &B,
2630                                         bool Signed) const {
2631
2632   Register Dst = MI.getOperand(0).getReg();
2633   Register Src = MI.getOperand(1).getReg();
2634
2635   const LLT S64 = LLT::scalar(64);
2636   const LLT S32 = LLT::scalar(32);
2637
2638   const LLT SrcLT = MRI.getType(Src);
2639   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2640
2641   unsigned Flags = MI.getFlags();
2642
2643   // The basic idea of converting a floating point number into a pair of 32-bit
2644   // integers is illustrated as follows:
2645   //
2646   //     tf := trunc(val);
2647   //    hif := floor(tf * 2^-32);
2648   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2649   //     hi := fptoi(hif);
2650   //     lo := fptoi(lof);
2651   //
2652   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2653   MachineInstrBuilder Sign;
2654   if (Signed && SrcLT == S32) {
2655     // However, a 32-bit floating point number has only 23 bits mantissa and
2656     // it's not enough to hold all the significant bits of `lof` if val is
2657     // negative. To avoid the loss of precision, We need to take the absolute
2658     // value after truncating and flip the result back based on the original
2659     // signedness.
2660     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2661     Trunc = B.buildFAbs(S32, Trunc, Flags);
2662   }
2663   MachineInstrBuilder K0, K1;
2664   if (SrcLT == S64) {
2665     K0 = B.buildFConstant(
2666         S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2667     K1 = B.buildFConstant(
2668         S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2669   } else {
2670     K0 = B.buildFConstant(
2671         S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2672     K1 = B.buildFConstant(
2673         S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2674   }
2675
2676   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2677   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2678   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2679
2680   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2681                                      : B.buildFPTOUI(S32, FloorMul);
2682   auto Lo = B.buildFPTOUI(S32, Fma);
2683
2684   if (Signed && SrcLT == S32) {
2685     // Flip the result based on the signedness, which is either all 0s or 1s.
2686     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2687     // r := xor({lo, hi}, sign) - sign;
2688     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2689                Sign);
2690   } else
2691     B.buildMergeLikeInstr(Dst, {Lo, Hi});
2692   MI.eraseFromParent();
2693
2694   return true;
2695 }
2696
2697 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2698                                                MachineInstr &MI) const {
2699   MachineFunction &MF = Helper.MIRBuilder.getMF();
2700   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2701
2702   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2703                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2704
2705   // With ieee_mode disabled, the instructions have the correct behavior
2706   // already for G_FMINNUM/G_FMAXNUM
2707   if (!MFI->getMode().IEEE)
2708     return !IsIEEEOp;
2709
2710   if (IsIEEEOp)
2711     return true;
2712
2713   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2714 }
2715
2716 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2717   MachineInstr &MI, MachineRegisterInfo &MRI,
2718   MachineIRBuilder &B) const {
2719   // TODO: Should move some of this into LegalizerHelper.
2720
2721   // TODO: Promote dynamic indexing of s16 to s32
2722
2723   Register Dst = MI.getOperand(0).getReg();
2724   Register Vec = MI.getOperand(1).getReg();
2725
2726   LLT VecTy = MRI.getType(Vec);
2727   LLT EltTy = VecTy.getElementType();
2728   assert(EltTy == MRI.getType(Dst));
2729
2730   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2731   // but we can't go directly to that logic becasue you can't bitcast a vector
2732   // of pointers to a vector of integers. Therefore, introduce an intermediate
2733   // vector of integers using ptrtoint (and inttoptr on the output) in order to
2734   // drive the legalization forward.
2735   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2736     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2737     LLT IntVecTy = VecTy.changeElementType(IntTy);
2738
2739     auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2740     auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2741     B.buildIntToPtr(Dst, IntElt);
2742
2743     MI.eraseFromParent();
2744     return true;
2745   }
2746
2747   // FIXME: Artifact combiner probably should have replaced the truncated
2748   // constant before this, so we shouldn't need
2749   // getIConstantVRegValWithLookThrough.
2750   std::optional<ValueAndVReg> MaybeIdxVal =
2751       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2752   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2753     return true;
2754   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2755
2756   if (IdxVal < VecTy.getNumElements()) {
2757     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2758     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2759   } else {
2760     B.buildUndef(Dst);
2761   }
2762
2763   MI.eraseFromParent();
2764   return true;
2765 }
2766
2767 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2768   MachineInstr &MI, MachineRegisterInfo &MRI,
2769   MachineIRBuilder &B) const {
2770   // TODO: Should move some of this into LegalizerHelper.
2771
2772   // TODO: Promote dynamic indexing of s16 to s32
2773
2774   Register Dst = MI.getOperand(0).getReg();
2775   Register Vec = MI.getOperand(1).getReg();
2776   Register Ins = MI.getOperand(2).getReg();
2777
2778   LLT VecTy = MRI.getType(Vec);
2779   LLT EltTy = VecTy.getElementType();
2780   assert(EltTy == MRI.getType(Ins));
2781
2782   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2783   // but we can't go directly to that logic becasue you can't bitcast a vector
2784   // of pointers to a vector of integers. Therefore, make the pointer vector
2785   // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2786   // new value, and then inttoptr the result vector back. This will then allow
2787   // the rest of legalization to take over.
2788   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2789     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2790     LLT IntVecTy = VecTy.changeElementType(IntTy);
2791
2792     auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2793     auto IntIns = B.buildPtrToInt(IntTy, Ins);
2794     auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2795                                                  MI.getOperand(3));
2796     B.buildIntToPtr(Dst, IntVecDest);
2797     MI.eraseFromParent();
2798     return true;
2799   }
2800
2801   // FIXME: Artifact combiner probably should have replaced the truncated
2802   // constant before this, so we shouldn't need
2803   // getIConstantVRegValWithLookThrough.
2804   std::optional<ValueAndVReg> MaybeIdxVal =
2805       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2806   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2807     return true;
2808
2809   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2810
2811   unsigned NumElts = VecTy.getNumElements();
2812   if (IdxVal < NumElts) {
2813     SmallVector<Register, 8> SrcRegs;
2814     for (unsigned i = 0; i < NumElts; ++i)
2815       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2816     B.buildUnmerge(SrcRegs, Vec);
2817
2818     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2819     B.buildMergeLikeInstr(Dst, SrcRegs);
2820   } else {
2821     B.buildUndef(Dst);
2822   }
2823
2824   MI.eraseFromParent();
2825   return true;
2826 }
2827
2828 bool AMDGPULegalizerInfo::legalizeSinCos(
2829   MachineInstr &MI, MachineRegisterInfo &MRI,
2830   MachineIRBuilder &B) const {
2831
2832   Register DstReg = MI.getOperand(0).getReg();
2833   Register SrcReg = MI.getOperand(1).getReg();
2834   LLT Ty = MRI.getType(DstReg);
2835   unsigned Flags = MI.getFlags();
2836
2837   Register TrigVal;
2838   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2839   if (ST.hasTrigReducedRange()) {
2840     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2841     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2842                   .addUse(MulVal.getReg(0))
2843                   .setMIFlags(Flags)
2844                   .getReg(0);
2845   } else
2846     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2847
2848   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2849     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2850   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2851       .addUse(TrigVal)
2852       .setMIFlags(Flags);
2853   MI.eraseFromParent();
2854   return true;
2855 }
2856
2857 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2858                                                   MachineIRBuilder &B,
2859                                                   const GlobalValue *GV,
2860                                                   int64_t Offset,
2861                                                   unsigned GAFlags) const {
2862   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2863   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2864   // to the following code sequence:
2865   //
2866   // For constant address space:
2867   //   s_getpc_b64 s[0:1]
2868   //   s_add_u32 s0, s0, $symbol
2869   //   s_addc_u32 s1, s1, 0
2870   //
2871   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2872   //   a fixup or relocation is emitted to replace $symbol with a literal
2873   //   constant, which is a pc-relative offset from the encoding of the $symbol
2874   //   operand to the global variable.
2875   //
2876   // For global address space:
2877   //   s_getpc_b64 s[0:1]
2878   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2879   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2880   //
2881   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2882   //   fixups or relocations are emitted to replace $symbol@*@lo and
2883   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2884   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2885   //   operand to the global variable.
2886
2887   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2888
2889   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2890     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2891
2892   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2893     .addDef(PCReg);
2894
2895   MIB.addGlobalAddress(GV, Offset, GAFlags);
2896   if (GAFlags == SIInstrInfo::MO_NONE)
2897     MIB.addImm(0);
2898   else
2899     MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2900
2901   if (!B.getMRI()->getRegClassOrNull(PCReg))
2902     B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2903
2904   if (PtrTy.getSizeInBits() == 32)
2905     B.buildExtract(DstReg, PCReg, 0);
2906   return true;
2907 }
2908
2909 // Emit a ABS32_LO / ABS32_HI relocation stub.
2910 void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2911     Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2912     MachineRegisterInfo &MRI) const {
2913   bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2914
2915   LLT S32 = LLT::scalar(32);
2916
2917   // Use the destination directly, if and only if we store the lower address
2918   // part only and we don't have a register class being set.
2919   Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2920                         ? DstReg
2921                         : MRI.createGenericVirtualRegister(S32);
2922
2923   if (!MRI.getRegClassOrNull(AddrLo))
2924     MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2925
2926   // Write the lower half.
2927   B.buildInstr(AMDGPU::S_MOV_B32)
2928       .addDef(AddrLo)
2929       .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2930
2931   // If required, write the upper half as well.
2932   if (RequiresHighHalf) {
2933     assert(PtrTy.getSizeInBits() == 64 &&
2934            "Must provide a 64-bit pointer type!");
2935
2936     Register AddrHi = MRI.createGenericVirtualRegister(S32);
2937     MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2938
2939     B.buildInstr(AMDGPU::S_MOV_B32)
2940         .addDef(AddrHi)
2941         .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2942
2943     // Use the destination directly, if and only if we don't have a register
2944     // class being set.
2945     Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2946                            ? DstReg
2947                            : MRI.createGenericVirtualRegister(LLT::scalar(64));
2948
2949     if (!MRI.getRegClassOrNull(AddrDst))
2950       MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2951
2952     B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2953
2954     // If we created a new register for the destination, cast the result into
2955     // the final output.
2956     if (AddrDst != DstReg)
2957       B.buildCast(DstReg, AddrDst);
2958   } else if (AddrLo != DstReg) {
2959     // If we created a new register for the destination, cast the result into
2960     // the final output.
2961     B.buildCast(DstReg, AddrLo);
2962   }
2963 }
2964
2965 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2966   MachineInstr &MI, MachineRegisterInfo &MRI,
2967   MachineIRBuilder &B) const {
2968   Register DstReg = MI.getOperand(0).getReg();
2969   LLT Ty = MRI.getType(DstReg);
2970   unsigned AS = Ty.getAddressSpace();
2971
2972   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2973   MachineFunction &MF = B.getMF();
2974   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2975
2976   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2977     if (!MFI->isModuleEntryFunction() &&
2978         GV->getName() != "llvm.amdgcn.module.lds" &&
2979         !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
2980       const Function &Fn = MF.getFunction();
2981       DiagnosticInfoUnsupported BadLDSDecl(
2982         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2983         DS_Warning);
2984       Fn.getContext().diagnose(BadLDSDecl);
2985
2986       // We currently don't have a way to correctly allocate LDS objects that
2987       // aren't directly associated with a kernel. We do force inlining of
2988       // functions that use local objects. However, if these dead functions are
2989       // not eliminated, we don't want a compile time error. Just emit a warning
2990       // and a trap, since there should be no callable path here.
2991       B.buildTrap();
2992       B.buildUndef(DstReg);
2993       MI.eraseFromParent();
2994       return true;
2995     }
2996
2997     // TODO: We could emit code to handle the initialization somewhere.
2998     // We ignore the initializer for now and legalize it to allow selection.
2999     // The initializer will anyway get errored out during assembly emission.
3000     const SITargetLowering *TLI = ST.getTargetLowering();
3001     if (!TLI->shouldUseLDSConstAddress(GV)) {
3002       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3003       return true; // Leave in place;
3004     }
3005
3006     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3007       Type *Ty = GV->getValueType();
3008       // HIP uses an unsized array `extern __shared__ T s[]` or similar
3009       // zero-sized type in other languages to declare the dynamic shared
3010       // memory which size is not known at the compile time. They will be
3011       // allocated by the runtime and placed directly after the static
3012       // allocated ones. They all share the same offset.
3013       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3014         // Adjust alignment for that dynamic shared memory array.
3015         MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3016         LLT S32 = LLT::scalar(32);
3017         auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3018         B.buildIntToPtr(DstReg, Sz);
3019         MI.eraseFromParent();
3020         return true;
3021       }
3022     }
3023
3024     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3025                                                    *cast<GlobalVariable>(GV)));
3026     MI.eraseFromParent();
3027     return true;
3028   }
3029
3030   if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3031     buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3032     MI.eraseFromParent();
3033     return true;
3034   }
3035
3036   const SITargetLowering *TLI = ST.getTargetLowering();
3037
3038   if (TLI->shouldEmitFixup(GV)) {
3039     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3040     MI.eraseFromParent();
3041     return true;
3042   }
3043
3044   if (TLI->shouldEmitPCReloc(GV)) {
3045     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3046     MI.eraseFromParent();
3047     return true;
3048   }
3049
3050   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3051   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3052
3053   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3054   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
3055       MachinePointerInfo::getGOT(MF),
3056       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3057           MachineMemOperand::MOInvariant,
3058       LoadTy, Align(8));
3059
3060   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3061
3062   if (Ty.getSizeInBits() == 32) {
3063     // Truncate if this is a 32-bit constant address.
3064     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3065     B.buildExtract(DstReg, Load, 0);
3066   } else
3067     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3068
3069   MI.eraseFromParent();
3070   return true;
3071 }
3072
3073 static LLT widenToNextPowerOf2(LLT Ty) {
3074   if (Ty.isVector())
3075     return Ty.changeElementCount(
3076         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3077   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3078 }
3079
3080 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3081                                        MachineInstr &MI) const {
3082   MachineIRBuilder &B = Helper.MIRBuilder;
3083   MachineRegisterInfo &MRI = *B.getMRI();
3084   GISelChangeObserver &Observer = Helper.Observer;
3085
3086   Register PtrReg = MI.getOperand(1).getReg();
3087   LLT PtrTy = MRI.getType(PtrReg);
3088   unsigned AddrSpace = PtrTy.getAddressSpace();
3089
3090   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3091     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3092     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3093     Observer.changingInstr(MI);
3094     MI.getOperand(1).setReg(Cast.getReg(0));
3095     Observer.changedInstr(MI);
3096     return true;
3097   }
3098
3099   if (MI.getOpcode() != AMDGPU::G_LOAD)
3100     return false;
3101
3102   Register ValReg = MI.getOperand(0).getReg();
3103   LLT ValTy = MRI.getType(ValReg);
3104
3105   if (hasBufferRsrcWorkaround(ValTy)) {
3106     Observer.changingInstr(MI);
3107     castBufferRsrcFromV4I32(MI, B, MRI, 0);
3108     Observer.changedInstr(MI);
3109     return true;
3110   }
3111
3112   MachineMemOperand *MMO = *MI.memoperands_begin();
3113   const unsigned ValSize = ValTy.getSizeInBits();
3114   const LLT MemTy = MMO->getMemoryType();
3115   const Align MemAlign = MMO->getAlign();
3116   const unsigned MemSize = MemTy.getSizeInBits();
3117   const uint64_t AlignInBits = 8 * MemAlign.value();
3118
3119   // Widen non-power-of-2 loads to the alignment if needed
3120   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3121     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3122
3123     // This was already the correct extending load result type, so just adjust
3124     // the memory type.
3125     if (WideMemSize == ValSize) {
3126       MachineFunction &MF = B.getMF();
3127
3128       MachineMemOperand *WideMMO =
3129           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3130       Observer.changingInstr(MI);
3131       MI.setMemRefs(MF, {WideMMO});
3132       Observer.changedInstr(MI);
3133       return true;
3134     }
3135
3136     // Don't bother handling edge case that should probably never be produced.
3137     if (ValSize > WideMemSize)
3138       return false;
3139
3140     LLT WideTy = widenToNextPowerOf2(ValTy);
3141
3142     Register WideLoad;
3143     if (!WideTy.isVector()) {
3144       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3145       B.buildTrunc(ValReg, WideLoad).getReg(0);
3146     } else {
3147       // Extract the subvector.
3148
3149       if (isRegisterType(ValTy)) {
3150         // If this a case where G_EXTRACT is legal, use it.
3151         // (e.g. <3 x s32> -> <4 x s32>)
3152         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3153         B.buildExtract(ValReg, WideLoad, 0);
3154       } else {
3155         // For cases where the widened type isn't a nice register value, unmerge
3156         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3157         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3158         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3159       }
3160     }
3161
3162     MI.eraseFromParent();
3163     return true;
3164   }
3165
3166   return false;
3167 }
3168
3169 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3170                                         MachineInstr &MI) const {
3171   MachineIRBuilder &B = Helper.MIRBuilder;
3172   MachineRegisterInfo &MRI = *B.getMRI();
3173   GISelChangeObserver &Observer = Helper.Observer;
3174
3175   Register DataReg = MI.getOperand(0).getReg();
3176   LLT DataTy = MRI.getType(DataReg);
3177
3178   if (hasBufferRsrcWorkaround(DataTy)) {
3179     Observer.changingInstr(MI);
3180     castBufferRsrcArgToV4I32(MI, B, 0);
3181     Observer.changedInstr(MI);
3182     return true;
3183   }
3184   return false;
3185 }
3186
3187 bool AMDGPULegalizerInfo::legalizeFMad(
3188   MachineInstr &MI, MachineRegisterInfo &MRI,
3189   MachineIRBuilder &B) const {
3190   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3191   assert(Ty.isScalar());
3192
3193   MachineFunction &MF = B.getMF();
3194   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3195
3196   // TODO: Always legal with future ftz flag.
3197   // FIXME: Do we need just output?
3198   if (Ty == LLT::float32() &&
3199       MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3200     return true;
3201   if (Ty == LLT::float16() &&
3202       MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3203     return true;
3204
3205   MachineIRBuilder HelperBuilder(MI);
3206   GISelObserverWrapper DummyObserver;
3207   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3208   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3209 }
3210
3211 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3212   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3213   Register DstReg = MI.getOperand(0).getReg();
3214   Register PtrReg = MI.getOperand(1).getReg();
3215   Register CmpVal = MI.getOperand(2).getReg();
3216   Register NewVal = MI.getOperand(3).getReg();
3217
3218   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3219          "this should not have been custom lowered");
3220
3221   LLT ValTy = MRI.getType(CmpVal);
3222   LLT VecTy = LLT::fixed_vector(2, ValTy);
3223
3224   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3225
3226   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3227     .addDef(DstReg)
3228     .addUse(PtrReg)
3229     .addUse(PackedVal)
3230     .setMemRefs(MI.memoperands());
3231
3232   MI.eraseFromParent();
3233   return true;
3234 }
3235
3236 /// Return true if it's known that \p Src can never be an f32 denormal value.
3237 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3238                                        Register Src) {
3239   const MachineInstr *DefMI = MRI.getVRegDef(Src);
3240   switch (DefMI->getOpcode()) {
3241   case TargetOpcode::G_INTRINSIC: {
3242     switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3243     case Intrinsic::amdgcn_frexp_mant:
3244       return true;
3245     default:
3246       break;
3247     }
3248
3249     break;
3250   }
3251   case TargetOpcode::G_FFREXP: {
3252     if (DefMI->getOperand(0).getReg() == Src)
3253       return true;
3254     break;
3255   }
3256   case TargetOpcode::G_FPEXT: {
3257     return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3258   }
3259   default:
3260     return false;
3261   }
3262
3263   return false;
3264 }
3265
3266 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3267   if (Flags & MachineInstr::FmAfn)
3268     return true;
3269   const auto &Options = MF.getTarget().Options;
3270   return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3271 }
3272
3273 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3274                                    unsigned Flags) {
3275   return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3276          MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3277              DenormalMode::PreserveSign;
3278 }
3279
3280 std::pair<Register, Register>
3281 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3282                                        unsigned Flags) const {
3283   if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3284     return {};
3285
3286   const LLT F32 = LLT::scalar(32);
3287   auto SmallestNormal = B.buildFConstant(
3288       F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3289   auto IsLtSmallestNormal =
3290       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3291
3292   auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3293   auto One = B.buildFConstant(F32, 1.0);
3294   auto ScaleFactor =
3295       B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3296   auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3297
3298   return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3299 }
3300
3301 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3302                                         MachineIRBuilder &B) const {
3303   // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3304   // If we have to handle denormals, scale up the input and adjust the result.
3305
3306   // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3307   // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3308
3309   Register Dst = MI.getOperand(0).getReg();
3310   Register Src = MI.getOperand(1).getReg();
3311   LLT Ty = B.getMRI()->getType(Dst);
3312   unsigned Flags = MI.getFlags();
3313
3314   if (Ty == LLT::scalar(16)) {
3315     const LLT F32 = LLT::scalar(32);
3316     // Nothing in half is a denormal when promoted to f32.
3317     auto Ext = B.buildFPExt(F32, Src, Flags);
3318     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3319                     .addUse(Ext.getReg(0))
3320                     .setMIFlags(Flags);
3321     B.buildFPTrunc(Dst, Log2, Flags);
3322     MI.eraseFromParent();
3323     return true;
3324   }
3325
3326   assert(Ty == LLT::scalar(32));
3327
3328   auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3329   if (!ScaledInput) {
3330     B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3331         .addUse(Src)
3332         .setMIFlags(Flags);
3333     MI.eraseFromParent();
3334     return true;
3335   }
3336
3337   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3338                   .addUse(ScaledInput)
3339                   .setMIFlags(Flags);
3340
3341   auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3342   auto Zero = B.buildFConstant(Ty, 0.0);
3343   auto ResultOffset =
3344       B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3345   B.buildFSub(Dst, Log2, ResultOffset, Flags);
3346
3347   MI.eraseFromParent();
3348   return true;
3349 }
3350
3351 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3352                        Register Z, unsigned Flags) {
3353   auto FMul = B.buildFMul(Ty, X, Y, Flags);
3354   return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3355 }
3356
3357 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3358                                              MachineIRBuilder &B) const {
3359   const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3360   assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3361
3362   MachineRegisterInfo &MRI = *B.getMRI();
3363   Register Dst = MI.getOperand(0).getReg();
3364   Register X = MI.getOperand(1).getReg();
3365   unsigned Flags = MI.getFlags();
3366   const LLT Ty = MRI.getType(X);
3367   MachineFunction &MF = B.getMF();
3368
3369   const LLT F32 = LLT::scalar(32);
3370   const LLT F16 = LLT::scalar(16);
3371
3372   const AMDGPUTargetMachine &TM =
3373       static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3374
3375   if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3376       TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3377     if (Ty == F16 && !ST.has16BitInsts()) {
3378       Register LogVal = MRI.createGenericVirtualRegister(F32);
3379       auto PromoteSrc = B.buildFPExt(F32, X);
3380       legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3381       B.buildFPTrunc(Dst, LogVal);
3382     } else {
3383       legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3384     }
3385
3386     MI.eraseFromParent();
3387     return true;
3388   }
3389
3390   auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3391   if (ScaledInput)
3392     X = ScaledInput;
3393
3394   auto Y =
3395       B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3396
3397   Register R;
3398   if (ST.hasFastFMAF32()) {
3399     // c+cc are ln(2)/ln(10) to more than 49 bits
3400     const float c_log10 = 0x1.344134p-2f;
3401     const float cc_log10 = 0x1.09f79ep-26f;
3402
3403     // c + cc is ln(2) to more than 49 bits
3404     const float c_log = 0x1.62e42ep-1f;
3405     const float cc_log = 0x1.efa39ep-25f;
3406
3407     auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3408     auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3409
3410     R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3411     auto NegR = B.buildFNeg(Ty, R, Flags);
3412     auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3413     auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3414     R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3415   } else {
3416     // ch+ct is ln(2)/ln(10) to more than 36 bits
3417     const float ch_log10 = 0x1.344000p-2f;
3418     const float ct_log10 = 0x1.3509f6p-18f;
3419
3420     // ch + ct is ln(2) to more than 36 bits
3421     const float ch_log = 0x1.62e000p-1f;
3422     const float ct_log = 0x1.0bfbe8p-15f;
3423
3424     auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3425     auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3426
3427     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3428     auto YH = B.buildAnd(Ty, Y, MaskConst);
3429     auto YT = B.buildFSub(Ty, Y, YH, Flags);
3430     auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3431
3432     Register Mad0 =
3433         getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3434     Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3435     R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3436   }
3437
3438   const bool IsFiniteOnly =
3439       (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3440       (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3441
3442   if (!IsFiniteOnly) {
3443     // Expand isfinite(x) => fabs(x) < inf
3444     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3445     auto Fabs = B.buildFAbs(Ty, Y);
3446     auto IsFinite =
3447         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3448     R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3449   }
3450
3451   if (ScaledInput) {
3452     auto Zero = B.buildFConstant(Ty, 0.0);
3453     auto ShiftK =
3454         B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3455     auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3456     B.buildFSub(Dst, R, Shift, Flags);
3457   } else {
3458     B.buildCopy(Dst, R);
3459   }
3460
3461   MI.eraseFromParent();
3462   return true;
3463 }
3464
3465 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3466                                              Register Src, bool IsLog10,
3467                                              unsigned Flags) const {
3468   const double Log2BaseInverted =
3469       IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3470
3471   LLT Ty = B.getMRI()->getType(Dst);
3472
3473   if (Ty == LLT::scalar(32)) {
3474     auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3475     if (ScaledInput) {
3476       auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3477                         .addUse(Src)
3478                         .setMIFlags(Flags);
3479       auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3480       auto Zero = B.buildFConstant(Ty, 0.0);
3481       auto ResultOffset =
3482           B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3483       auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3484
3485       if (ST.hasFastFMAF32())
3486         B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3487       else {
3488         auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3489         B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3490       }
3491
3492       return true;
3493     }
3494   }
3495
3496   auto Log2Operand = Ty == LLT::scalar(16)
3497                          ? B.buildFLog2(Ty, Src, Flags)
3498                          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3499                                .addUse(Src)
3500                                .setMIFlags(Flags);
3501   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3502   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3503   return true;
3504 }
3505
3506 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3507                                         MachineIRBuilder &B) const {
3508   // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3509   // If we have to handle denormals, scale up the input and adjust the result.
3510
3511   Register Dst = MI.getOperand(0).getReg();
3512   Register Src = MI.getOperand(1).getReg();
3513   unsigned Flags = MI.getFlags();
3514   LLT Ty = B.getMRI()->getType(Dst);
3515   const LLT F16 = LLT::scalar(16);
3516   const LLT F32 = LLT::scalar(32);
3517
3518   if (Ty == F16) {
3519     // Nothing in half is a denormal when promoted to f32.
3520     auto Ext = B.buildFPExt(F32, Src, Flags);
3521     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3522                     .addUse(Ext.getReg(0))
3523                     .setMIFlags(Flags);
3524     B.buildFPTrunc(Dst, Log2, Flags);
3525     MI.eraseFromParent();
3526     return true;
3527   }
3528
3529   assert(Ty == F32);
3530
3531   if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3532     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3533         .addUse(Src)
3534         .setMIFlags(Flags);
3535     MI.eraseFromParent();
3536     return true;
3537   }
3538
3539   // bool needs_scaling = x < -0x1.f80000p+6f;
3540   // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3541
3542   // -nextafter(128.0, -1)
3543   auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3544   auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3545                                   RangeCheckConst, Flags);
3546
3547   auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3548   auto Zero = B.buildFConstant(Ty, 0.0);
3549   auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3550   auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3551
3552   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3553                   .addUse(AddInput.getReg(0))
3554                   .setMIFlags(Flags);
3555
3556   auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3557   auto One = B.buildFConstant(Ty, 1.0);
3558   auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3559   B.buildFMul(Dst, Exp2, ResultScale, Flags);
3560   MI.eraseFromParent();
3561   return true;
3562 }
3563
3564 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3565                                              Register X, unsigned Flags) const {
3566   LLT Ty = B.getMRI()->getType(Dst);
3567   LLT F32 = LLT::scalar(32);
3568
3569   if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3570     auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3571     auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3572
3573     if (Ty == F32) {
3574       B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3575         .addUse(Mul.getReg(0))
3576         .setMIFlags(Flags);
3577     } else {
3578       B.buildFExp2(Dst, Mul.getReg(0), Flags);
3579     }
3580
3581     return true;
3582   }
3583
3584   auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3585   auto NeedsScaling =
3586       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3587   auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3588   auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3589   auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3590
3591   auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3592   auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3593
3594   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3595     .addUse(ExpInput.getReg(0))
3596     .setMIFlags(Flags);
3597
3598   auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3599   auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3600   B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3601   return true;
3602 }
3603
3604 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3605                                        MachineIRBuilder &B) const {
3606   Register Dst = MI.getOperand(0).getReg();
3607   Register X = MI.getOperand(1).getReg();
3608   const unsigned Flags = MI.getFlags();
3609   MachineFunction &MF = B.getMF();
3610   MachineRegisterInfo &MRI = *B.getMRI();
3611   LLT Ty = MRI.getType(Dst);
3612   const LLT F16 = LLT::scalar(16);
3613   const LLT F32 = LLT::scalar(32);
3614   const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3615
3616   if (Ty == F16) {
3617     // v_exp_f16 (fmul x, log2e)
3618     if (allowApproxFunc(MF, Flags)) {
3619       // TODO: Does this really require fast?
3620       legalizeFExpUnsafe(B, Dst, X, Flags);
3621       MI.eraseFromParent();
3622       return true;
3623     }
3624
3625     // exp(f16 x) ->
3626     //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3627
3628     // Nothing in half is a denormal when promoted to f32.
3629     auto Ext = B.buildFPExt(F32, X, Flags);
3630     Register Lowered = MRI.createGenericVirtualRegister(F32);
3631     legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3632     B.buildFPTrunc(Dst, Lowered, Flags);
3633     MI.eraseFromParent();
3634     return true;
3635   }
3636
3637   assert(Ty == F32);
3638
3639   // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3640   // library behavior. Also, is known-not-daz source sufficient?
3641   if (allowApproxFunc(MF, Flags)) {
3642     legalizeFExpUnsafe(B, Dst, X, Flags);
3643     MI.eraseFromParent();
3644     return true;
3645   }
3646
3647   //    Algorithm:
3648   //
3649   //    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3650   //
3651   //    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3652   //    n = 64*m + j,   0 <= j < 64
3653   //
3654   //    e^x = 2^((64*m + j + f)/64)
3655   //        = (2^m) * (2^(j/64)) * 2^(f/64)
3656   //        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3657   //
3658   //    f = x*(64/ln(2)) - n
3659   //    r = f*(ln(2)/64) = x - n*(ln(2)/64)
3660   //
3661   //    e^x = (2^m) * (2^(j/64)) * e^r
3662   //
3663   //    (2^(j/64)) is precomputed
3664   //
3665   //    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3666   //    e^r = 1 + q
3667   //
3668   //    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3669   //
3670   //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3671   const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3672   Register PH, PL;
3673
3674   if (ST.hasFastFMAF32()) {
3675     const float c_exp = numbers::log2ef;
3676     const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3677     const float c_exp10 = 0x1.a934f0p+1f;
3678     const float cc_exp10 = 0x1.2f346ep-24f;
3679
3680     auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3681     PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3682     auto NegPH = B.buildFNeg(Ty, PH, Flags);
3683     auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3684
3685     auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3686     PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3687   } else {
3688     const float ch_exp = 0x1.714000p+0f;
3689     const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3690
3691     const float ch_exp10 = 0x1.a92000p+1f;
3692     const float cl_exp10 = 0x1.4f0978p-11f;
3693
3694     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3695     auto XH = B.buildAnd(Ty, X, MaskConst);
3696     auto XL = B.buildFSub(Ty, X, XH, Flags);
3697
3698     auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3699     PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3700
3701     auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3702     auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3703
3704     Register Mad0 =
3705         getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3706     PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3707   }
3708
3709   auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3710
3711   // It is unsafe to contract this fsub into the PH multiply.
3712   auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3713   auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3714   auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3715
3716   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3717                   .addUse(A.getReg(0))
3718                   .setMIFlags(Flags);
3719   auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3720
3721   auto UnderflowCheckConst =
3722       B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3723   auto Zero = B.buildFConstant(Ty, 0.0);
3724   auto Underflow =
3725       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3726
3727   R = B.buildSelect(Ty, Underflow, Zero, R);
3728
3729   const auto &Options = MF.getTarget().Options;
3730
3731   if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3732     auto OverflowCheckConst =
3733         B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3734
3735     auto Overflow =
3736         B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3737     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3738     R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3739   }
3740
3741   B.buildCopy(Dst, R);
3742   MI.eraseFromParent();
3743   return true;
3744 }
3745
3746 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3747                                        MachineIRBuilder &B) const {
3748   Register Dst = MI.getOperand(0).getReg();
3749   Register Src0 = MI.getOperand(1).getReg();
3750   Register Src1 = MI.getOperand(2).getReg();
3751   unsigned Flags = MI.getFlags();
3752   LLT Ty = B.getMRI()->getType(Dst);
3753   const LLT F16 = LLT::float16();
3754   const LLT F32 = LLT::float32();
3755
3756   if (Ty == F32) {
3757     auto Log = B.buildFLog2(F32, Src0, Flags);
3758     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3759                    .addUse(Log.getReg(0))
3760                    .addUse(Src1)
3761                    .setMIFlags(Flags);
3762     B.buildFExp2(Dst, Mul, Flags);
3763   } else if (Ty == F16) {
3764     // There's no f16 fmul_legacy, so we need to convert for it.
3765     auto Log = B.buildFLog2(F16, Src0, Flags);
3766     auto Ext0 = B.buildFPExt(F32, Log, Flags);
3767     auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3768     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3769                    .addUse(Ext0.getReg(0))
3770                    .addUse(Ext1.getReg(0))
3771                    .setMIFlags(Flags);
3772     B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3773   } else
3774     return false;
3775
3776   MI.eraseFromParent();
3777   return true;
3778 }
3779
3780 // Find a source register, ignoring any possible source modifiers.
3781 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3782   Register ModSrc = OrigSrc;
3783   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3784     ModSrc = SrcFNeg->getOperand(1).getReg();
3785     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3786       ModSrc = SrcFAbs->getOperand(1).getReg();
3787   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3788     ModSrc = SrcFAbs->getOperand(1).getReg();
3789   return ModSrc;
3790 }
3791
3792 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3793                                          MachineRegisterInfo &MRI,
3794                                          MachineIRBuilder &B) const {
3795
3796   const LLT S1 = LLT::scalar(1);
3797   const LLT F64 = LLT::float64();
3798   Register Dst = MI.getOperand(0).getReg();
3799   Register OrigSrc = MI.getOperand(1).getReg();
3800   unsigned Flags = MI.getFlags();
3801   assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3802          "this should not have been custom lowered");
3803
3804   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3805   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3806   // efficient way to implement it is using V_FRACT_F64. The workaround for the
3807   // V_FRACT bug is:
3808   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3809   //
3810   // Convert floor(x) to (x - fract(x))
3811
3812   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3813                    .addUse(OrigSrc)
3814                    .setMIFlags(Flags);
3815
3816   // Give source modifier matching some assistance before obscuring a foldable
3817   // pattern.
3818
3819   // TODO: We can avoid the neg on the fract? The input sign to fract
3820   // shouldn't matter?
3821   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3822
3823   auto Const =
3824       B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3825
3826   Register Min = MRI.createGenericVirtualRegister(F64);
3827
3828   // We don't need to concern ourselves with the snan handling difference, so
3829   // use the one which will directly select.
3830   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3831   if (MFI->getMode().IEEE)
3832     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3833   else
3834     B.buildFMinNum(Min, Fract, Const, Flags);
3835
3836   Register CorrectedFract = Min;
3837   if (!MI.getFlag(MachineInstr::FmNoNans)) {
3838     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3839     CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3840   }
3841
3842   auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3843   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3844
3845   MI.eraseFromParent();
3846   return true;
3847 }
3848
3849 // Turn an illegal packed v2s16 build vector into bit operations.
3850 // TODO: This should probably be a bitcast action in LegalizerHelper.
3851 bool AMDGPULegalizerInfo::legalizeBuildVector(
3852   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3853   Register Dst = MI.getOperand(0).getReg();
3854   const LLT S32 = LLT::scalar(32);
3855   const LLT S16 = LLT::scalar(16);
3856   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3857
3858   Register Src0 = MI.getOperand(1).getReg();
3859   Register Src1 = MI.getOperand(2).getReg();
3860
3861   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3862     assert(MRI.getType(Src0) == S32);
3863     Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3864     Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3865   }
3866
3867   auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3868   B.buildBitcast(Dst, Merge);
3869
3870   MI.eraseFromParent();
3871   return true;
3872 }
3873
3874 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3875 //
3876 // Source and accumulation registers must all be 32-bits.
3877 //
3878 // TODO: When the multiply is uniform, we should produce a code sequence
3879 // that is better suited to instruction selection on the SALU. Instead of
3880 // the outer loop going over parts of the result, the outer loop should go
3881 // over parts of one of the factors. This should result in instruction
3882 // selection that makes full use of S_ADDC_U32 instructions.
3883 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3884                                         MutableArrayRef<Register> Accum,
3885                                         ArrayRef<Register> Src0,
3886                                         ArrayRef<Register> Src1,
3887                                         bool UsePartialMad64_32,
3888                                         bool SeparateOddAlignedProducts) const {
3889   // Use (possibly empty) vectors of S1 registers to represent the set of
3890   // carries from one pair of positions to the next.
3891   using Carry = SmallVector<Register, 2>;
3892
3893   MachineIRBuilder &B = Helper.MIRBuilder;
3894   GISelKnownBits &KB = *Helper.getKnownBits();
3895
3896   const LLT S1 = LLT::scalar(1);
3897   const LLT S32 = LLT::scalar(32);
3898   const LLT S64 = LLT::scalar(64);
3899
3900   Register Zero32;
3901   Register Zero64;
3902
3903   auto getZero32 = [&]() -> Register {
3904     if (!Zero32)
3905       Zero32 = B.buildConstant(S32, 0).getReg(0);
3906     return Zero32;
3907   };
3908   auto getZero64 = [&]() -> Register {
3909     if (!Zero64)
3910       Zero64 = B.buildConstant(S64, 0).getReg(0);
3911     return Zero64;
3912   };
3913
3914   SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3915   for (unsigned i = 0; i < Src0.size(); ++i) {
3916     Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3917     Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3918   }
3919
3920   // Merge the given carries into the 32-bit LocalAccum, which is modified
3921   // in-place.
3922   //
3923   // Returns the carry-out, which is a single S1 register or null.
3924   auto mergeCarry =
3925       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3926         if (CarryIn.empty())
3927           return Register();
3928
3929         bool HaveCarryOut = true;
3930         Register CarryAccum;
3931         if (CarryIn.size() == 1) {
3932           if (!LocalAccum) {
3933             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3934             return Register();
3935           }
3936
3937           CarryAccum = getZero32();
3938         } else {
3939           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3940           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3941             CarryAccum =
3942                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3943                     .getReg(0);
3944           }
3945
3946           if (!LocalAccum) {
3947             LocalAccum = getZero32();
3948             HaveCarryOut = false;
3949           }
3950         }
3951
3952         auto Add =
3953             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3954         LocalAccum = Add.getReg(0);
3955         return HaveCarryOut ? Add.getReg(1) : Register();
3956       };
3957
3958   // Build a multiply-add chain to compute
3959   //
3960   //   LocalAccum + (partial products at DstIndex)
3961   //       + (opportunistic subset of CarryIn)
3962   //
3963   // LocalAccum is an array of one or two 32-bit registers that are updated
3964   // in-place. The incoming registers may be null.
3965   //
3966   // In some edge cases, carry-ins can be consumed "for free". In that case,
3967   // the consumed carry bits are removed from CarryIn in-place.
3968   auto buildMadChain =
3969       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3970           -> Carry {
3971         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3972                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3973
3974         Carry CarryOut;
3975         unsigned j0 = 0;
3976
3977         // Use plain 32-bit multiplication for the most significant part of the
3978         // result by default.
3979         if (LocalAccum.size() == 1 &&
3980             (!UsePartialMad64_32 || !CarryIn.empty())) {
3981           do {
3982             // Skip multiplication if one of the operands is 0
3983             unsigned j1 = DstIndex - j0;
3984             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3985               ++j0;
3986               continue;
3987             }
3988             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3989             if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3990               LocalAccum[0] = Mul.getReg(0);
3991             } else {
3992               if (CarryIn.empty()) {
3993                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3994               } else {
3995                 LocalAccum[0] =
3996                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3997                         .getReg(0);
3998                 CarryIn.pop_back();
3999               }
4000             }
4001             ++j0;
4002           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4003         }
4004
4005         // Build full 64-bit multiplies.
4006         if (j0 <= DstIndex) {
4007           bool HaveSmallAccum = false;
4008           Register Tmp;
4009
4010           if (LocalAccum[0]) {
4011             if (LocalAccum.size() == 1) {
4012               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4013               HaveSmallAccum = true;
4014             } else if (LocalAccum[1]) {
4015               Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4016               HaveSmallAccum = false;
4017             } else {
4018               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4019               HaveSmallAccum = true;
4020             }
4021           } else {
4022             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4023             Tmp = getZero64();
4024             HaveSmallAccum = true;
4025           }
4026
4027           do {
4028             unsigned j1 = DstIndex - j0;
4029             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4030               ++j0;
4031               continue;
4032             }
4033             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4034                                     {Src0[j0], Src1[j1], Tmp});
4035             Tmp = Mad.getReg(0);
4036             if (!HaveSmallAccum)
4037               CarryOut.push_back(Mad.getReg(1));
4038             HaveSmallAccum = false;
4039
4040             ++j0;
4041           } while (j0 <= DstIndex);
4042
4043           auto Unmerge = B.buildUnmerge(S32, Tmp);
4044           LocalAccum[0] = Unmerge.getReg(0);
4045           if (LocalAccum.size() > 1)
4046             LocalAccum[1] = Unmerge.getReg(1);
4047         }
4048
4049         return CarryOut;
4050       };
4051
4052   // Outer multiply loop, iterating over destination parts from least
4053   // significant to most significant parts.
4054   //
4055   // The columns of the following diagram correspond to the destination parts
4056   // affected by one iteration of the outer loop (ignoring boundary
4057   // conditions).
4058   //
4059   //   Dest index relative to 2 * i:      1 0 -1
4060   //                                      ------
4061   //   Carries from previous iteration:     e o
4062   //   Even-aligned partial product sum:  E E .
4063   //   Odd-aligned partial product sum:     O O
4064   //
4065   // 'o' is OddCarry, 'e' is EvenCarry.
4066   // EE and OO are computed from partial products via buildMadChain and use
4067   // accumulation where possible and appropriate.
4068   //
4069   Register SeparateOddCarry;
4070   Carry EvenCarry;
4071   Carry OddCarry;
4072
4073   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4074     Carry OddCarryIn = std::move(OddCarry);
4075     Carry EvenCarryIn = std::move(EvenCarry);
4076     OddCarry.clear();
4077     EvenCarry.clear();
4078
4079     // Partial products at offset 2 * i.
4080     if (2 * i < Accum.size()) {
4081       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4082       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4083     }
4084
4085     // Partial products at offset 2 * i - 1.
4086     if (i > 0) {
4087       if (!SeparateOddAlignedProducts) {
4088         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4089         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4090       } else {
4091         bool IsHighest = 2 * i >= Accum.size();
4092         Register SeparateOddOut[2];
4093         auto LocalAccum = MutableArrayRef(SeparateOddOut)
4094                               .take_front(IsHighest ? 1 : 2);
4095         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4096
4097         MachineInstr *Lo;
4098
4099         if (i == 1) {
4100           if (!IsHighest)
4101             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4102           else
4103             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4104         } else {
4105           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4106                             SeparateOddCarry);
4107         }
4108         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4109
4110         if (!IsHighest) {
4111           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4112                                 Lo->getOperand(1).getReg());
4113           Accum[2 * i] = Hi.getReg(0);
4114           SeparateOddCarry = Hi.getReg(1);
4115         }
4116       }
4117     }
4118
4119     // Add in the carries from the previous iteration
4120     if (i > 0) {
4121       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4122         EvenCarryIn.push_back(CarryOut);
4123
4124       if (2 * i < Accum.size()) {
4125         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4126           OddCarry.push_back(CarryOut);
4127       }
4128     }
4129   }
4130 }
4131
4132 // Custom narrowing of wide multiplies using wide multiply-add instructions.
4133 //
4134 // TODO: If the multiply is followed by an addition, we should attempt to
4135 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4136 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4137                                       MachineInstr &MI) const {
4138   assert(ST.hasMad64_32());
4139   assert(MI.getOpcode() == TargetOpcode::G_MUL);
4140
4141   MachineIRBuilder &B = Helper.MIRBuilder;
4142   MachineRegisterInfo &MRI = *B.getMRI();
4143
4144   Register DstReg = MI.getOperand(0).getReg();
4145   Register Src0 = MI.getOperand(1).getReg();
4146   Register Src1 = MI.getOperand(2).getReg();
4147
4148   LLT Ty = MRI.getType(DstReg);
4149   assert(Ty.isScalar());
4150
4151   unsigned Size = Ty.getSizeInBits();
4152   unsigned NumParts = Size / 32;
4153   assert((Size % 32) == 0);
4154   assert(NumParts >= 2);
4155
4156   // Whether to use MAD_64_32 for partial products whose high half is
4157   // discarded. This avoids some ADD instructions but risks false dependency
4158   // stalls on some subtargets in some cases.
4159   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4160
4161   // Whether to compute odd-aligned partial products separately. This is
4162   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4163   // in an even-aligned VGPR.
4164   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4165
4166   LLT S32 = LLT::scalar(32);
4167   SmallVector<Register, 2> Src0Parts, Src1Parts;
4168   for (unsigned i = 0; i < NumParts; ++i) {
4169     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4170     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4171   }
4172   B.buildUnmerge(Src0Parts, Src0);
4173   B.buildUnmerge(Src1Parts, Src1);
4174
4175   SmallVector<Register, 2> AccumRegs(NumParts);
4176   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4177                 SeparateOddAlignedProducts);
4178
4179   B.buildMergeLikeInstr(DstReg, AccumRegs);
4180   MI.eraseFromParent();
4181   return true;
4182 }
4183
4184 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4185 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4186 // case with a single min instruction instead of a compare+select.
4187 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4188                                             MachineRegisterInfo &MRI,
4189                                             MachineIRBuilder &B) const {
4190   Register Dst = MI.getOperand(0).getReg();
4191   Register Src = MI.getOperand(1).getReg();
4192   LLT DstTy = MRI.getType(Dst);
4193   LLT SrcTy = MRI.getType(Src);
4194
4195   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4196                         ? AMDGPU::G_AMDGPU_FFBH_U32
4197                         : AMDGPU::G_AMDGPU_FFBL_B32;
4198   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4199   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4200
4201   MI.eraseFromParent();
4202   return true;
4203 }
4204
4205 bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4206                                                   MachineRegisterInfo &MRI,
4207                                                   MachineIRBuilder &B) const {
4208   Register Dst = MI.getOperand(0).getReg();
4209   Register Src = MI.getOperand(1).getReg();
4210   LLT SrcTy = MRI.getType(Src);
4211   TypeSize NumBits = SrcTy.getSizeInBits();
4212
4213   assert(NumBits < 32u);
4214
4215   auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4216   auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4217   auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4218   auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4219   B.buildTrunc(Dst, Ctlz);
4220   MI.eraseFromParent();
4221   return true;
4222 }
4223
4224 // Check that this is a G_XOR x, -1
4225 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4226   if (MI.getOpcode() != TargetOpcode::G_XOR)
4227     return false;
4228   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4229   return ConstVal && *ConstVal == -1;
4230 }
4231
4232 // Return the use branch instruction, otherwise null if the usage is invalid.
4233 static MachineInstr *
4234 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4235                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4236   Register CondDef = MI.getOperand(0).getReg();
4237   if (!MRI.hasOneNonDBGUse(CondDef))
4238     return nullptr;
4239
4240   MachineBasicBlock *Parent = MI.getParent();
4241   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4242
4243   if (isNot(MRI, *UseMI)) {
4244     Register NegatedCond = UseMI->getOperand(0).getReg();
4245     if (!MRI.hasOneNonDBGUse(NegatedCond))
4246       return nullptr;
4247
4248     // We're deleting the def of this value, so we need to remove it.
4249     eraseInstr(*UseMI, MRI);
4250
4251     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4252     Negated = true;
4253   }
4254
4255   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4256     return nullptr;
4257
4258   // Make sure the cond br is followed by a G_BR, or is the last instruction.
4259   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4260   if (Next == Parent->end()) {
4261     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4262     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4263       return nullptr;
4264     UncondBrTarget = &*NextMBB;
4265   } else {
4266     if (Next->getOpcode() != AMDGPU::G_BR)
4267       return nullptr;
4268     Br = &*Next;
4269     UncondBrTarget = Br->getOperand(0).getMBB();
4270   }
4271
4272   return UseMI;
4273 }
4274
4275 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4276                                          const ArgDescriptor *Arg,
4277                                          const TargetRegisterClass *ArgRC,
4278                                          LLT ArgTy) const {
4279   MCRegister SrcReg = Arg->getRegister();
4280   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4281   assert(DstReg.isVirtual() && "Virtual register expected");
4282
4283   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4284                                              *ArgRC, B.getDebugLoc(), ArgTy);
4285   if (Arg->isMasked()) {
4286     // TODO: Should we try to emit this once in the entry block?
4287     const LLT S32 = LLT::scalar(32);
4288     const unsigned Mask = Arg->getMask();
4289     const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4290
4291     Register AndMaskSrc = LiveIn;
4292
4293     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4294     // 0.
4295     if (Shift != 0) {
4296       auto ShiftAmt = B.buildConstant(S32, Shift);
4297       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4298     }
4299
4300     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4301   } else {
4302     B.buildCopy(DstReg, LiveIn);
4303   }
4304
4305   return true;
4306 }
4307
4308 bool AMDGPULegalizerInfo::loadInputValue(
4309     Register DstReg, MachineIRBuilder &B,
4310     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4311   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4312   const ArgDescriptor *Arg = nullptr;
4313   const TargetRegisterClass *ArgRC;
4314   LLT ArgTy;
4315
4316   CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4317   const ArgDescriptor WorkGroupIDX =
4318       ArgDescriptor::createRegister(AMDGPU::TTMP9);
4319   // If GridZ is not programmed in an entry function then the hardware will set
4320   // it to all zeros, so there is no need to mask the GridY value in the low
4321   // order bits.
4322   const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4323       AMDGPU::TTMP7,
4324       AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4325   const ArgDescriptor WorkGroupIDZ =
4326       ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4327   if (ST.hasArchitectedSGPRs() &&
4328       (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4329     switch (ArgType) {
4330     case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4331       Arg = &WorkGroupIDX;
4332       ArgRC = &AMDGPU::SReg_32RegClass;
4333       ArgTy = LLT::scalar(32);
4334       break;
4335     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4336       Arg = &WorkGroupIDY;
4337       ArgRC = &AMDGPU::SReg_32RegClass;
4338       ArgTy = LLT::scalar(32);
4339       break;
4340     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4341       Arg = &WorkGroupIDZ;
4342       ArgRC = &AMDGPU::SReg_32RegClass;
4343       ArgTy = LLT::scalar(32);
4344       break;
4345     default:
4346       break;
4347     }
4348   }
4349
4350   if (!Arg)
4351     std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4352
4353   if (!Arg) {
4354     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4355       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4356       // case the pointer argument may be missing and we use null.
4357       B.buildConstant(DstReg, 0);
4358       return true;
4359     }
4360
4361     // It's undefined behavior if a function marked with the amdgpu-no-*
4362     // attributes uses the corresponding intrinsic.
4363     B.buildUndef(DstReg);
4364     return true;
4365   }
4366
4367   if (!Arg->isRegister() || !Arg->getRegister().isValid())
4368     return false; // TODO: Handle these
4369   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4370 }
4371
4372 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4373     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4374     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4375   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4376     return false;
4377
4378   MI.eraseFromParent();
4379   return true;
4380 }
4381
4382 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4383                                 int64_t C) {
4384   B.buildConstant(MI.getOperand(0).getReg(), C);
4385   MI.eraseFromParent();
4386   return true;
4387 }
4388
4389 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4390     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4391     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4392   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4393   if (MaxID == 0)
4394     return replaceWithConstant(B, MI, 0);
4395
4396   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4397   const ArgDescriptor *Arg;
4398   const TargetRegisterClass *ArgRC;
4399   LLT ArgTy;
4400   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4401
4402   Register DstReg = MI.getOperand(0).getReg();
4403   if (!Arg) {
4404     // It's undefined behavior if a function marked with the amdgpu-no-*
4405     // attributes uses the corresponding intrinsic.
4406     B.buildUndef(DstReg);
4407     MI.eraseFromParent();
4408     return true;
4409   }
4410
4411   if (Arg->isMasked()) {
4412     // Don't bother inserting AssertZext for packed IDs since we're emitting the
4413     // masking operations anyway.
4414     //
4415     // TODO: We could assert the top bit is 0 for the source copy.
4416     if (!loadInputValue(DstReg, B, ArgType))
4417       return false;
4418   } else {
4419     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4420     if (!loadInputValue(TmpReg, B, ArgType))
4421       return false;
4422     B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4423   }
4424
4425   MI.eraseFromParent();
4426   return true;
4427 }
4428
4429 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4430                                                      int64_t Offset) const {
4431   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4432   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4433
4434   // TODO: If we passed in the base kernel offset we could have a better
4435   // alignment than 4, but we don't really need it.
4436   if (!loadInputValue(KernArgReg, B,
4437                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4438     llvm_unreachable("failed to find kernarg segment ptr");
4439
4440   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4441   // TODO: Should get nuw
4442   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4443 }
4444
4445 /// Legalize a value that's loaded from kernel arguments. This is only used by
4446 /// legacy intrinsics.
4447 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4448                                                       MachineIRBuilder &B,
4449                                                       uint64_t Offset,
4450                                                       Align Alignment) const {
4451   Register DstReg = MI.getOperand(0).getReg();
4452
4453   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4454          "unexpected kernarg parameter type");
4455
4456   Register Ptr = getKernargParameterPtr(B, Offset);
4457   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4458   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4459               MachineMemOperand::MODereferenceable |
4460                   MachineMemOperand::MOInvariant);
4461   MI.eraseFromParent();
4462   return true;
4463 }
4464
4465 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4466                                        MachineRegisterInfo &MRI,
4467                                        MachineIRBuilder &B) const {
4468   Register Dst = MI.getOperand(0).getReg();
4469   LLT DstTy = MRI.getType(Dst);
4470   LLT S16 = LLT::scalar(16);
4471   LLT S32 = LLT::scalar(32);
4472   LLT S64 = LLT::scalar(64);
4473
4474   if (DstTy == S16)
4475     return legalizeFDIV16(MI, MRI, B);
4476   if (DstTy == S32)
4477     return legalizeFDIV32(MI, MRI, B);
4478   if (DstTy == S64)
4479     return legalizeFDIV64(MI, MRI, B);
4480
4481   return false;
4482 }
4483
4484 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4485                                                         Register DstDivReg,
4486                                                         Register DstRemReg,
4487                                                         Register X,
4488                                                         Register Y) const {
4489   const LLT S1 = LLT::scalar(1);
4490   const LLT S32 = LLT::scalar(32);
4491
4492   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4493   // algorithm used here.
4494
4495   // Initial estimate of inv(y).
4496   auto FloatY = B.buildUITOFP(S32, Y);
4497   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4498   auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4499   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4500   auto Z = B.buildFPTOUI(S32, ScaledY);
4501
4502   // One round of UNR.
4503   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4504   auto NegYZ = B.buildMul(S32, NegY, Z);
4505   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4506
4507   // Quotient/remainder estimate.
4508   auto Q = B.buildUMulH(S32, X, Z);
4509   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4510
4511   // First quotient/remainder refinement.
4512   auto One = B.buildConstant(S32, 1);
4513   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4514   if (DstDivReg)
4515     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4516   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4517
4518   // Second quotient/remainder refinement.
4519   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4520   if (DstDivReg)
4521     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4522
4523   if (DstRemReg)
4524     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4525 }
4526
4527 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
4528 //
4529 // Return lo, hi of result
4530 //
4531 // %cvt.lo = G_UITOFP Val.lo
4532 // %cvt.hi = G_UITOFP Val.hi
4533 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4534 // %rcp = G_AMDGPU_RCP_IFLAG %mad
4535 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
4536 // %mul2 = G_FMUL %mul1, 2**(-32)
4537 // %trunc = G_INTRINSIC_TRUNC %mul2
4538 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4539 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4540 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4541                                                        Register Val) {
4542   const LLT S32 = LLT::scalar(32);
4543   auto Unmerge = B.buildUnmerge(S32, Val);
4544
4545   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4546   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4547
4548   auto Mad = B.buildFMAD(
4549       S32, CvtHi, // 2**32
4550       B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4551
4552   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4553   auto Mul1 = B.buildFMul(
4554       S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4555
4556   // 2**(-32)
4557   auto Mul2 = B.buildFMul(
4558       S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4559   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4560
4561   // -(2**32)
4562   auto Mad2 = B.buildFMAD(
4563       S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4564       Mul1);
4565
4566   auto ResultLo = B.buildFPTOUI(S32, Mad2);
4567   auto ResultHi = B.buildFPTOUI(S32, Trunc);
4568
4569   return {ResultLo.getReg(0), ResultHi.getReg(0)};
4570 }
4571
4572 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4573                                                         Register DstDivReg,
4574                                                         Register DstRemReg,
4575                                                         Register Numer,
4576                                                         Register Denom) const {
4577   const LLT S32 = LLT::scalar(32);
4578   const LLT S64 = LLT::scalar(64);
4579   const LLT S1 = LLT::scalar(1);
4580   Register RcpLo, RcpHi;
4581
4582   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4583
4584   auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4585
4586   auto Zero64 = B.buildConstant(S64, 0);
4587   auto NegDenom = B.buildSub(S64, Zero64, Denom);
4588
4589   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4590   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4591
4592   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4593   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4594   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4595
4596   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4597   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4598   auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4599
4600   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4601   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4602   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4603   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4604   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4605
4606   auto Zero32 = B.buildConstant(S32, 0);
4607   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4608   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4609   auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4610
4611   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4612   Register NumerLo = UnmergeNumer.getReg(0);
4613   Register NumerHi = UnmergeNumer.getReg(1);
4614
4615   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4616   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4617   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4618   Register Mul3_Lo = UnmergeMul3.getReg(0);
4619   Register Mul3_Hi = UnmergeMul3.getReg(1);
4620   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4621   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4622   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4623   auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4624
4625   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4626   Register DenomLo = UnmergeDenom.getReg(0);
4627   Register DenomHi = UnmergeDenom.getReg(1);
4628
4629   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4630   auto C1 = B.buildSExt(S32, CmpHi);
4631
4632   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4633   auto C2 = B.buildSExt(S32, CmpLo);
4634
4635   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4636   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4637
4638   // TODO: Here and below portions of the code can be enclosed into if/endif.
4639   // Currently control flow is unconditional and we have 4 selects after
4640   // potential endif to substitute PHIs.
4641
4642   // if C3 != 0 ...
4643   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4644   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4645   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4646   auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4647
4648   auto One64 = B.buildConstant(S64, 1);
4649   auto Add3 = B.buildAdd(S64, MulHi3, One64);
4650
4651   auto C4 =
4652       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4653   auto C5 =
4654       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4655   auto C6 = B.buildSelect(
4656       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4657
4658   // if (C6 != 0)
4659   auto Add4 = B.buildAdd(S64, Add3, One64);
4660   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4661
4662   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4663   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4664   auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4665
4666   // endif C6
4667   // endif C3
4668
4669   if (DstDivReg) {
4670     auto Sel1 = B.buildSelect(
4671         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4672     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4673                   Sel1, MulHi3);
4674   }
4675
4676   if (DstRemReg) {
4677     auto Sel2 = B.buildSelect(
4678         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4679     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4680                   Sel2, Sub1);
4681   }
4682 }
4683
4684 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4685                                                   MachineRegisterInfo &MRI,
4686                                                   MachineIRBuilder &B) const {
4687   Register DstDivReg, DstRemReg;
4688   switch (MI.getOpcode()) {
4689   default:
4690     llvm_unreachable("Unexpected opcode!");
4691   case AMDGPU::G_UDIV: {
4692     DstDivReg = MI.getOperand(0).getReg();
4693     break;
4694   }
4695   case AMDGPU::G_UREM: {
4696     DstRemReg = MI.getOperand(0).getReg();
4697     break;
4698   }
4699   case AMDGPU::G_UDIVREM: {
4700     DstDivReg = MI.getOperand(0).getReg();
4701     DstRemReg = MI.getOperand(1).getReg();
4702     break;
4703   }
4704   }
4705
4706   const LLT S64 = LLT::scalar(64);
4707   const LLT S32 = LLT::scalar(32);
4708   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4709   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4710   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4711   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4712
4713   if (Ty == S32)
4714     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4715   else if (Ty == S64)
4716     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4717   else
4718     return false;
4719
4720   MI.eraseFromParent();
4721   return true;
4722 }
4723
4724 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4725                                                 MachineRegisterInfo &MRI,
4726                                                 MachineIRBuilder &B) const {
4727   const LLT S64 = LLT::scalar(64);
4728   const LLT S32 = LLT::scalar(32);
4729
4730   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4731   if (Ty != S32 && Ty != S64)
4732     return false;
4733
4734   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4735   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4736   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4737
4738   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4739   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4740   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4741
4742   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4743   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4744
4745   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4746   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4747
4748   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4749   switch (MI.getOpcode()) {
4750   default:
4751     llvm_unreachable("Unexpected opcode!");
4752   case AMDGPU::G_SDIV: {
4753     DstDivReg = MI.getOperand(0).getReg();
4754     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4755     break;
4756   }
4757   case AMDGPU::G_SREM: {
4758     DstRemReg = MI.getOperand(0).getReg();
4759     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4760     break;
4761   }
4762   case AMDGPU::G_SDIVREM: {
4763     DstDivReg = MI.getOperand(0).getReg();
4764     DstRemReg = MI.getOperand(1).getReg();
4765     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4766     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4767     break;
4768   }
4769   }
4770
4771   if (Ty == S32)
4772     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4773   else
4774     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4775
4776   if (DstDivReg) {
4777     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4778     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4779     B.buildSub(DstDivReg, SignXor, Sign);
4780   }
4781
4782   if (DstRemReg) {
4783     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4784     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4785     B.buildSub(DstRemReg, SignXor, Sign);
4786   }
4787
4788   MI.eraseFromParent();
4789   return true;
4790 }
4791
4792 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4793                                                  MachineRegisterInfo &MRI,
4794                                                  MachineIRBuilder &B) const {
4795   Register Res = MI.getOperand(0).getReg();
4796   Register LHS = MI.getOperand(1).getReg();
4797   Register RHS = MI.getOperand(2).getReg();
4798   uint16_t Flags = MI.getFlags();
4799   LLT ResTy = MRI.getType(Res);
4800
4801   const MachineFunction &MF = B.getMF();
4802   bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4803                             MF.getTarget().Options.UnsafeFPMath;
4804
4805   if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
4806     if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4807       return false;
4808
4809     // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4810     // the CI documentation has a worst case error of 1 ulp.
4811     // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4812     // use it as long as we aren't trying to use denormals.
4813     //
4814     // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
4815
4816     // 1 / x -> RCP(x)
4817     if (CLHS->isExactlyValue(1.0)) {
4818       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4819           .addUse(RHS)
4820           .setMIFlags(Flags);
4821
4822       MI.eraseFromParent();
4823       return true;
4824     }
4825
4826     // -1 / x -> RCP( FNEG(x) )
4827     if (CLHS->isExactlyValue(-1.0)) {
4828       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4829       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4830           .addUse(FNeg.getReg(0))
4831           .setMIFlags(Flags);
4832
4833       MI.eraseFromParent();
4834       return true;
4835     }
4836   }
4837
4838   // For f16 require afn or arcp.
4839   // For f32 require afn.
4840   if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4841                               !MI.getFlag(MachineInstr::FmArcp)))
4842     return false;
4843
4844   // x / y -> x * (1.0 / y)
4845   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4846                  .addUse(RHS)
4847                  .setMIFlags(Flags);
4848   B.buildFMul(Res, LHS, RCP, Flags);
4849
4850   MI.eraseFromParent();
4851   return true;
4852 }
4853
4854 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4855                                                    MachineRegisterInfo &MRI,
4856                                                    MachineIRBuilder &B) const {
4857   Register Res = MI.getOperand(0).getReg();
4858   Register X = MI.getOperand(1).getReg();
4859   Register Y = MI.getOperand(2).getReg();
4860   uint16_t Flags = MI.getFlags();
4861   LLT ResTy = MRI.getType(Res);
4862
4863   const MachineFunction &MF = B.getMF();
4864   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4865                             MI.getFlag(MachineInstr::FmAfn);
4866
4867   if (!AllowInaccurateRcp)
4868     return false;
4869
4870   auto NegY = B.buildFNeg(ResTy, Y);
4871   auto One = B.buildFConstant(ResTy, 1.0);
4872
4873   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4874                .addUse(Y)
4875                .setMIFlags(Flags);
4876
4877   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4878   R = B.buildFMA(ResTy, Tmp0, R, R);
4879
4880   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4881   R = B.buildFMA(ResTy, Tmp1, R, R);
4882
4883   auto Ret = B.buildFMul(ResTy, X, R);
4884   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4885
4886   B.buildFMA(Res, Tmp2, R, Ret);
4887   MI.eraseFromParent();
4888   return true;
4889 }
4890
4891 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4892                                          MachineRegisterInfo &MRI,
4893                                          MachineIRBuilder &B) const {
4894   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4895     return true;
4896
4897   Register Res = MI.getOperand(0).getReg();
4898   Register LHS = MI.getOperand(1).getReg();
4899   Register RHS = MI.getOperand(2).getReg();
4900
4901   uint16_t Flags = MI.getFlags();
4902
4903   LLT S16 = LLT::scalar(16);
4904   LLT S32 = LLT::scalar(32);
4905
4906   // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
4907   // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
4908   // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
4909   // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
4910   // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4911   // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
4912   // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4913   // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
4914   // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
4915   // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
4916   // q16.u = opx(V_CVT_F16_F32, q32.u);
4917   // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
4918
4919   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4920   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4921   auto NegRHSExt = B.buildFNeg(S32, RHSExt);
4922   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4923                  .addUse(RHSExt.getReg(0))
4924                  .setMIFlags(Flags);
4925   auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
4926   MachineInstrBuilder Err;
4927   if (ST.hasMadMacF32Insts()) {
4928     Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
4929     Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
4930     Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
4931   } else {
4932     Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
4933     Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
4934     Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
4935   }
4936   auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
4937   Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
4938   Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
4939   auto RDst = B.buildFPTrunc(S16, Quot, Flags);
4940   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4941       .addUse(RDst.getReg(0))
4942       .addUse(RHS)
4943       .addUse(LHS)
4944       .setMIFlags(Flags);
4945
4946   MI.eraseFromParent();
4947   return true;
4948 }
4949
4950 static constexpr unsigned SPDenormModeBitField =
4951     AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
4952
4953 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4954 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
4955 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4956                                const GCNSubtarget &ST,
4957                                SIModeRegisterDefaults Mode) {
4958   // Set SP denorm mode to this value.
4959   unsigned SPDenormMode =
4960     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4961
4962   if (ST.hasDenormModeInst()) {
4963     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4964     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4965
4966     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4967     B.buildInstr(AMDGPU::S_DENORM_MODE)
4968       .addImm(NewDenormModeValue);
4969
4970   } else {
4971     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4972       .addImm(SPDenormMode)
4973       .addImm(SPDenormModeBitField);
4974   }
4975 }
4976
4977 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4978                                          MachineRegisterInfo &MRI,
4979                                          MachineIRBuilder &B) const {
4980   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4981     return true;
4982
4983   Register Res = MI.getOperand(0).getReg();
4984   Register LHS = MI.getOperand(1).getReg();
4985   Register RHS = MI.getOperand(2).getReg();
4986   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4987   SIModeRegisterDefaults Mode = MFI->getMode();
4988
4989   uint16_t Flags = MI.getFlags();
4990
4991   LLT S32 = LLT::scalar(32);
4992   LLT S1 = LLT::scalar(1);
4993
4994   auto One = B.buildFConstant(S32, 1.0f);
4995
4996   auto DenominatorScaled =
4997       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4998           .addUse(LHS)
4999           .addUse(RHS)
5000           .addImm(0)
5001           .setMIFlags(Flags);
5002   auto NumeratorScaled =
5003       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5004           .addUse(LHS)
5005           .addUse(RHS)
5006           .addImm(1)
5007           .setMIFlags(Flags);
5008
5009   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5010                        .addUse(DenominatorScaled.getReg(0))
5011                        .setMIFlags(Flags);
5012   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5013
5014   const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5015   const bool HasDynamicDenormals =
5016       (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5017       (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5018
5019   Register SavedSPDenormMode;
5020   if (!PreservesDenormals) {
5021     if (HasDynamicDenormals) {
5022       SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5023       B.buildInstr(AMDGPU::S_GETREG_B32)
5024           .addDef(SavedSPDenormMode)
5025           .addImm(SPDenormModeBitField);
5026     }
5027     toggleSPDenormMode(true, B, ST, Mode);
5028   }
5029
5030   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5031   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5032   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5033   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5034   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5035   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5036
5037   if (!PreservesDenormals) {
5038     if (HasDynamicDenormals) {
5039       assert(SavedSPDenormMode);
5040       B.buildInstr(AMDGPU::S_SETREG_B32)
5041           .addReg(SavedSPDenormMode)
5042           .addImm(SPDenormModeBitField);
5043     } else
5044       toggleSPDenormMode(false, B, ST, Mode);
5045   }
5046
5047   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5048                   .addUse(Fma4.getReg(0))
5049                   .addUse(Fma1.getReg(0))
5050                   .addUse(Fma3.getReg(0))
5051                   .addUse(NumeratorScaled.getReg(1))
5052                   .setMIFlags(Flags);
5053
5054   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5055       .addUse(Fmas.getReg(0))
5056       .addUse(RHS)
5057       .addUse(LHS)
5058       .setMIFlags(Flags);
5059
5060   MI.eraseFromParent();
5061   return true;
5062 }
5063
5064 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5065                                          MachineRegisterInfo &MRI,
5066                                          MachineIRBuilder &B) const {
5067   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5068     return true;
5069
5070   Register Res = MI.getOperand(0).getReg();
5071   Register LHS = MI.getOperand(1).getReg();
5072   Register RHS = MI.getOperand(2).getReg();
5073
5074   uint16_t Flags = MI.getFlags();
5075
5076   LLT S64 = LLT::scalar(64);
5077   LLT S1 = LLT::scalar(1);
5078
5079   auto One = B.buildFConstant(S64, 1.0);
5080
5081   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5082                        .addUse(LHS)
5083                        .addUse(RHS)
5084                        .addImm(0)
5085                        .setMIFlags(Flags);
5086
5087   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5088
5089   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5090                  .addUse(DivScale0.getReg(0))
5091                  .setMIFlags(Flags);
5092
5093   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5094   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5095   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5096
5097   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5098                        .addUse(LHS)
5099                        .addUse(RHS)
5100                        .addImm(1)
5101                        .setMIFlags(Flags);
5102
5103   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5104   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5105   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5106
5107   Register Scale;
5108   if (!ST.hasUsableDivScaleConditionOutput()) {
5109     // Workaround a hardware bug on SI where the condition output from div_scale
5110     // is not usable.
5111
5112     LLT S32 = LLT::scalar(32);
5113
5114     auto NumUnmerge = B.buildUnmerge(S32, LHS);
5115     auto DenUnmerge = B.buildUnmerge(S32, RHS);
5116     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5117     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5118
5119     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5120                               Scale1Unmerge.getReg(1));
5121     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5122                               Scale0Unmerge.getReg(1));
5123     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5124   } else {
5125     Scale = DivScale1.getReg(1);
5126   }
5127
5128   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5129                   .addUse(Fma4.getReg(0))
5130                   .addUse(Fma3.getReg(0))
5131                   .addUse(Mul.getReg(0))
5132                   .addUse(Scale)
5133                   .setMIFlags(Flags);
5134
5135   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5136       .addUse(Fmas.getReg(0))
5137       .addUse(RHS)
5138       .addUse(LHS)
5139       .setMIFlags(Flags);
5140
5141   MI.eraseFromParent();
5142   return true;
5143 }
5144
5145 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5146                                          MachineRegisterInfo &MRI,
5147                                          MachineIRBuilder &B) const {
5148   Register Res0 = MI.getOperand(0).getReg();
5149   Register Res1 = MI.getOperand(1).getReg();
5150   Register Val = MI.getOperand(2).getReg();
5151   uint16_t Flags = MI.getFlags();
5152
5153   LLT Ty = MRI.getType(Res0);
5154   LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5155
5156   auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5157                   .addUse(Val)
5158                   .setMIFlags(Flags);
5159   auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5160                  .addUse(Val)
5161                  .setMIFlags(Flags);
5162
5163   if (ST.hasFractBug()) {
5164     auto Fabs = B.buildFAbs(Ty, Val);
5165     auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5166     auto IsFinite =
5167         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5168     auto Zero = B.buildConstant(InstrExpTy, 0);
5169     Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5170     Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5171   }
5172
5173   B.buildCopy(Res0, Mant);
5174   B.buildSExtOrTrunc(Res1, Exp);
5175
5176   MI.eraseFromParent();
5177   return true;
5178 }
5179
5180 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5181                                                  MachineRegisterInfo &MRI,
5182                                                  MachineIRBuilder &B) const {
5183   Register Res = MI.getOperand(0).getReg();
5184   Register LHS = MI.getOperand(2).getReg();
5185   Register RHS = MI.getOperand(3).getReg();
5186   uint16_t Flags = MI.getFlags();
5187
5188   LLT S32 = LLT::scalar(32);
5189   LLT S1 = LLT::scalar(1);
5190
5191   auto Abs = B.buildFAbs(S32, RHS, Flags);
5192   const APFloat C0Val(1.0f);
5193
5194   auto C0 = B.buildFConstant(S32, 0x1p+96f);
5195   auto C1 = B.buildFConstant(S32, 0x1p-32f);
5196   auto C2 = B.buildFConstant(S32, 1.0f);
5197
5198   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5199   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5200
5201   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5202
5203   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5204                  .addUse(Mul0.getReg(0))
5205                  .setMIFlags(Flags);
5206
5207   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5208
5209   B.buildFMul(Res, Sel, Mul1, Flags);
5210
5211   MI.eraseFromParent();
5212   return true;
5213 }
5214
5215 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5216                                            MachineRegisterInfo &MRI,
5217                                            MachineIRBuilder &B) const {
5218   // Bypass the correct expansion a standard promotion through G_FSQRT would
5219   // get. The f32 op is accurate enough for the f16 cas.
5220   unsigned Flags = MI.getFlags();
5221   assert(!ST.has16BitInsts());
5222   const LLT F32 = LLT::scalar(32);
5223   auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5224   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5225     .addUse(Ext.getReg(0))
5226     .setMIFlags(Flags);
5227   B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5228   MI.eraseFromParent();
5229   return true;
5230 }
5231
5232 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5233                                            MachineRegisterInfo &MRI,
5234                                            MachineIRBuilder &B) const {
5235   MachineFunction &MF = B.getMF();
5236   Register Dst = MI.getOperand(0).getReg();
5237   Register X = MI.getOperand(1).getReg();
5238   const unsigned Flags = MI.getFlags();
5239   const LLT S1 = LLT::scalar(1);
5240   const LLT F32 = LLT::scalar(32);
5241   const LLT I32 = LLT::scalar(32);
5242
5243   if (allowApproxFunc(MF, Flags)) {
5244     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5245       .addUse(X)
5246       .setMIFlags(Flags);
5247     MI.eraseFromParent();
5248     return true;
5249   }
5250
5251   auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5252   auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5253   auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5254   auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5255   auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5256
5257   Register SqrtS = MRI.createGenericVirtualRegister(F32);
5258   if (needsDenormHandlingF32(MF, X, Flags)) {
5259     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5260       .addUse(SqrtX.getReg(0))
5261       .setMIFlags(Flags);
5262
5263     auto NegOne = B.buildConstant(I32, -1);
5264     auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5265
5266     auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5267     auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5268
5269     auto PosOne = B.buildConstant(I32, 1);
5270     auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5271
5272     auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5273     auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5274
5275     auto Zero = B.buildFConstant(F32, 0.0f);
5276     auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5277
5278     SqrtS =
5279         B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5280
5281     auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5282     SqrtS =
5283         B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5284   } else {
5285     auto SqrtR =
5286         B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5287     B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5288
5289     auto Half = B.buildFConstant(F32, 0.5f);
5290     auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5291     auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5292     auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5293     SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5294     SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5295     auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5296     auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5297     SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5298   }
5299
5300   auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5301
5302   auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5303
5304   SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5305
5306   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5307   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5308
5309   MI.eraseFromParent();
5310   return true;
5311 }
5312
5313 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5314                                            MachineRegisterInfo &MRI,
5315                                            MachineIRBuilder &B) const {
5316   // For double type, the SQRT and RSQ instructions don't have required
5317   // precision, we apply Goldschmidt's algorithm to improve the result:
5318   //
5319   //   y0 = rsq(x)
5320   //   g0 = x * y0
5321   //   h0 = 0.5 * y0
5322   //
5323   //   r0 = 0.5 - h0 * g0
5324   //   g1 = g0 * r0 + g0
5325   //   h1 = h0 * r0 + h0
5326   //
5327   //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5328   //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
5329   //   h2 = h1 * r1 + h1
5330   //
5331   //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5332   //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
5333   //
5334   //   sqrt(x) = g3
5335
5336   const LLT S1 = LLT::scalar(1);
5337   const LLT S32 = LLT::scalar(32);
5338   const LLT F64 = LLT::scalar(64);
5339
5340   Register Dst = MI.getOperand(0).getReg();
5341   assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5342
5343   Register X = MI.getOperand(1).getReg();
5344   unsigned Flags = MI.getFlags();
5345
5346   auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5347
5348   auto ZeroInt = B.buildConstant(S32, 0);
5349   auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5350
5351   // Scale up input if it is too small.
5352   auto ScaleUpFactor = B.buildConstant(S32, 256);
5353   auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5354   auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5355
5356   auto SqrtY =
5357       B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5358
5359   auto Half = B.buildFConstant(F64, 0.5);
5360   auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5361   auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5362
5363   auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5364   auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5365
5366   auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5367   auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5368
5369   auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5370   auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5371
5372   auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5373
5374   auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5375   auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5376
5377   auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5378
5379   // Scale down the result.
5380   auto ScaleDownFactor = B.buildConstant(S32, -128);
5381   auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5382   SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5383
5384   // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5385   // with finite only or nsz because rsq(+/-0) = +/-inf
5386
5387   // TODO: Check for DAZ and expand to subnormals
5388   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5389
5390   // If x is +INF, +0, or -0, use its original value
5391   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5392
5393   MI.eraseFromParent();
5394   return true;
5395 }
5396
5397 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5398                                         MachineRegisterInfo &MRI,
5399                                         MachineIRBuilder &B) const {
5400   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5401   if (Ty == LLT::scalar(32))
5402     return legalizeFSQRTF32(MI, MRI, B);
5403   if (Ty == LLT::scalar(64))
5404     return legalizeFSQRTF64(MI, MRI, B);
5405   if (Ty == LLT::scalar(16))
5406     return legalizeFSQRTF16(MI, MRI, B);
5407   return false;
5408 }
5409
5410 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5411 // FIXME: Why do we handle this one but not other removed instructions?
5412 //
5413 // Reciprocal square root.  The clamp prevents infinite results, clamping
5414 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
5415 // +-max_float.
5416 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5417                                                     MachineRegisterInfo &MRI,
5418                                                     MachineIRBuilder &B) const {
5419   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5420     return true;
5421
5422   Register Dst = MI.getOperand(0).getReg();
5423   Register Src = MI.getOperand(2).getReg();
5424   auto Flags = MI.getFlags();
5425
5426   LLT Ty = MRI.getType(Dst);
5427
5428   const fltSemantics *FltSemantics;
5429   if (Ty == LLT::scalar(32))
5430     FltSemantics = &APFloat::IEEEsingle();
5431   else if (Ty == LLT::scalar(64))
5432     FltSemantics = &APFloat::IEEEdouble();
5433   else
5434     return false;
5435
5436   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5437                  .addUse(Src)
5438                  .setMIFlags(Flags);
5439
5440   // We don't need to concern ourselves with the snan handling difference, since
5441   // the rsq quieted (or not) so use the one which will directly select.
5442   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5443   const bool UseIEEE = MFI->getMode().IEEE;
5444
5445   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5446   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5447                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5448
5449   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5450
5451   if (UseIEEE)
5452     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5453   else
5454     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5455   MI.eraseFromParent();
5456   return true;
5457 }
5458
5459 // TODO: Fix pointer type handling
5460 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5461                                          MachineInstr &MI,
5462                                          Intrinsic::ID IID) const {
5463
5464   MachineIRBuilder &B = Helper.MIRBuilder;
5465   MachineRegisterInfo &MRI = *B.getMRI();
5466
5467   bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5468                       IID == Intrinsic::amdgcn_permlanex16;
5469   bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5470                        IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5471
5472   auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5473                                       Register Src2, LLT VT) -> Register {
5474     auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5475     switch (IID) {
5476     case Intrinsic::amdgcn_readfirstlane:
5477     case Intrinsic::amdgcn_permlane64:
5478       return LaneOp.getReg(0);
5479     case Intrinsic::amdgcn_readlane:
5480     case Intrinsic::amdgcn_set_inactive:
5481     case Intrinsic::amdgcn_set_inactive_chain_arg:
5482       return LaneOp.addUse(Src1).getReg(0);
5483     case Intrinsic::amdgcn_writelane:
5484       return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5485     case Intrinsic::amdgcn_permlane16:
5486     case Intrinsic::amdgcn_permlanex16: {
5487       Register Src3 = MI.getOperand(5).getReg();
5488       Register Src4 = MI.getOperand(6).getImm();
5489       Register Src5 = MI.getOperand(7).getImm();
5490       return LaneOp.addUse(Src1)
5491           .addUse(Src2)
5492           .addUse(Src3)
5493           .addImm(Src4)
5494           .addImm(Src5)
5495           .getReg(0);
5496     }
5497     case Intrinsic::amdgcn_mov_dpp8:
5498       return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
5499     case Intrinsic::amdgcn_update_dpp:
5500       return LaneOp.addUse(Src1)
5501           .addImm(MI.getOperand(4).getImm())
5502           .addImm(MI.getOperand(5).getImm())
5503           .addImm(MI.getOperand(6).getImm())
5504           .addImm(MI.getOperand(7).getImm())
5505           .getReg(0);
5506     default:
5507       llvm_unreachable("unhandled lane op");
5508     }
5509   };
5510
5511   Register DstReg = MI.getOperand(0).getReg();
5512   Register Src0 = MI.getOperand(2).getReg();
5513   Register Src1, Src2;
5514   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5515       IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5516     Src1 = MI.getOperand(3).getReg();
5517     if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5518       Src2 = MI.getOperand(4).getReg();
5519     }
5520   }
5521
5522   LLT Ty = MRI.getType(DstReg);
5523   unsigned Size = Ty.getSizeInBits();
5524
5525   unsigned SplitSize = 32;
5526   if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5527       ST.hasDPALU_DPP() &&
5528       AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm()))
5529     SplitSize = 64;
5530
5531   if (Size == SplitSize) {
5532     // Already legal
5533     return true;
5534   }
5535
5536   if (Size < 32) {
5537     Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5538
5539     if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5540       Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5541
5542     if (IID == Intrinsic::amdgcn_writelane)
5543       Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5544
5545     Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5546     B.buildTrunc(DstReg, LaneOpDst);
5547     MI.eraseFromParent();
5548     return true;
5549   }
5550
5551   if (Size % SplitSize != 0)
5552     return false;
5553
5554   LLT PartialResTy = LLT::scalar(SplitSize);
5555   if (Ty.isVector()) {
5556     LLT EltTy = Ty.getElementType();
5557     unsigned EltSize = EltTy.getSizeInBits();
5558     if (EltSize == SplitSize) {
5559       PartialResTy = EltTy;
5560     } else if (EltSize == 16 || EltSize == 32) {
5561       unsigned NElem = SplitSize / EltSize;
5562       PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
5563     }
5564     // Handle all other cases via S32/S64 pieces;
5565   }
5566
5567   SmallVector<Register, 4> PartialRes;
5568   unsigned NumParts = Size / SplitSize;
5569   MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5570   MachineInstrBuilder Src1Parts, Src2Parts;
5571
5572   if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5573     Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5574
5575   if (IID == Intrinsic::amdgcn_writelane)
5576     Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5577
5578   for (unsigned i = 0; i < NumParts; ++i) {
5579     Src0 = Src0Parts.getReg(i);
5580
5581     if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5582       Src1 = Src1Parts.getReg(i);
5583
5584     if (IID == Intrinsic::amdgcn_writelane)
5585       Src2 = Src2Parts.getReg(i);
5586
5587     PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5588   }
5589
5590   B.buildMergeLikeInstr(DstReg, PartialRes);
5591   MI.eraseFromParent();
5592   return true;
5593 }
5594
5595 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5596                                             MachineRegisterInfo &MRI,
5597                                             MachineIRBuilder &B) const {
5598   uint64_t Offset =
5599     ST.getTargetLowering()->getImplicitParameterOffset(
5600       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5601   LLT DstTy = MRI.getType(DstReg);
5602   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5603
5604   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5605   if (!loadInputValue(KernargPtrReg, B,
5606                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5607     return false;
5608
5609   // FIXME: This should be nuw
5610   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5611   return true;
5612 }
5613
5614 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5615 /// bits of the pointer and replace them with the stride argument, then
5616 /// merge_values everything together. In the common case of a raw buffer (the
5617 /// stride component is 0), we can just AND off the upper half.
5618 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5619     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5620   Register Result = MI.getOperand(0).getReg();
5621   Register Pointer = MI.getOperand(2).getReg();
5622   Register Stride = MI.getOperand(3).getReg();
5623   Register NumRecords = MI.getOperand(4).getReg();
5624   Register Flags = MI.getOperand(5).getReg();
5625
5626   LLT S32 = LLT::scalar(32);
5627
5628   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5629   auto Unmerge = B.buildUnmerge(S32, Pointer);
5630   Register LowHalf = Unmerge.getReg(0);
5631   Register HighHalf = Unmerge.getReg(1);
5632
5633   auto AndMask = B.buildConstant(S32, 0x0000ffff);
5634   auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5635
5636   MachineInstrBuilder NewHighHalf = Masked;
5637   std::optional<ValueAndVReg> StrideConst =
5638       getIConstantVRegValWithLookThrough(Stride, MRI);
5639   if (!StrideConst || !StrideConst->Value.isZero()) {
5640     MachineInstrBuilder ShiftedStride;
5641     if (StrideConst) {
5642       uint32_t StrideVal = StrideConst->Value.getZExtValue();
5643       uint32_t ShiftedStrideVal = StrideVal << 16;
5644       ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5645     } else {
5646       auto ExtStride = B.buildAnyExt(S32, Stride);
5647       auto ShiftConst = B.buildConstant(S32, 16);
5648       ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5649     }
5650     NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5651   }
5652   Register NewHighHalfReg = NewHighHalf.getReg(0);
5653   B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5654   MI.eraseFromParent();
5655   return true;
5656 }
5657
5658 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5659                                                  MachineRegisterInfo &MRI,
5660                                                  MachineIRBuilder &B) const {
5661   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5662   if (!MFI->isEntryFunction()) {
5663     return legalizePreloadedArgIntrin(MI, MRI, B,
5664                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5665   }
5666
5667   Register DstReg = MI.getOperand(0).getReg();
5668   if (!getImplicitArgPtr(DstReg, MRI, B))
5669     return false;
5670
5671   MI.eraseFromParent();
5672   return true;
5673 }
5674
5675 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5676                                          MachineRegisterInfo &MRI,
5677                                          MachineIRBuilder &B) const {
5678   Function &F = B.getMF().getFunction();
5679   std::optional<uint32_t> KnownSize =
5680       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5681   if (KnownSize.has_value())
5682     B.buildConstant(DstReg, *KnownSize);
5683   return false;
5684 }
5685
5686 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5687                                               MachineRegisterInfo &MRI,
5688                                               MachineIRBuilder &B) const {
5689
5690   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5691   if (!MFI->isEntryFunction()) {
5692     return legalizePreloadedArgIntrin(MI, MRI, B,
5693                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5694   }
5695
5696   Register DstReg = MI.getOperand(0).getReg();
5697   if (!getLDSKernelId(DstReg, MRI, B))
5698     return false;
5699
5700   MI.eraseFromParent();
5701   return true;
5702 }
5703
5704 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5705                                               MachineRegisterInfo &MRI,
5706                                               MachineIRBuilder &B,
5707                                               unsigned AddrSpace) const {
5708   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5709   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5710   Register Hi32 = Unmerge.getReg(1);
5711
5712   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5713   MI.eraseFromParent();
5714   return true;
5715 }
5716
5717 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5718 // offset (the offset that is included in bounds checking and swizzling, to be
5719 // split between the instruction's voffset and immoffset fields) and soffset
5720 // (the offset that is excluded from bounds checking and swizzling, to go in
5721 // the instruction's soffset field).  This function takes the first kind of
5722 // offset and figures out how to split it between voffset and immoffset.
5723 std::pair<Register, unsigned>
5724 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5725                                         Register OrigOffset) const {
5726   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5727   Register BaseReg;
5728   unsigned ImmOffset;
5729   const LLT S32 = LLT::scalar(32);
5730   MachineRegisterInfo &MRI = *B.getMRI();
5731
5732   std::tie(BaseReg, ImmOffset) =
5733       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
5734
5735   // If BaseReg is a pointer, convert it to int.
5736   if (MRI.getType(BaseReg).isPointer())
5737     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5738
5739   // If the immediate value is too big for the immoffset field, put only bits
5740   // that would normally fit in the immoffset field. The remaining value that
5741   // is copied/added for the voffset field is a large power of 2, and it
5742   // stands more chance of being CSEd with the copy/add for another similar
5743   // load/store.
5744   // However, do not do that rounding down if that is a negative
5745   // number, as it appears to be illegal to have a negative offset in the
5746   // vgpr, even if adding the immediate offset makes it positive.
5747   unsigned Overflow = ImmOffset & ~MaxImm;
5748   ImmOffset -= Overflow;
5749   if ((int32_t)Overflow < 0) {
5750     Overflow += ImmOffset;
5751     ImmOffset = 0;
5752   }
5753
5754   if (Overflow != 0) {
5755     if (!BaseReg) {
5756       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5757     } else {
5758       auto OverflowVal = B.buildConstant(S32, Overflow);
5759       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5760     }
5761   }
5762
5763   if (!BaseReg)
5764     BaseReg = B.buildConstant(S32, 0).getReg(0);
5765
5766   return std::pair(BaseReg, ImmOffset);
5767 }
5768
5769 /// Handle register layout difference for f16 images for some subtargets.
5770 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5771                                              MachineRegisterInfo &MRI,
5772                                              Register Reg,
5773                                              bool ImageStore) const {
5774   const LLT S16 = LLT::scalar(16);
5775   const LLT S32 = LLT::scalar(32);
5776   LLT StoreVT = MRI.getType(Reg);
5777   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5778
5779   if (ST.hasUnpackedD16VMem()) {
5780     auto Unmerge = B.buildUnmerge(S16, Reg);
5781
5782     SmallVector<Register, 4> WideRegs;
5783     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5784       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5785
5786     int NumElts = StoreVT.getNumElements();
5787
5788     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5789         .getReg(0);
5790   }
5791
5792   if (ImageStore && ST.hasImageStoreD16Bug()) {
5793     if (StoreVT.getNumElements() == 2) {
5794       SmallVector<Register, 4> PackedRegs;
5795       Reg = B.buildBitcast(S32, Reg).getReg(0);
5796       PackedRegs.push_back(Reg);
5797       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5798       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5799           .getReg(0);
5800     }
5801
5802     if (StoreVT.getNumElements() == 3) {
5803       SmallVector<Register, 4> PackedRegs;
5804       auto Unmerge = B.buildUnmerge(S16, Reg);
5805       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5806         PackedRegs.push_back(Unmerge.getReg(I));
5807       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5808       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5809       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5810     }
5811
5812     if (StoreVT.getNumElements() == 4) {
5813       SmallVector<Register, 4> PackedRegs;
5814       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5815       auto Unmerge = B.buildUnmerge(S32, Reg);
5816       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5817         PackedRegs.push_back(Unmerge.getReg(I));
5818       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5819       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5820           .getReg(0);
5821     }
5822
5823     llvm_unreachable("invalid data type");
5824   }
5825
5826   if (StoreVT == LLT::fixed_vector(3, S16)) {
5827     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5828               .getReg(0);
5829   }
5830   return Reg;
5831 }
5832
5833 Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
5834                                                  Register VData, LLT MemTy,
5835                                                  bool IsFormat) const {
5836   MachineRegisterInfo *MRI = B.getMRI();
5837   LLT Ty = MRI->getType(VData);
5838
5839   const LLT S16 = LLT::scalar(16);
5840
5841   // Fixup buffer resources themselves needing to be v4i128.
5842   if (hasBufferRsrcWorkaround(Ty))
5843     return castBufferRsrcToV4I32(VData, B);
5844
5845   if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
5846     Ty = getBitcastRegisterType(Ty);
5847     VData = B.buildBitcast(Ty, VData).getReg(0);
5848   }
5849   // Fixup illegal register types for i8 stores.
5850   if (Ty == LLT::scalar(8) || Ty == S16) {
5851     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5852     return AnyExt;
5853   }
5854
5855   if (Ty.isVector()) {
5856     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5857       if (IsFormat)
5858         return handleD16VData(B, *MRI, VData);
5859     }
5860   }
5861
5862   return VData;
5863 }
5864
5865 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5866                                               LegalizerHelper &Helper,
5867                                               bool IsTyped,
5868                                               bool IsFormat) const {
5869   MachineIRBuilder &B = Helper.MIRBuilder;
5870   MachineRegisterInfo &MRI = *B.getMRI();
5871
5872   Register VData = MI.getOperand(1).getReg();
5873   LLT Ty = MRI.getType(VData);
5874   LLT EltTy = Ty.getScalarType();
5875   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5876   const LLT S32 = LLT::scalar(32);
5877
5878   MachineMemOperand *MMO = *MI.memoperands_begin();
5879   const int MemSize = MMO->getSize().getValue();
5880   LLT MemTy = MMO->getMemoryType();
5881
5882   VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
5883
5884   castBufferRsrcArgToV4I32(MI, B, 2);
5885   Register RSrc = MI.getOperand(2).getReg();
5886
5887   unsigned ImmOffset;
5888
5889   // The typed intrinsics add an immediate after the registers.
5890   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5891
5892   // The struct intrinsic variants add one additional operand over raw.
5893   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5894   Register VIndex;
5895   int OpOffset = 0;
5896   if (HasVIndex) {
5897     VIndex = MI.getOperand(3).getReg();
5898     OpOffset = 1;
5899   } else {
5900     VIndex = B.buildConstant(S32, 0).getReg(0);
5901   }
5902
5903   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5904   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5905
5906   unsigned Format = 0;
5907   if (IsTyped) {
5908     Format = MI.getOperand(5 + OpOffset).getImm();
5909     ++OpOffset;
5910   }
5911
5912   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5913
5914   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5915
5916   unsigned Opc;
5917   if (IsTyped) {
5918     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5919                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5920   } else if (IsFormat) {
5921     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5922                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5923   } else {
5924     switch (MemSize) {
5925     case 1:
5926       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5927       break;
5928     case 2:
5929       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5930       break;
5931     default:
5932       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5933       break;
5934     }
5935   }
5936
5937   auto MIB = B.buildInstr(Opc)
5938     .addUse(VData)              // vdata
5939     .addUse(RSrc)               // rsrc
5940     .addUse(VIndex)             // vindex
5941     .addUse(VOffset)            // voffset
5942     .addUse(SOffset)            // soffset
5943     .addImm(ImmOffset);         // offset(imm)
5944
5945   if (IsTyped)
5946     MIB.addImm(Format);
5947
5948   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
5949      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5950      .addMemOperand(MMO);
5951
5952   MI.eraseFromParent();
5953   return true;
5954 }
5955
5956 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5957                             Register VIndex, Register VOffset, Register SOffset,
5958                             unsigned ImmOffset, unsigned Format,
5959                             unsigned AuxiliaryData, MachineMemOperand *MMO,
5960                             bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5961   auto MIB = B.buildInstr(Opc)
5962                  .addDef(LoadDstReg) // vdata
5963                  .addUse(RSrc)       // rsrc
5964                  .addUse(VIndex)     // vindex
5965                  .addUse(VOffset)    // voffset
5966                  .addUse(SOffset)    // soffset
5967                  .addImm(ImmOffset); // offset(imm)
5968
5969   if (IsTyped)
5970     MIB.addImm(Format);
5971
5972   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
5973       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5974       .addMemOperand(MMO);
5975 }
5976
5977 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5978                                              LegalizerHelper &Helper,
5979                                              bool IsFormat,
5980                                              bool IsTyped) const {
5981   MachineIRBuilder &B = Helper.MIRBuilder;
5982   MachineRegisterInfo &MRI = *B.getMRI();
5983   GISelChangeObserver &Observer = Helper.Observer;
5984
5985   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5986   MachineMemOperand *MMO = *MI.memoperands_begin();
5987   const LLT MemTy = MMO->getMemoryType();
5988   const LLT S32 = LLT::scalar(32);
5989
5990   Register Dst = MI.getOperand(0).getReg();
5991
5992   Register StatusDst;
5993   int OpOffset = 0;
5994   assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5995   bool IsTFE = MI.getNumExplicitDefs() == 2;
5996   if (IsTFE) {
5997     StatusDst = MI.getOperand(1).getReg();
5998     ++OpOffset;
5999   }
6000
6001   castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6002   Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6003
6004   // The typed intrinsics add an immediate after the registers.
6005   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6006
6007   // The struct intrinsic variants add one additional operand over raw.
6008   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6009   Register VIndex;
6010   if (HasVIndex) {
6011     VIndex = MI.getOperand(3 + OpOffset).getReg();
6012     ++OpOffset;
6013   } else {
6014     VIndex = B.buildConstant(S32, 0).getReg(0);
6015   }
6016
6017   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6018   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6019
6020   unsigned Format = 0;
6021   if (IsTyped) {
6022     Format = MI.getOperand(5 + OpOffset).getImm();
6023     ++OpOffset;
6024   }
6025
6026   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6027   unsigned ImmOffset;
6028
6029   LLT Ty = MRI.getType(Dst);
6030   // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
6031   // logic doesn't have to handle that case.
6032   if (hasBufferRsrcWorkaround(Ty)) {
6033     Observer.changingInstr(MI);
6034     Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6035     Observer.changedInstr(MI);
6036     Dst = MI.getOperand(0).getReg();
6037     B.setInsertPt(B.getMBB(), MI);
6038   }
6039   if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6040     Ty = getBitcastRegisterType(Ty);
6041     Observer.changingInstr(MI);
6042     Helper.bitcastDst(MI, Ty, 0);
6043     Observer.changedInstr(MI);
6044     Dst = MI.getOperand(0).getReg();
6045     B.setInsertPt(B.getMBB(), MI);
6046   }
6047
6048   LLT EltTy = Ty.getScalarType();
6049   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6050   const bool Unpacked = ST.hasUnpackedD16VMem();
6051
6052   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6053
6054   unsigned Opc;
6055
6056   // TODO: Support TFE for typed and narrow loads.
6057   if (IsTyped) {
6058     if (IsTFE)
6059       return false;
6060     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6061                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6062   } else if (IsFormat) {
6063     if (IsD16) {
6064       if (IsTFE)
6065         return false;
6066       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6067     } else {
6068       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6069                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6070     }
6071   } else {
6072     switch (MemTy.getSizeInBits()) {
6073     case 8:
6074       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6075                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6076       break;
6077     case 16:
6078       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6079                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6080       break;
6081     default:
6082       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6083                   : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6084       break;
6085     }
6086   }
6087
6088   if (IsTFE) {
6089     unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6090     unsigned NumLoadDWords = NumValueDWords + 1;
6091     LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6092     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6093     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6094                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6095     if (MemTy.getSizeInBits() < 32) {
6096       Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6097       B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6098       B.buildTrunc(Dst, ExtDst);
6099     } else if (NumValueDWords == 1) {
6100       B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6101     } else {
6102       SmallVector<Register, 5> LoadElts;
6103       for (unsigned I = 0; I != NumValueDWords; ++I)
6104         LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6105       LoadElts.push_back(StatusDst);
6106       B.buildUnmerge(LoadElts, LoadDstReg);
6107       LoadElts.truncate(NumValueDWords);
6108       B.buildMergeLikeInstr(Dst, LoadElts);
6109     }
6110   } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6111              (IsD16 && !Ty.isVector())) {
6112     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6113     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6114                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6115     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6116     B.buildTrunc(Dst, LoadDstReg);
6117   } else if (Unpacked && IsD16 && Ty.isVector()) {
6118     LLT UnpackedTy = Ty.changeElementSize(32);
6119     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6120     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6121                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6122     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6123     // FIXME: G_TRUNC should work, but legalization currently fails
6124     auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6125     SmallVector<Register, 4> Repack;
6126     for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6127       Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6128     B.buildMergeLikeInstr(Dst, Repack);
6129   } else {
6130     buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6131                     AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6132   }
6133
6134   MI.eraseFromParent();
6135   return true;
6136 }
6137
6138 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6139   switch (IntrID) {
6140   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6141   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6142   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6143   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6144     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6145   case Intrinsic::amdgcn_raw_buffer_atomic_add:
6146   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6147   case Intrinsic::amdgcn_struct_buffer_atomic_add:
6148   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6149     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6150   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6151   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6152   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6153   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6154     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6155   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6156   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6157   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6158   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6159     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6160   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6161   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6162   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6163   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6164     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6165   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6166   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6167   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6168   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6169     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6170   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6171   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6172   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6173   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6174     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6175   case Intrinsic::amdgcn_raw_buffer_atomic_and:
6176   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6177   case Intrinsic::amdgcn_struct_buffer_atomic_and:
6178   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6179     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6180   case Intrinsic::amdgcn_raw_buffer_atomic_or:
6181   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6182   case Intrinsic::amdgcn_struct_buffer_atomic_or:
6183   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6184     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6185   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6186   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6187   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6188   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6189     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6190   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6191   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6192   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6193   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6194     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6195   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6196   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6197   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6198   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6199     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6200   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6201   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6202   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6203   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6204     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6205   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6206   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6207   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6208   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6209     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6210   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6211   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6212   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6213   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6214     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6215   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6216   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6217   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6218   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6219     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6220   case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6221   case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6222     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6223   default:
6224     llvm_unreachable("unhandled atomic opcode");
6225   }
6226 }
6227
6228 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6229                                                MachineIRBuilder &B,
6230                                                Intrinsic::ID IID) const {
6231   const bool IsCmpSwap =
6232       IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6233       IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6234       IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6235       IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6236
6237   Register Dst = MI.getOperand(0).getReg();
6238   // Since we don't have 128-bit atomics, we don't need to handle the case of
6239   // p8 argmunents to the atomic itself
6240   Register VData = MI.getOperand(2).getReg();
6241
6242   Register CmpVal;
6243   int OpOffset = 0;
6244
6245   if (IsCmpSwap) {
6246     CmpVal = MI.getOperand(3).getReg();
6247     ++OpOffset;
6248   }
6249
6250   castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6251   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6252   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6253
6254   // The struct intrinsic variants add one additional operand over raw.
6255   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6256   Register VIndex;
6257   if (HasVIndex) {
6258     VIndex = MI.getOperand(4 + OpOffset).getReg();
6259     ++OpOffset;
6260   } else {
6261     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6262   }
6263
6264   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6265   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6266   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6267
6268   MachineMemOperand *MMO = *MI.memoperands_begin();
6269
6270   unsigned ImmOffset;
6271   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6272
6273   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6274       .addDef(Dst)
6275       .addUse(VData); // vdata
6276
6277   if (IsCmpSwap)
6278     MIB.addReg(CmpVal);
6279
6280   MIB.addUse(RSrc)               // rsrc
6281      .addUse(VIndex)             // vindex
6282      .addUse(VOffset)            // voffset
6283      .addUse(SOffset)            // soffset
6284      .addImm(ImmOffset)          // offset(imm)
6285      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
6286      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6287      .addMemOperand(MMO);
6288
6289   MI.eraseFromParent();
6290   return true;
6291 }
6292
6293 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6294 /// vector with s16 typed elements.
6295 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6296                                       SmallVectorImpl<Register> &PackedAddrs,
6297                                       unsigned ArgOffset,
6298                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
6299                                       bool IsA16, bool IsG16) {
6300   const LLT S16 = LLT::scalar(16);
6301   const LLT V2S16 = LLT::fixed_vector(2, 16);
6302   auto EndIdx = Intr->VAddrEnd;
6303
6304   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6305     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6306     if (!SrcOp.isReg())
6307       continue; // _L to _LZ may have eliminated this.
6308
6309     Register AddrReg = SrcOp.getReg();
6310
6311     if ((I < Intr->GradientStart) ||
6312         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6313         (I >= Intr->CoordStart && !IsA16)) {
6314       if ((I < Intr->GradientStart) && IsA16 &&
6315           (B.getMRI()->getType(AddrReg) == S16)) {
6316         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6317         // Special handling of bias when A16 is on. Bias is of type half but
6318         // occupies full 32-bit.
6319         PackedAddrs.push_back(
6320             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6321                 .getReg(0));
6322       } else {
6323         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6324                "Bias needs to be converted to 16 bit in A16 mode");
6325         // Handle any gradient or coordinate operands that should not be packed
6326         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6327         PackedAddrs.push_back(AddrReg);
6328       }
6329     } else {
6330       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6331       // derivatives dx/dh and dx/dv are packed with undef.
6332       if (((I + 1) >= EndIdx) ||
6333           ((Intr->NumGradients / 2) % 2 == 1 &&
6334            (I == static_cast<unsigned>(Intr->GradientStart +
6335                                        (Intr->NumGradients / 2) - 1) ||
6336             I == static_cast<unsigned>(Intr->GradientStart +
6337                                        Intr->NumGradients - 1))) ||
6338           // Check for _L to _LZ optimization
6339           !MI.getOperand(ArgOffset + I + 1).isReg()) {
6340         PackedAddrs.push_back(
6341             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6342                 .getReg(0));
6343       } else {
6344         PackedAddrs.push_back(
6345             B.buildBuildVector(
6346                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6347                 .getReg(0));
6348         ++I;
6349       }
6350     }
6351   }
6352 }
6353
6354 /// Convert from separate vaddr components to a single vector address register,
6355 /// and replace the remaining operands with $noreg.
6356 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6357                                      int DimIdx, int NumVAddrs) {
6358   const LLT S32 = LLT::scalar(32);
6359   (void)S32;
6360   SmallVector<Register, 8> AddrRegs;
6361   for (int I = 0; I != NumVAddrs; ++I) {
6362     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6363     if (SrcOp.isReg()) {
6364       AddrRegs.push_back(SrcOp.getReg());
6365       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6366     }
6367   }
6368
6369   int NumAddrRegs = AddrRegs.size();
6370   if (NumAddrRegs != 1) {
6371     auto VAddr =
6372         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6373     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6374   }
6375
6376   for (int I = 1; I != NumVAddrs; ++I) {
6377     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6378     if (SrcOp.isReg())
6379       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6380   }
6381 }
6382
6383 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
6384 ///
6385 /// Depending on the subtarget, load/store with 16-bit element data need to be
6386 /// rewritten to use the low half of 32-bit registers, or directly use a packed
6387 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
6388 /// registers.
6389 ///
6390 /// We don't want to directly select image instructions just yet, but also want
6391 /// to exposes all register repacking to the legalizer/combiners. We also don't
6392 /// want a selected instruction entering RegBankSelect. In order to avoid
6393 /// defining a multitude of intermediate image instructions, directly hack on
6394 /// the intrinsic's arguments. In cases like a16 addresses, this requires
6395 /// padding now unnecessary arguments with $noreg.
6396 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6397     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6398     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6399
6400   const MachineFunction &MF = *MI.getMF();
6401   const unsigned NumDefs = MI.getNumExplicitDefs();
6402   const unsigned ArgOffset = NumDefs + 1;
6403   bool IsTFE = NumDefs == 2;
6404   // We are only processing the operands of d16 image operations on subtargets
6405   // that use the unpacked register layout, or need to repack the TFE result.
6406
6407   // TODO: Do we need to guard against already legalized intrinsics?
6408   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6409       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6410
6411   MachineRegisterInfo *MRI = B.getMRI();
6412   const LLT S32 = LLT::scalar(32);
6413   const LLT S16 = LLT::scalar(16);
6414   const LLT V2S16 = LLT::fixed_vector(2, 16);
6415
6416   unsigned DMask = 0;
6417   Register VData;
6418   LLT Ty;
6419
6420   if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6421     VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6422     Ty = MRI->getType(VData);
6423   }
6424
6425   const bool IsAtomicPacked16Bit =
6426       (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6427        BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6428
6429   // Check for 16 bit addresses and pack if true.
6430   LLT GradTy =
6431       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6432   LLT AddrTy =
6433       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6434   const bool IsG16 =
6435       ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6436   const bool IsA16 = AddrTy == S16;
6437   const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6438
6439   int DMaskLanes = 0;
6440   if (!BaseOpcode->Atomic) {
6441     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6442     if (BaseOpcode->Gather4) {
6443       DMaskLanes = 4;
6444     } else if (DMask != 0) {
6445       DMaskLanes = llvm::popcount(DMask);
6446     } else if (!IsTFE && !BaseOpcode->Store) {
6447       // If dmask is 0, this is a no-op load. This can be eliminated.
6448       B.buildUndef(MI.getOperand(0));
6449       MI.eraseFromParent();
6450       return true;
6451     }
6452   }
6453
6454   Observer.changingInstr(MI);
6455   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6456
6457   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6458                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6459   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6460                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6461   unsigned NewOpcode = LoadOpcode;
6462   if (BaseOpcode->Store)
6463     NewOpcode = StoreOpcode;
6464   else if (BaseOpcode->NoReturn)
6465     NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6466
6467   // Track that we legalized this
6468   MI.setDesc(B.getTII().get(NewOpcode));
6469
6470   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
6471   // dmask to be at least 1 otherwise the instruction will fail
6472   if (IsTFE && DMask == 0) {
6473     DMask = 0x1;
6474     DMaskLanes = 1;
6475     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6476   }
6477
6478   if (BaseOpcode->Atomic) {
6479     Register VData0 = MI.getOperand(2).getReg();
6480     LLT Ty = MRI->getType(VData0);
6481
6482     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6483     if (Ty.isVector() && !IsAtomicPacked16Bit)
6484       return false;
6485
6486     if (BaseOpcode->AtomicX2) {
6487       Register VData1 = MI.getOperand(3).getReg();
6488       // The two values are packed in one register.
6489       LLT PackedTy = LLT::fixed_vector(2, Ty);
6490       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6491       MI.getOperand(2).setReg(Concat.getReg(0));
6492       MI.getOperand(3).setReg(AMDGPU::NoRegister);
6493     }
6494   }
6495
6496   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6497
6498   // Rewrite the addressing register layout before doing anything else.
6499   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6500     // 16 bit gradients are supported, but are tied to the A16 control
6501     // so both gradients and addresses must be 16 bit
6502     return false;
6503   }
6504
6505   if (IsA16 && !ST.hasA16()) {
6506     // A16 not supported
6507     return false;
6508   }
6509
6510   const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6511   const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6512
6513   if (IsA16 || IsG16) {
6514     // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6515     // instructions expect VGPR_32
6516     SmallVector<Register, 4> PackedRegs;
6517
6518     packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6519
6520     // See also below in the non-a16 branch
6521     const bool UseNSA = ST.hasNSAEncoding() &&
6522                         PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6523                         (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6524     const bool UsePartialNSA =
6525         UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6526
6527     if (UsePartialNSA) {
6528       // Pack registers that would go over NSAMaxSize into last VAddr register
6529       LLT PackedAddrTy =
6530           LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6531       auto Concat = B.buildConcatVectors(
6532           PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6533       PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6534       PackedRegs.resize(NSAMaxSize);
6535     } else if (!UseNSA && PackedRegs.size() > 1) {
6536       LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6537       auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6538       PackedRegs[0] = Concat.getReg(0);
6539       PackedRegs.resize(1);
6540     }
6541
6542     const unsigned NumPacked = PackedRegs.size();
6543     for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6544       MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6545       if (!SrcOp.isReg()) {
6546         assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6547         continue;
6548       }
6549
6550       assert(SrcOp.getReg() != AMDGPU::NoRegister);
6551
6552       if (I - Intr->VAddrStart < NumPacked)
6553         SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6554       else
6555         SrcOp.setReg(AMDGPU::NoRegister);
6556     }
6557   } else {
6558     // If the register allocator cannot place the address registers contiguously
6559     // without introducing moves, then using the non-sequential address encoding
6560     // is always preferable, since it saves VALU instructions and is usually a
6561     // wash in terms of code size or even better.
6562     //
6563     // However, we currently have no way of hinting to the register allocator
6564     // that MIMG addresses should be placed contiguously when it is possible to
6565     // do so, so force non-NSA for the common 2-address case as a heuristic.
6566     //
6567     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6568     // allocation when possible.
6569     //
6570     // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6571     // set of the remaining addresses.
6572     const bool UseNSA = ST.hasNSAEncoding() &&
6573                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6574                         (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6575     const bool UsePartialNSA =
6576         UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6577
6578     if (UsePartialNSA) {
6579       convertImageAddrToPacked(B, MI,
6580                                ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6581                                Intr->NumVAddrs - NSAMaxSize + 1);
6582     } else if (!UseNSA && Intr->NumVAddrs > 1) {
6583       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6584                                Intr->NumVAddrs);
6585     }
6586   }
6587
6588   int Flags = 0;
6589   if (IsA16)
6590     Flags |= 1;
6591   if (IsG16)
6592     Flags |= 2;
6593   MI.addOperand(MachineOperand::CreateImm(Flags));
6594
6595   if (BaseOpcode->NoReturn) { // No TFE for stores?
6596     // TODO: Handle dmask trim
6597     if (!Ty.isVector() || !IsD16)
6598       return true;
6599
6600     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6601     if (RepackedReg != VData) {
6602       MI.getOperand(1).setReg(RepackedReg);
6603     }
6604
6605     return true;
6606   }
6607
6608   Register DstReg = MI.getOperand(0).getReg();
6609   const LLT EltTy = Ty.getScalarType();
6610   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6611
6612   // Confirm that the return type is large enough for the dmask specified
6613   if (NumElts < DMaskLanes)
6614     return false;
6615
6616   if (NumElts > 4 || DMaskLanes > 4)
6617     return false;
6618
6619   // Image atomic instructions are using DMask to specify how many bits
6620   // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
6621   // DMaskLanes for image atomic has default value '0'.
6622   // We must be sure that atomic variants (especially packed) will not be
6623   // truncated from v2s16 or v4s16 to s16 type.
6624   //
6625   // ChangeElementCount will be needed for image load where Ty is always scalar.
6626   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6627   const LLT AdjustedTy =
6628       DMaskLanes == 0
6629           ? Ty
6630           : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6631
6632   // The raw dword aligned data component of the load. The only legal cases
6633   // where this matters should be when using the packed D16 format, for
6634   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
6635   LLT RoundedTy;
6636
6637   // S32 vector to cover all data, plus TFE result element.
6638   LLT TFETy;
6639
6640   // Register type to use for each loaded component. Will be S32 or V2S16.
6641   LLT RegTy;
6642
6643   if (IsD16 && ST.hasUnpackedD16VMem()) {
6644     RoundedTy =
6645         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6646     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6647     RegTy = S32;
6648   } else {
6649     unsigned EltSize = EltTy.getSizeInBits();
6650     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6651     unsigned RoundedSize = 32 * RoundedElts;
6652     RoundedTy = LLT::scalarOrVector(
6653         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6654     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6655     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6656   }
6657
6658   // The return type does not need adjustment.
6659   // TODO: Should we change s16 case to s32 or <2 x s16>?
6660   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6661     return true;
6662
6663   Register Dst1Reg;
6664
6665   // Insert after the instruction.
6666   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6667
6668   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6669   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6670   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6671   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6672
6673   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6674
6675   MI.getOperand(0).setReg(NewResultReg);
6676
6677   // In the IR, TFE is supposed to be used with a 2 element struct return
6678   // type. The instruction really returns these two values in one contiguous
6679   // register, with one additional dword beyond the loaded data. Rewrite the
6680   // return type to use a single register result.
6681
6682   if (IsTFE) {
6683     Dst1Reg = MI.getOperand(1).getReg();
6684     if (MRI->getType(Dst1Reg) != S32)
6685       return false;
6686
6687     // TODO: Make sure the TFE operand bit is set.
6688     MI.removeOperand(1);
6689
6690     // Handle the easy case that requires no repack instructions.
6691     if (Ty == S32) {
6692       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6693       return true;
6694     }
6695   }
6696
6697   // Now figure out how to copy the new result register back into the old
6698   // result.
6699   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6700
6701   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
6702
6703   if (ResultNumRegs == 1) {
6704     assert(!IsTFE);
6705     ResultRegs[0] = NewResultReg;
6706   } else {
6707     // We have to repack into a new vector of some kind.
6708     for (int I = 0; I != NumDataRegs; ++I)
6709       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6710     B.buildUnmerge(ResultRegs, NewResultReg);
6711
6712     // Drop the final TFE element to get the data part. The TFE result is
6713     // directly written to the right place already.
6714     if (IsTFE)
6715       ResultRegs.resize(NumDataRegs);
6716   }
6717
6718   // For an s16 scalar result, we form an s32 result with a truncate regardless
6719   // of packed vs. unpacked.
6720   if (IsD16 && !Ty.isVector()) {
6721     B.buildTrunc(DstReg, ResultRegs[0]);
6722     return true;
6723   }
6724
6725   // Avoid a build/concat_vector of 1 entry.
6726   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6727     B.buildBitcast(DstReg, ResultRegs[0]);
6728     return true;
6729   }
6730
6731   assert(Ty.isVector());
6732
6733   if (IsD16) {
6734     // For packed D16 results with TFE enabled, all the data components are
6735     // S32. Cast back to the expected type.
6736     //
6737     // TODO: We don't really need to use load s32 elements. We would only need one
6738     // cast for the TFE result if a multiple of v2s16 was used.
6739     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6740       for (Register &Reg : ResultRegs)
6741         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6742     } else if (ST.hasUnpackedD16VMem()) {
6743       for (Register &Reg : ResultRegs)
6744         Reg = B.buildTrunc(S16, Reg).getReg(0);
6745     }
6746   }
6747
6748   auto padWithUndef = [&](LLT Ty, int NumElts) {
6749     if (NumElts == 0)
6750       return;
6751     Register Undef = B.buildUndef(Ty).getReg(0);
6752     for (int I = 0; I != NumElts; ++I)
6753       ResultRegs.push_back(Undef);
6754   };
6755
6756   // Pad out any elements eliminated due to the dmask.
6757   LLT ResTy = MRI->getType(ResultRegs[0]);
6758   if (!ResTy.isVector()) {
6759     padWithUndef(ResTy, NumElts - ResultRegs.size());
6760     B.buildBuildVector(DstReg, ResultRegs);
6761     return true;
6762   }
6763
6764   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6765   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6766
6767   // Deal with the one annoying legal case.
6768   const LLT V3S16 = LLT::fixed_vector(3, 16);
6769   if (Ty == V3S16) {
6770     if (IsTFE) {
6771       if (ResultRegs.size() == 1) {
6772         NewResultReg = ResultRegs[0];
6773       } else if (ResultRegs.size() == 2) {
6774         LLT V4S16 = LLT::fixed_vector(4, 16);
6775         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6776       } else {
6777         return false;
6778       }
6779     }
6780
6781     if (MRI->getType(DstReg).getNumElements() <
6782         MRI->getType(NewResultReg).getNumElements()) {
6783       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6784     } else {
6785       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6786     }
6787     return true;
6788   }
6789
6790   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6791   B.buildConcatVectors(DstReg, ResultRegs);
6792   return true;
6793 }
6794
6795 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6796                                               MachineInstr &MI) const {
6797   MachineIRBuilder &B = Helper.MIRBuilder;
6798   GISelChangeObserver &Observer = Helper.Observer;
6799
6800   Register OrigDst = MI.getOperand(0).getReg();
6801   Register Dst;
6802   LLT Ty = B.getMRI()->getType(OrigDst);
6803   unsigned Size = Ty.getSizeInBits();
6804   MachineFunction &MF = B.getMF();
6805   unsigned Opc = 0;
6806   if (Size < 32 && ST.hasScalarSubwordLoads()) {
6807     assert(Size == 8 || Size == 16);
6808     Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6809                     : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6810     // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
6811     // destination register.
6812     Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6813   } else {
6814     Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6815     Dst = OrigDst;
6816   }
6817
6818   Observer.changingInstr(MI);
6819
6820   // Handle needing to s.buffer.load() a p8 value.
6821   if (hasBufferRsrcWorkaround(Ty)) {
6822     Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6823     B.setInsertPt(B.getMBB(), MI);
6824   }
6825   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6826     Ty = getBitcastRegisterType(Ty);
6827     Helper.bitcastDst(MI, Ty, 0);
6828     B.setInsertPt(B.getMBB(), MI);
6829   }
6830
6831   // FIXME: We don't really need this intermediate instruction. The intrinsic
6832   // should be fixed to have a memory operand. Since it's readnone, we're not
6833   // allowed to add one.
6834   MI.setDesc(B.getTII().get(Opc));
6835   MI.removeOperand(1); // Remove intrinsic ID
6836
6837   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6838   const unsigned MemSize = (Size + 7) / 8;
6839   const Align MemAlign = B.getDataLayout().getABITypeAlign(
6840       getTypeForLLT(Ty, MF.getFunction().getContext()));
6841   MachineMemOperand *MMO = MF.getMachineMemOperand(
6842       MachinePointerInfo(),
6843       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6844           MachineMemOperand::MOInvariant,
6845       MemSize, MemAlign);
6846   MI.addMemOperand(MF, MMO);
6847   if (Dst != OrigDst) {
6848     MI.getOperand(0).setReg(Dst);
6849     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6850     B.buildTrunc(OrigDst, Dst);
6851   }
6852
6853   // If we don't have 96-bit result scalar loads, widening to 128-bit should
6854   // always be legal. We may need to restore this to a 96-bit result if it turns
6855   // out this needs to be converted to a vector load during RegBankSelect.
6856   if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6857     if (Ty.isVector())
6858       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
6859     else
6860       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6861   }
6862
6863   Observer.changedInstr(MI);
6864   return true;
6865 }
6866
6867 bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
6868                                                   MachineInstr &MI) const {
6869   MachineIRBuilder &B = Helper.MIRBuilder;
6870   GISelChangeObserver &Observer = Helper.Observer;
6871   Observer.changingInstr(MI);
6872   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
6873   MI.removeOperand(0); // Remove intrinsic ID
6874   castBufferRsrcArgToV4I32(MI, B, 0);
6875   Observer.changedInstr(MI);
6876   return true;
6877 }
6878
6879 // TODO: Move to selection
6880 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6881                                        MachineRegisterInfo &MRI,
6882                                        MachineIRBuilder &B) const {
6883   if (!ST.isTrapHandlerEnabled() ||
6884       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6885     return legalizeTrapEndpgm(MI, MRI, B);
6886
6887   return ST.supportsGetDoorbellID() ?
6888          legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6889 }
6890
6891 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6892     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6893   const DebugLoc &DL = MI.getDebugLoc();
6894   MachineBasicBlock &BB = B.getMBB();
6895   MachineFunction *MF = BB.getParent();
6896
6897   if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6898     BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6899       .addImm(0);
6900     MI.eraseFromParent();
6901     return true;
6902   }
6903
6904   // We need a block split to make the real endpgm a terminator. We also don't
6905   // want to break phis in successor blocks, so we can't just delete to the
6906   // end of the block.
6907   BB.splitAt(MI, false /*UpdateLiveIns*/);
6908   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6909   MF->push_back(TrapBB);
6910   BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6911     .addImm(0);
6912   BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6913     .addMBB(TrapBB);
6914
6915   BB.addSuccessor(TrapBB);
6916   MI.eraseFromParent();
6917   return true;
6918 }
6919
6920 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6921     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6922   MachineFunction &MF = B.getMF();
6923   const LLT S64 = LLT::scalar(64);
6924
6925   Register SGPR01(AMDGPU::SGPR0_SGPR1);
6926   // For code object version 5, queue_ptr is passed through implicit kernarg.
6927   if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
6928       AMDGPU::AMDHSA_COV5) {
6929     AMDGPUTargetLowering::ImplicitParameter Param =
6930         AMDGPUTargetLowering::QUEUE_PTR;
6931     uint64_t Offset =
6932         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6933
6934     Register KernargPtrReg = MRI.createGenericVirtualRegister(
6935         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6936
6937     if (!loadInputValue(KernargPtrReg, B,
6938                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6939       return false;
6940
6941     // TODO: can we be smarter about machine pointer info?
6942     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6943     MachineMemOperand *MMO = MF.getMachineMemOperand(
6944         PtrInfo,
6945         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6946             MachineMemOperand::MOInvariant,
6947         LLT::scalar(64), commonAlignment(Align(64), Offset));
6948
6949     // Pointer address
6950     Register LoadAddr = MRI.createGenericVirtualRegister(
6951         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6952     B.buildPtrAdd(LoadAddr, KernargPtrReg,
6953                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6954     // Load address
6955     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6956     B.buildCopy(SGPR01, Temp);
6957     B.buildInstr(AMDGPU::S_TRAP)
6958         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6959         .addReg(SGPR01, RegState::Implicit);
6960     MI.eraseFromParent();
6961     return true;
6962   }
6963
6964   // Pass queue pointer to trap handler as input, and insert trap instruction
6965   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6966   Register LiveIn =
6967     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6968   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6969     return false;
6970
6971   B.buildCopy(SGPR01, LiveIn);
6972   B.buildInstr(AMDGPU::S_TRAP)
6973       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6974       .addReg(SGPR01, RegState::Implicit);
6975
6976   MI.eraseFromParent();
6977   return true;
6978 }
6979
6980 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6981                                           MachineRegisterInfo &MRI,
6982                                           MachineIRBuilder &B) const {
6983   // We need to simulate the 's_trap 2' instruction on targets that run in
6984   // PRIV=1 (where it is treated as a nop).
6985   if (ST.hasPrivEnabledTrap2NopBug()) {
6986     ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6987                                            MI.getDebugLoc());
6988     MI.eraseFromParent();
6989     return true;
6990   }
6991
6992   B.buildInstr(AMDGPU::S_TRAP)
6993       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6994   MI.eraseFromParent();
6995   return true;
6996 }
6997
6998 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
6999                                             MachineRegisterInfo &MRI,
7000                                             MachineIRBuilder &B) const {
7001   // Is non-HSA path or trap-handler disabled? Then, report a warning
7002   // accordingly
7003   if (!ST.isTrapHandlerEnabled() ||
7004       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7005     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
7006                                      "debugtrap handler not supported",
7007                                      MI.getDebugLoc(), DS_Warning);
7008     LLVMContext &Ctx = B.getMF().getFunction().getContext();
7009     Ctx.diagnose(NoTrap);
7010   } else {
7011     // Insert debug-trap instruction
7012     B.buildInstr(AMDGPU::S_TRAP)
7013         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7014   }
7015
7016   MI.eraseFromParent();
7017   return true;
7018 }
7019
7020 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
7021                                                MachineIRBuilder &B) const {
7022   MachineRegisterInfo &MRI = *B.getMRI();
7023   const LLT S16 = LLT::scalar(16);
7024   const LLT S32 = LLT::scalar(32);
7025   const LLT V2S16 = LLT::fixed_vector(2, 16);
7026   const LLT V3S32 = LLT::fixed_vector(3, 32);
7027
7028   Register DstReg = MI.getOperand(0).getReg();
7029   Register NodePtr = MI.getOperand(2).getReg();
7030   Register RayExtent = MI.getOperand(3).getReg();
7031   Register RayOrigin = MI.getOperand(4).getReg();
7032   Register RayDir = MI.getOperand(5).getReg();
7033   Register RayInvDir = MI.getOperand(6).getReg();
7034   Register TDescr = MI.getOperand(7).getReg();
7035
7036   if (!ST.hasGFX10_AEncoding()) {
7037     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
7038                                         "intrinsic not supported on subtarget",
7039                                         MI.getDebugLoc());
7040     B.getMF().getFunction().getContext().diagnose(BadIntrin);
7041     return false;
7042   }
7043
7044   const bool IsGFX11 = AMDGPU::isGFX11(ST);
7045   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7046   const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7047   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7048   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7049   const unsigned NumVDataDwords = 4;
7050   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7051   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7052   const bool UseNSA =
7053       IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7054
7055   const unsigned BaseOpcodes[2][2] = {
7056       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7057       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7058        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7059   int Opcode;
7060   if (UseNSA) {
7061     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7062                                    IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7063                                    : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
7064                                                : AMDGPU::MIMGEncGfx10NSA,
7065                                    NumVDataDwords, NumVAddrDwords);
7066   } else {
7067     assert(!IsGFX12Plus);
7068     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7069                                    IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7070                                            : AMDGPU::MIMGEncGfx10Default,
7071                                    NumVDataDwords, NumVAddrDwords);
7072   }
7073   assert(Opcode != -1);
7074
7075   SmallVector<Register, 12> Ops;
7076   if (UseNSA && IsGFX11Plus) {
7077     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7078       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7079       auto Merged = B.buildMergeLikeInstr(
7080           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7081       Ops.push_back(Merged.getReg(0));
7082     };
7083
7084     Ops.push_back(NodePtr);
7085     Ops.push_back(RayExtent);
7086     packLanes(RayOrigin);
7087
7088     if (IsA16) {
7089       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7090       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7091       auto MergedDir = B.buildMergeLikeInstr(
7092           V3S32,
7093           {B.buildBitcast(
7094                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7095                                                    UnmergeRayDir.getReg(0)}))
7096                .getReg(0),
7097            B.buildBitcast(
7098                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7099                                                    UnmergeRayDir.getReg(1)}))
7100                .getReg(0),
7101            B.buildBitcast(
7102                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7103                                                    UnmergeRayDir.getReg(2)}))
7104                .getReg(0)});
7105       Ops.push_back(MergedDir.getReg(0));
7106     } else {
7107       packLanes(RayDir);
7108       packLanes(RayInvDir);
7109     }
7110   } else {
7111     if (Is64) {
7112       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7113       Ops.push_back(Unmerge.getReg(0));
7114       Ops.push_back(Unmerge.getReg(1));
7115     } else {
7116       Ops.push_back(NodePtr);
7117     }
7118     Ops.push_back(RayExtent);
7119
7120     auto packLanes = [&Ops, &S32, &B](Register Src) {
7121       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7122       Ops.push_back(Unmerge.getReg(0));
7123       Ops.push_back(Unmerge.getReg(1));
7124       Ops.push_back(Unmerge.getReg(2));
7125     };
7126
7127     packLanes(RayOrigin);
7128     if (IsA16) {
7129       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7130       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7131       Register R1 = MRI.createGenericVirtualRegister(S32);
7132       Register R2 = MRI.createGenericVirtualRegister(S32);
7133       Register R3 = MRI.createGenericVirtualRegister(S32);
7134       B.buildMergeLikeInstr(R1,
7135                             {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7136       B.buildMergeLikeInstr(
7137           R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7138       B.buildMergeLikeInstr(
7139           R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7140       Ops.push_back(R1);
7141       Ops.push_back(R2);
7142       Ops.push_back(R3);
7143     } else {
7144       packLanes(RayDir);
7145       packLanes(RayInvDir);
7146     }
7147   }
7148
7149   if (!UseNSA) {
7150     // Build a single vector containing all the operands so far prepared.
7151     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7152     Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7153     Ops.clear();
7154     Ops.push_back(MergedOps);
7155   }
7156
7157   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
7158     .addDef(DstReg)
7159     .addImm(Opcode);
7160
7161   for (Register R : Ops) {
7162     MIB.addUse(R);
7163   }
7164
7165   MIB.addUse(TDescr)
7166      .addImm(IsA16 ? 1 : 0)
7167      .cloneMemRefs(MI);
7168
7169   MI.eraseFromParent();
7170   return true;
7171 }
7172
7173 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7174                                             MachineIRBuilder &B) const {
7175   const SITargetLowering *TLI = ST.getTargetLowering();
7176   Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7177   Register DstReg = MI.getOperand(0).getReg();
7178   B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7179   MI.eraseFromParent();
7180   return true;
7181 }
7182
7183 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7184                                          MachineIRBuilder &B) const {
7185   // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7186   if (!ST.hasArchitectedSGPRs())
7187     return false;
7188   LLT S32 = LLT::scalar(32);
7189   Register DstReg = MI.getOperand(0).getReg();
7190   auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7191   auto LSB = B.buildConstant(S32, 25);
7192   auto Width = B.buildConstant(S32, 5);
7193   B.buildUbfx(DstReg, TTMP8, LSB, Width);
7194   MI.eraseFromParent();
7195   return true;
7196 }
7197
7198 static constexpr unsigned FPEnvModeBitField =
7199     AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
7200
7201 static constexpr unsigned FPEnvTrapBitField =
7202     AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
7203
7204 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7205                                            MachineRegisterInfo &MRI,
7206                                            MachineIRBuilder &B) const {
7207   Register Src = MI.getOperand(0).getReg();
7208   if (MRI.getType(Src) != S64)
7209     return false;
7210
7211   auto ModeReg =
7212       B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7213                        /*HasSideEffects=*/true, /*isConvergent=*/false)
7214           .addImm(FPEnvModeBitField);
7215   auto TrapReg =
7216       B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7217                        /*HasSideEffects=*/true, /*isConvergent=*/false)
7218           .addImm(FPEnvTrapBitField);
7219   B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7220   MI.eraseFromParent();
7221   return true;
7222 }
7223
7224 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7225                                            MachineRegisterInfo &MRI,
7226                                            MachineIRBuilder &B) const {
7227   Register Src = MI.getOperand(0).getReg();
7228   if (MRI.getType(Src) != S64)
7229     return false;
7230
7231   auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7232   B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7233                    /*HasSideEffects=*/true, /*isConvergent=*/false)
7234       .addImm(static_cast<int16_t>(FPEnvModeBitField))
7235       .addReg(Unmerge.getReg(0));
7236   B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7237                    /*HasSideEffects=*/true, /*isConvergent=*/false)
7238       .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7239       .addReg(Unmerge.getReg(1));
7240   MI.eraseFromParent();
7241   return true;
7242 }
7243
7244 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7245                                             MachineInstr &MI) const {
7246   MachineIRBuilder &B = Helper.MIRBuilder;
7247   MachineRegisterInfo &MRI = *B.getMRI();
7248
7249   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
7250   auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7251   switch (IntrID) {
7252   case Intrinsic::amdgcn_if:
7253   case Intrinsic::amdgcn_else: {
7254     MachineInstr *Br = nullptr;
7255     MachineBasicBlock *UncondBrTarget = nullptr;
7256     bool Negated = false;
7257     if (MachineInstr *BrCond =
7258             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7259       const SIRegisterInfo *TRI
7260         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7261
7262       Register Def = MI.getOperand(1).getReg();
7263       Register Use = MI.getOperand(3).getReg();
7264
7265       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7266
7267       if (Negated)
7268         std::swap(CondBrTarget, UncondBrTarget);
7269
7270       B.setInsertPt(B.getMBB(), BrCond->getIterator());
7271       if (IntrID == Intrinsic::amdgcn_if) {
7272         B.buildInstr(AMDGPU::SI_IF)
7273           .addDef(Def)
7274           .addUse(Use)
7275           .addMBB(UncondBrTarget);
7276       } else {
7277         B.buildInstr(AMDGPU::SI_ELSE)
7278             .addDef(Def)
7279             .addUse(Use)
7280             .addMBB(UncondBrTarget);
7281       }
7282
7283       if (Br) {
7284         Br->getOperand(0).setMBB(CondBrTarget);
7285       } else {
7286         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7287         // since we're swapping branch targets it needs to be reinserted.
7288         // FIXME: IRTranslator should probably not do this
7289         B.buildBr(*CondBrTarget);
7290       }
7291
7292       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7293       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7294       MI.eraseFromParent();
7295       BrCond->eraseFromParent();
7296       return true;
7297     }
7298
7299     return false;
7300   }
7301   case Intrinsic::amdgcn_loop: {
7302     MachineInstr *Br = nullptr;
7303     MachineBasicBlock *UncondBrTarget = nullptr;
7304     bool Negated = false;
7305     if (MachineInstr *BrCond =
7306             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7307       const SIRegisterInfo *TRI
7308         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7309
7310       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7311       Register Reg = MI.getOperand(2).getReg();
7312
7313       if (Negated)
7314         std::swap(CondBrTarget, UncondBrTarget);
7315
7316       B.setInsertPt(B.getMBB(), BrCond->getIterator());
7317       B.buildInstr(AMDGPU::SI_LOOP)
7318         .addUse(Reg)
7319         .addMBB(UncondBrTarget);
7320
7321       if (Br)
7322         Br->getOperand(0).setMBB(CondBrTarget);
7323       else
7324         B.buildBr(*CondBrTarget);
7325
7326       MI.eraseFromParent();
7327       BrCond->eraseFromParent();
7328       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7329       return true;
7330     }
7331
7332     return false;
7333   }
7334   case Intrinsic::amdgcn_addrspacecast_nonnull:
7335     return legalizeAddrSpaceCast(MI, MRI, B);
7336   case Intrinsic::amdgcn_make_buffer_rsrc:
7337     return legalizePointerAsRsrcIntrin(MI, MRI, B);
7338   case Intrinsic::amdgcn_kernarg_segment_ptr:
7339     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7340       // This only makes sense to call in a kernel, so just lower to null.
7341       B.buildConstant(MI.getOperand(0).getReg(), 0);
7342       MI.eraseFromParent();
7343       return true;
7344     }
7345
7346     return legalizePreloadedArgIntrin(
7347       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7348   case Intrinsic::amdgcn_implicitarg_ptr:
7349     return legalizeImplicitArgPtr(MI, MRI, B);
7350   case Intrinsic::amdgcn_workitem_id_x:
7351     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7352                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7353   case Intrinsic::amdgcn_workitem_id_y:
7354     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7355                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7356   case Intrinsic::amdgcn_workitem_id_z:
7357     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7358                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7359   case Intrinsic::amdgcn_workgroup_id_x:
7360     return legalizePreloadedArgIntrin(MI, MRI, B,
7361                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7362   case Intrinsic::amdgcn_workgroup_id_y:
7363     return legalizePreloadedArgIntrin(MI, MRI, B,
7364                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7365   case Intrinsic::amdgcn_workgroup_id_z:
7366     return legalizePreloadedArgIntrin(MI, MRI, B,
7367                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7368   case Intrinsic::amdgcn_wave_id:
7369     return legalizeWaveID(MI, B);
7370   case Intrinsic::amdgcn_lds_kernel_id:
7371     return legalizePreloadedArgIntrin(MI, MRI, B,
7372                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7373   case Intrinsic::amdgcn_dispatch_ptr:
7374     return legalizePreloadedArgIntrin(MI, MRI, B,
7375                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
7376   case Intrinsic::amdgcn_queue_ptr:
7377     return legalizePreloadedArgIntrin(MI, MRI, B,
7378                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
7379   case Intrinsic::amdgcn_implicit_buffer_ptr:
7380     return legalizePreloadedArgIntrin(
7381       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7382   case Intrinsic::amdgcn_dispatch_id:
7383     return legalizePreloadedArgIntrin(MI, MRI, B,
7384                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
7385   case Intrinsic::r600_read_ngroups_x:
7386     // TODO: Emit error for hsa
7387     return legalizeKernargMemParameter(MI, B,
7388                                        SI::KernelInputOffsets::NGROUPS_X);
7389   case Intrinsic::r600_read_ngroups_y:
7390     return legalizeKernargMemParameter(MI, B,
7391                                        SI::KernelInputOffsets::NGROUPS_Y);
7392   case Intrinsic::r600_read_ngroups_z:
7393     return legalizeKernargMemParameter(MI, B,
7394                                        SI::KernelInputOffsets::NGROUPS_Z);
7395   case Intrinsic::r600_read_local_size_x:
7396     // TODO: Could insert G_ASSERT_ZEXT from s16
7397     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7398   case Intrinsic::r600_read_local_size_y:
7399     // TODO: Could insert G_ASSERT_ZEXT from s16
7400     return legalizeKernargMemParameter(MI, B,  SI::KernelInputOffsets::LOCAL_SIZE_Y);
7401     // TODO: Could insert G_ASSERT_ZEXT from s16
7402   case Intrinsic::r600_read_local_size_z:
7403     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7404   case Intrinsic::r600_read_global_size_x:
7405     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7406   case Intrinsic::r600_read_global_size_y:
7407     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7408   case Intrinsic::r600_read_global_size_z:
7409     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7410   case Intrinsic::amdgcn_fdiv_fast:
7411     return legalizeFDIVFastIntrin(MI, MRI, B);
7412   case Intrinsic::amdgcn_is_shared:
7413     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7414   case Intrinsic::amdgcn_is_private:
7415     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7416   case Intrinsic::amdgcn_wavefrontsize: {
7417     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7418     MI.eraseFromParent();
7419     return true;
7420   }
7421   case Intrinsic::amdgcn_s_buffer_load:
7422     return legalizeSBufferLoad(Helper, MI);
7423   case Intrinsic::amdgcn_raw_buffer_store:
7424   case Intrinsic::amdgcn_raw_ptr_buffer_store:
7425   case Intrinsic::amdgcn_struct_buffer_store:
7426   case Intrinsic::amdgcn_struct_ptr_buffer_store:
7427     return legalizeBufferStore(MI, Helper, false, false);
7428   case Intrinsic::amdgcn_raw_buffer_store_format:
7429   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7430   case Intrinsic::amdgcn_struct_buffer_store_format:
7431   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7432     return legalizeBufferStore(MI, Helper, false, true);
7433   case Intrinsic::amdgcn_raw_tbuffer_store:
7434   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7435   case Intrinsic::amdgcn_struct_tbuffer_store:
7436   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7437     return legalizeBufferStore(MI, Helper, true, true);
7438   case Intrinsic::amdgcn_raw_buffer_load:
7439   case Intrinsic::amdgcn_raw_ptr_buffer_load:
7440   case Intrinsic::amdgcn_raw_atomic_buffer_load:
7441   case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7442   case Intrinsic::amdgcn_struct_buffer_load:
7443   case Intrinsic::amdgcn_struct_ptr_buffer_load:
7444   case Intrinsic::amdgcn_struct_atomic_buffer_load:
7445   case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7446     return legalizeBufferLoad(MI, Helper, false, false);
7447   case Intrinsic::amdgcn_raw_buffer_load_format:
7448   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7449   case Intrinsic::amdgcn_struct_buffer_load_format:
7450   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7451     return legalizeBufferLoad(MI, Helper, true, false);
7452   case Intrinsic::amdgcn_raw_tbuffer_load:
7453   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7454   case Intrinsic::amdgcn_struct_tbuffer_load:
7455   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7456     return legalizeBufferLoad(MI, Helper, true, true);
7457   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7458   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7459   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7460   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7461   case Intrinsic::amdgcn_raw_buffer_atomic_add:
7462   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7463   case Intrinsic::amdgcn_struct_buffer_atomic_add:
7464   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7465   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7466   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7467   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7468   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7469   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7470   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7471   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7472   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7473   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7474   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7475   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7476   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7477   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7478   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7479   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7480   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7481   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7482   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7483   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7484   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7485   case Intrinsic::amdgcn_raw_buffer_atomic_and:
7486   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7487   case Intrinsic::amdgcn_struct_buffer_atomic_and:
7488   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7489   case Intrinsic::amdgcn_raw_buffer_atomic_or:
7490   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7491   case Intrinsic::amdgcn_struct_buffer_atomic_or:
7492   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7493   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7494   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7495   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7496   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7497   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7498   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7499   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7500   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7501   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7502   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7503   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7504   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7505   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7506   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7507   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7508   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7509   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7510   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7511   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7512   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7513   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7514   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7515   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7516   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7517   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7518   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7519   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7520   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7521     return legalizeBufferAtomic(MI, B, IntrID);
7522   case Intrinsic::amdgcn_rsq_clamp:
7523     return legalizeRsqClampIntrinsic(MI, MRI, B);
7524   case Intrinsic::amdgcn_image_bvh_intersect_ray:
7525     return legalizeBVHIntrinsic(MI, B);
7526   case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7527   case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7528   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7529   case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7530   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7531   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7532   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7533   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7534     Register Index = MI.getOperand(5).getReg();
7535     LLT S32 = LLT::scalar(32);
7536     if (MRI.getType(Index) != S32)
7537       MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7538     return true;
7539   }
7540   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7541   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7542   case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7543     Register Index = MI.getOperand(7).getReg();
7544     LLT S32 = LLT::scalar(32);
7545     if (MRI.getType(Index) != S32)
7546       MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7547     return true;
7548   }
7549   case Intrinsic::amdgcn_fmed3: {
7550     GISelChangeObserver &Observer = Helper.Observer;
7551
7552     // FIXME: This is to workaround the inability of tablegen match combiners to
7553     // match intrinsics in patterns.
7554     Observer.changingInstr(MI);
7555     MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7556     MI.removeOperand(1);
7557     Observer.changedInstr(MI);
7558     return true;
7559   }
7560   case Intrinsic::amdgcn_readlane:
7561   case Intrinsic::amdgcn_writelane:
7562   case Intrinsic::amdgcn_readfirstlane:
7563   case Intrinsic::amdgcn_permlane16:
7564   case Intrinsic::amdgcn_permlanex16:
7565   case Intrinsic::amdgcn_permlane64:
7566   case Intrinsic::amdgcn_set_inactive:
7567   case Intrinsic::amdgcn_set_inactive_chain_arg:
7568   case Intrinsic::amdgcn_mov_dpp8:
7569   case Intrinsic::amdgcn_update_dpp:
7570     return legalizeLaneOp(Helper, MI, IntrID);
7571   case Intrinsic::amdgcn_s_buffer_prefetch_data:
7572     return legalizeSBufferPrefetch(Helper, MI);
7573   default: {
7574     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7575             AMDGPU::getImageDimIntrinsicInfo(IntrID))
7576       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7577     return true;
7578   }
7579   }
7580
7581   return true;
7582 }