lib/Target/AMDGPU/AMDGPUISelLowering.cpp

   1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 /// \file
  10 /// This is the parent TargetLowering class for hardware code gen
  11 /// targets.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "AMDGPUISelLowering.h"
  16 #include "AMDGPU.h"
  17 #include "AMDGPUCallLowering.h"
  18 #include "AMDGPUFrameLowering.h"
  19 #include "AMDGPURegisterInfo.h"
  20 #include "AMDGPUSubtarget.h"
  21 #include "AMDGPUTargetMachine.h"
  22 #include "Utils/AMDGPUBaseInfo.h"
  23 #include "R600MachineFunctionInfo.h"
  24 #include "SIInstrInfo.h"
  25 #include "SIMachineFunctionInfo.h"
  26 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  27 #include "llvm/CodeGen/Analysis.h"
  28 #include "llvm/CodeGen/CallingConvLower.h"
  29 #include "llvm/CodeGen/MachineFunction.h"
  30 #include "llvm/CodeGen/MachineRegisterInfo.h"
  31 #include "llvm/CodeGen/SelectionDAG.h"
  32 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
  33 #include "llvm/IR/DataLayout.h"
  34 #include "llvm/IR/DiagnosticInfo.h"
  35 #include "llvm/Support/KnownBits.h"
  36 #include "llvm/Support/MathExtras.h"
  37 using namespace llvm;
  38
  39 #include "AMDGPUGenCallingConv.inc"
  40
  41 // Find a larger type to do a load / store of a vector with.
  42 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  43   unsigned StoreSize = VT.getStoreSizeInBits();
  44   if (StoreSize <= 32)
  45     return EVT::getIntegerVT(Ctx, StoreSize);
  46
  47   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  48   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
  49 }
  50
  51 unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  52   EVT VT = Op.getValueType();
  53   KnownBits Known = DAG.computeKnownBits(Op);
  54   return VT.getSizeInBits() - Known.countMinLeadingZeros();
  55 }
  56
  57 unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  58   EVT VT = Op.getValueType();
  59
  60   // In order for this to be a signed 24-bit value, bit 23, must
  61   // be a sign bit.
  62   return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
  63 }
  64
  65 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
  66                                            const AMDGPUSubtarget &STI)
  67     : TargetLowering(TM), Subtarget(&STI) {
  68   // Lower floating point store/load to integer store/load to reduce the number
  69   // of patterns in tablegen.
  70   setOperationAction(ISD::LOAD, MVT::f32, Promote);
  71   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
  72
  73   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  74   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
  75
  76   setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  77   AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
  78
  79   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  80   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
  81
  82   setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
  83   AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
  84
  85   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  86   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
  87
  88   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  89   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
  90
  91   setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
  92   AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
  93
  94   setOperationAction(ISD::LOAD, MVT::i64, Promote);
  95   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
  96
  97   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  98   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
  99
 100   setOperationAction(ISD::LOAD, MVT::f64, Promote);
 101   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
 102
 103   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
 104   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
 105
 106   // There are no 64-bit extloads. These should be done as a 32-bit extload and
 107   // an extension to 64-bit.
 108   for (MVT VT : MVT::integer_valuetypes()) {
 109     setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
 110     setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
 111     setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
 112   }
 113
 114   for (MVT VT : MVT::integer_valuetypes()) {
 115     if (VT == MVT::i64)
 116       continue;
 117
 118     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 119     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
 120     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
 121     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
 122
 123     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
 124     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
 125     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
 126     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
 127
 128     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
 129     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
 130     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
 131     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
 132   }
 133
 134   for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
 135     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
 136     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
 137     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
 138     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
 139     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
 140     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
 141     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
 142     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
 143     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
 144     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
 145     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
 146     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
 147     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
 148     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
 149     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
 150   }
 151
 152   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 153   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
 154   setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
 155   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
 156   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
 157   setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
 158   setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
 159
 160   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
 161   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
 162   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
 163   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
 164
 165   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 166   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
 167   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
 168   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
 169
 170   setOperationAction(ISD::STORE, MVT::f32, Promote);
 171   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
 172
 173   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
 174   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
 175
 176   setOperationAction(ISD::STORE, MVT::v3f32, Promote);
 177   AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
 178
 179   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
 180   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 181
 182   setOperationAction(ISD::STORE, MVT::v5f32, Promote);
 183   AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
 184
 185   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
 186   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
 187
 188   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
 189   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
 190
 191   setOperationAction(ISD::STORE, MVT::v32f32, Promote);
 192   AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
 193
 194   setOperationAction(ISD::STORE, MVT::i64, Promote);
 195   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
 196
 197   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
 198   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
 199
 200   setOperationAction(ISD::STORE, MVT::f64, Promote);
 201   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
 202
 203   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
 204   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
 205
 206   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
 207   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
 208   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 209   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 210
 211   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
 212   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
 213   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
 214   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
 215
 216   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 217   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
 218   setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
 219   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
 220   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
 221   setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
 222   setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
 223
 224   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 225   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 226
 227   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
 228   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
 229
 230   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
 231   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
 232
 233   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
 234   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
 235
 236
 237   setOperationAction(ISD::Constant, MVT::i32, Legal);
 238   setOperationAction(ISD::Constant, MVT::i64, Legal);
 239   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
 240   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
 241
 242   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
 243   setOperationAction(ISD::BRIND, MVT::Other, Expand);
 244
 245   // This is totally unsupported, just custom lower to produce an error.
 246   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
 247
 248   // Library functions.  These default to Expand, but we have instructions
 249   // for them.
 250   setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
 251   setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
 252   setOperationAction(ISD::FPOW,   MVT::f32, Legal);
 253   setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
 254   setOperationAction(ISD::FABS,   MVT::f32, Legal);
 255   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
 256   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
 257   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
 258   setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
 259   setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
 260
 261   setOperationAction(ISD::FROUND, MVT::f32, Custom);
 262   setOperationAction(ISD::FROUND, MVT::f64, Custom);
 263
 264   setOperationAction(ISD::FLOG, MVT::f32, Custom);
 265   setOperationAction(ISD::FLOG10, MVT::f32, Custom);
 266   setOperationAction(ISD::FEXP, MVT::f32, Custom);
 267
 268
 269   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
 270   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
 271
 272   setOperationAction(ISD::FREM, MVT::f32, Custom);
 273   setOperationAction(ISD::FREM, MVT::f64, Custom);
 274
 275   // Expand to fneg + fadd.
 276   setOperationAction(ISD::FSUB, MVT::f64, Expand);
 277
 278   setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
 279   setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
 280   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
 281   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
 282   setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
 283   setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
 284   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
 285   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
 286   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
 287   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
 288   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
 289   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
 290   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
 291   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
 292   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
 293   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
 294   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
 295   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
 296   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
 297   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
 298   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
 299   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
 300
 301   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 302   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
 303   setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
 304
 305   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 306   for (MVT VT : ScalarIntVTs) {
 307     // These should use [SU]DIVREM, so set them to expand
 308     setOperationAction(ISD::SDIV, VT, Expand);
 309     setOperationAction(ISD::UDIV, VT, Expand);
 310     setOperationAction(ISD::SREM, VT, Expand);
 311     setOperationAction(ISD::UREM, VT, Expand);
 312
 313     // GPU does not have divrem function for signed or unsigned.
 314     setOperationAction(ISD::SDIVREM, VT, Custom);
 315     setOperationAction(ISD::UDIVREM, VT, Custom);
 316
 317     // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
 318     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 319     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 320
 321     setOperationAction(ISD::BSWAP, VT, Expand);
 322     setOperationAction(ISD::CTTZ, VT, Expand);
 323     setOperationAction(ISD::CTLZ, VT, Expand);
 324
 325     // AMDGPU uses ADDC/SUBC/ADDE/SUBE
 326     setOperationAction(ISD::ADDC, VT, Legal);
 327     setOperationAction(ISD::SUBC, VT, Legal);
 328     setOperationAction(ISD::ADDE, VT, Legal);
 329     setOperationAction(ISD::SUBE, VT, Legal);
 330   }
 331
 332   // The hardware supports 32-bit ROTR, but not ROTL.
 333   setOperationAction(ISD::ROTL, MVT::i32, Expand);
 334   setOperationAction(ISD::ROTL, MVT::i64, Expand);
 335   setOperationAction(ISD::ROTR, MVT::i64, Expand);
 336
 337   setOperationAction(ISD::MUL, MVT::i64, Expand);
 338   setOperationAction(ISD::MULHU, MVT::i64, Expand);
 339   setOperationAction(ISD::MULHS, MVT::i64, Expand);
 340   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 341   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 342   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
 343   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
 344   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
 345
 346   setOperationAction(ISD::SMIN, MVT::i32, Legal);
 347   setOperationAction(ISD::UMIN, MVT::i32, Legal);
 348   setOperationAction(ISD::SMAX, MVT::i32, Legal);
 349   setOperationAction(ISD::UMAX, MVT::i32, Legal);
 350
 351   setOperationAction(ISD::CTTZ, MVT::i64, Custom);
 352   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
 353   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
 354   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 355
 356   static const MVT::SimpleValueType VectorIntTypes[] = {
 357     MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
 358   };
 359
 360   for (MVT VT : VectorIntTypes) {
 361     // Expand the following operations for the current type by default.
 362     setOperationAction(ISD::ADD,  VT, Expand);
 363     setOperationAction(ISD::AND,  VT, Expand);
 364     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 365     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 366     setOperationAction(ISD::MUL,  VT, Expand);
 367     setOperationAction(ISD::MULHU, VT, Expand);
 368     setOperationAction(ISD::MULHS, VT, Expand);
 369     setOperationAction(ISD::OR,   VT, Expand);
 370     setOperationAction(ISD::SHL,  VT, Expand);
 371     setOperationAction(ISD::SRA,  VT, Expand);
 372     setOperationAction(ISD::SRL,  VT, Expand);
 373     setOperationAction(ISD::ROTL, VT, Expand);
 374     setOperationAction(ISD::ROTR, VT, Expand);
 375     setOperationAction(ISD::SUB,  VT, Expand);
 376     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 377     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 378     setOperationAction(ISD::SDIV, VT, Expand);
 379     setOperationAction(ISD::UDIV, VT, Expand);
 380     setOperationAction(ISD::SREM, VT, Expand);
 381     setOperationAction(ISD::UREM, VT, Expand);
 382     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 383     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 384     setOperationAction(ISD::SDIVREM, VT, Custom);
 385     setOperationAction(ISD::UDIVREM, VT, Expand);
 386     setOperationAction(ISD::SELECT, VT, Expand);
 387     setOperationAction(ISD::VSELECT, VT, Expand);
 388     setOperationAction(ISD::SELECT_CC, VT, Expand);
 389     setOperationAction(ISD::XOR,  VT, Expand);
 390     setOperationAction(ISD::BSWAP, VT, Expand);
 391     setOperationAction(ISD::CTPOP, VT, Expand);
 392     setOperationAction(ISD::CTTZ, VT, Expand);
 393     setOperationAction(ISD::CTLZ, VT, Expand);
 394     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 395     setOperationAction(ISD::SETCC, VT, Expand);
 396   }
 397
 398   static const MVT::SimpleValueType FloatVectorTypes[] = {
 399      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
 400   };
 401
 402   for (MVT VT : FloatVectorTypes) {
 403     setOperationAction(ISD::FABS, VT, Expand);
 404     setOperationAction(ISD::FMINNUM, VT, Expand);
 405     setOperationAction(ISD::FMAXNUM, VT, Expand);
 406     setOperationAction(ISD::FADD, VT, Expand);
 407     setOperationAction(ISD::FCEIL, VT, Expand);
 408     setOperationAction(ISD::FCOS, VT, Expand);
 409     setOperationAction(ISD::FDIV, VT, Expand);
 410     setOperationAction(ISD::FEXP2, VT, Expand);
 411     setOperationAction(ISD::FEXP, VT, Expand);
 412     setOperationAction(ISD::FLOG2, VT, Expand);
 413     setOperationAction(ISD::FREM, VT, Expand);
 414     setOperationAction(ISD::FLOG, VT, Expand);
 415     setOperationAction(ISD::FLOG10, VT, Expand);
 416     setOperationAction(ISD::FPOW, VT, Expand);
 417     setOperationAction(ISD::FFLOOR, VT, Expand);
 418     setOperationAction(ISD::FTRUNC, VT, Expand);
 419     setOperationAction(ISD::FMUL, VT, Expand);
 420     setOperationAction(ISD::FMA, VT, Expand);
 421     setOperationAction(ISD::FRINT, VT, Expand);
 422     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 423     setOperationAction(ISD::FSQRT, VT, Expand);
 424     setOperationAction(ISD::FSIN, VT, Expand);
 425     setOperationAction(ISD::FSUB, VT, Expand);
 426     setOperationAction(ISD::FNEG, VT, Expand);
 427     setOperationAction(ISD::VSELECT, VT, Expand);
 428     setOperationAction(ISD::SELECT_CC, VT, Expand);
 429     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 430     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 431     setOperationAction(ISD::SETCC, VT, Expand);
 432     setOperationAction(ISD::FCANONICALIZE, VT, Expand);
 433   }
 434
 435   // This causes using an unrolled select operation rather than expansion with
 436   // bit operations. This is in general better, but the alternative using BFI
 437   // instructions may be better if the select sources are SGPRs.
 438   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
 439   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
 440
 441   setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
 442   AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
 443
 444   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
 445   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
 446
 447   setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
 448   AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
 449
 450   // There are no libcalls of any kind.
 451   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
 452     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
 453
 454   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 455   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 456
 457   setSchedulingPreference(Sched::RegPressure);
 458   setJumpIsExpensive(true);
 459
 460   // FIXME: This is only partially true. If we have to do vector compares, any
 461   // SGPR pair can be a condition register. If we have a uniform condition, we
 462   // are better off doing SALU operations, where there is only one SCC. For now,
 463   // we don't have a way of knowing during instruction selection if a condition
 464   // will be uniform and we always use vector compares. Assume we are using
 465   // vector compares until that is fixed.
 466   setHasMultipleConditionRegisters(true);
 467
 468   setMinCmpXchgSizeInBits(32);
 469   setSupportsUnalignedAtomics(false);
 470
 471   PredictableSelectIsExpensive = false;
 472
 473   // We want to find all load dependencies for long chains of stores to enable
 474   // merging into very wide vectors. The problem is with vectors with > 4
 475   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
 476   // vectors are a legal type, even though we have to split the loads
 477   // usually. When we can more precisely specify load legality per address
 478   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
 479   // smarter so that they can figure out what to do in 2 iterations without all
 480   // N > 4 stores on the same chain.
 481   GatherAllAliasesMaxDepth = 16;
 482
 483   // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
 484   // about these during lowering.
 485   MaxStoresPerMemcpy  = 0xffffffff;
 486   MaxStoresPerMemmove = 0xffffffff;
 487   MaxStoresPerMemset  = 0xffffffff;
 488
 489   setTargetDAGCombine(ISD::BITCAST);
 490   setTargetDAGCombine(ISD::SHL);
 491   setTargetDAGCombine(ISD::SRA);
 492   setTargetDAGCombine(ISD::SRL);
 493   setTargetDAGCombine(ISD::TRUNCATE);
 494   setTargetDAGCombine(ISD::MUL);
 495   setTargetDAGCombine(ISD::MULHU);
 496   setTargetDAGCombine(ISD::MULHS);
 497   setTargetDAGCombine(ISD::SELECT);
 498   setTargetDAGCombine(ISD::SELECT_CC);
 499   setTargetDAGCombine(ISD::STORE);
 500   setTargetDAGCombine(ISD::FADD);
 501   setTargetDAGCombine(ISD::FSUB);
 502   setTargetDAGCombine(ISD::FNEG);
 503   setTargetDAGCombine(ISD::FABS);
 504   setTargetDAGCombine(ISD::AssertZext);
 505   setTargetDAGCombine(ISD::AssertSext);
 506   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
 507 }
 508
 509 //===----------------------------------------------------------------------===//
 510 // Target Information
 511 //===----------------------------------------------------------------------===//
 512
 513 LLVM_READNONE
 514 static bool fnegFoldsIntoOp(unsigned Opc) {
 515   switch (Opc) {
 516   case ISD::FADD:
 517   case ISD::FSUB:
 518   case ISD::FMUL:
 519   case ISD::FMA:
 520   case ISD::FMAD:
 521   case ISD::FMINNUM:
 522   case ISD::FMAXNUM:
 523   case ISD::FMINNUM_IEEE:
 524   case ISD::FMAXNUM_IEEE:
 525   case ISD::FSIN:
 526   case ISD::FTRUNC:
 527   case ISD::FRINT:
 528   case ISD::FNEARBYINT:
 529   case ISD::FCANONICALIZE:
 530   case AMDGPUISD::RCP:
 531   case AMDGPUISD::RCP_LEGACY:
 532   case AMDGPUISD::RCP_IFLAG:
 533   case AMDGPUISD::SIN_HW:
 534   case AMDGPUISD::FMUL_LEGACY:
 535   case AMDGPUISD::FMIN_LEGACY:
 536   case AMDGPUISD::FMAX_LEGACY:
 537   case AMDGPUISD::FMED3:
 538     return true;
 539   default:
 540     return false;
 541   }
 542 }
 543
 544 /// \p returns true if the operation will definitely need to use a 64-bit
 545 /// encoding, and thus will use a VOP3 encoding regardless of the source
 546 /// modifiers.
 547 LLVM_READONLY
 548 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
 549   return N->getNumOperands() > 2 || VT == MVT::f64;
 550 }
 551
 552 // Most FP instructions support source modifiers, but this could be refined
 553 // slightly.
 554 LLVM_READONLY
 555 static bool hasSourceMods(const SDNode *N) {
 556   if (isa<MemSDNode>(N))
 557     return false;
 558
 559   switch (N->getOpcode()) {
 560   case ISD::CopyToReg:
 561   case ISD::SELECT:
 562   case ISD::FDIV:
 563   case ISD::FREM:
 564   case ISD::INLINEASM:
 565   case ISD::INLINEASM_BR:
 566   case AMDGPUISD::DIV_SCALE:
 567   case ISD::INTRINSIC_W_CHAIN:
 568
 569   // TODO: Should really be looking at the users of the bitcast. These are
 570   // problematic because bitcasts are used to legalize all stores to integer
 571   // types.
 572   case ISD::BITCAST:
 573     return false;
 574   case ISD::INTRINSIC_WO_CHAIN: {
 575     switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
 576     case Intrinsic::amdgcn_interp_p1:
 577     case Intrinsic::amdgcn_interp_p2:
 578     case Intrinsic::amdgcn_interp_mov:
 579     case Intrinsic::amdgcn_interp_p1_f16:
 580     case Intrinsic::amdgcn_interp_p2_f16:
 581       return false;
 582     default:
 583       return true;
 584     }
 585   }
 586   default:
 587     return true;
 588   }
 589 }
 590
 591 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
 592                                                  unsigned CostThreshold) {
 593   // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
 594   // it is truly free to use a source modifier in all cases. If there are
 595   // multiple users but for each one will necessitate using VOP3, there will be
 596   // a code size increase. Try to avoid increasing code size unless we know it
 597   // will save on the instruction count.
 598   unsigned NumMayIncreaseSize = 0;
 599   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
 600
 601   // XXX - Should this limit number of uses to check?
 602   for (const SDNode *U : N->uses()) {
 603     if (!hasSourceMods(U))
 604       return false;
 605
 606     if (!opMustUseVOP3Encoding(U, VT)) {
 607       if (++NumMayIncreaseSize > CostThreshold)
 608         return false;
 609     }
 610   }
 611
 612   return true;
 613 }
 614
 615 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
 616   return MVT::i32;
 617 }
 618
 619 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
 620   return true;
 621 }
 622
 623 // The backend supports 32 and 64 bit floating point immediates.
 624 // FIXME: Why are we reporting vectors of FP immediates as legal?
 625 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
 626                                         bool ForCodeSize) const {
 627   EVT ScalarVT = VT.getScalarType();
 628   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
 629          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
 630 }
 631
 632 // We don't want to shrink f64 / f32 constants.
 633 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
 634   EVT ScalarVT = VT.getScalarType();
 635   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
 636 }
 637
 638 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
 639                                                  ISD::LoadExtType ExtTy,
 640                                                  EVT NewVT) const {
 641   // TODO: This may be worth removing. Check regression tests for diffs.
 642   if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
 643     return false;
 644
 645   unsigned NewSize = NewVT.getStoreSizeInBits();
 646
 647   // If we are reducing to a 32-bit load, this is always better.
 648   if (NewSize == 32)
 649     return true;
 650
 651   EVT OldVT = N->getValueType(0);
 652   unsigned OldSize = OldVT.getStoreSizeInBits();
 653
 654   MemSDNode *MN = cast<MemSDNode>(N);
 655   unsigned AS = MN->getAddressSpace();
 656   // Do not shrink an aligned scalar load to sub-dword.
 657   // Scalar engine cannot do sub-dword loads.
 658   if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
 659       (AS == AMDGPUAS::CONSTANT_ADDRESS ||
 660        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
 661        (isa<LoadSDNode>(N) &&
 662         AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
 663       AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
 664     return false;
 665
 666   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
 667   // extloads, so doing one requires using a buffer_load. In cases where we
 668   // still couldn't use a scalar load, using the wider load shouldn't really
 669   // hurt anything.
 670
 671   // If the old size already had to be an extload, there's no harm in continuing
 672   // to reduce the width.
 673   return (OldSize < 32);
 674 }
 675
 676 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
 677                                                    const SelectionDAG &DAG,
 678                                                    const MachineMemOperand &MMO) const {
 679
 680   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
 681
 682   if (LoadTy.getScalarType() == MVT::i32)
 683     return false;
 684
 685   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
 686   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
 687
 688   if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
 689     return false;
 690
 691   bool Fast = false;
 692   return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
 693                                         CastTy, MMO, &Fast) &&
 694          Fast;
 695 }
 696
 697 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
 698 // profitable with the expansion for 64-bit since it's generally good to
 699 // speculate things.
 700 // FIXME: These should really have the size as a parameter.
 701 bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
 702   return true;
 703 }
 704
 705 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
 706   return true;
 707 }
 708
 709 bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
 710   switch (N->getOpcode()) {
 711     default:
 712     return false;
 713     case ISD::EntryToken:
 714     case ISD::TokenFactor:
 715       return true;
 716     case ISD::INTRINSIC_WO_CHAIN:
 717     {
 718       unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
 719       switch (IntrID) {
 720         default:
 721         return false;
 722         case Intrinsic::amdgcn_readfirstlane:
 723         case Intrinsic::amdgcn_readlane:
 724           return true;
 725       }
 726     }
 727     break;
 728     case ISD::LOAD:
 729     {
 730       if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
 731           AMDGPUAS::CONSTANT_ADDRESS_32BIT)
 732         return true;
 733       return false;
 734     }
 735     break;
 736   }
 737 }
 738
 739 //===---------------------------------------------------------------------===//
 740 // Target Properties
 741 //===---------------------------------------------------------------------===//
 742
 743 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
 744   assert(VT.isFloatingPoint());
 745
 746   // Packed operations do not have a fabs modifier.
 747   return VT == MVT::f32 || VT == MVT::f64 ||
 748          (Subtarget->has16BitInsts() && VT == MVT::f16);
 749 }
 750
 751 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
 752   assert(VT.isFloatingPoint());
 753   return VT == MVT::f32 || VT == MVT::f64 ||
 754          (Subtarget->has16BitInsts() && VT == MVT::f16) ||
 755          (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
 756 }
 757
 758 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
 759                                                          unsigned NumElem,
 760                                                          unsigned AS) const {
 761   return true;
 762 }
 763
/// Always answers true; \p VecVT is not inspected.
bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}
 775
 776 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
 777   // Truncate is just accessing a subregister.
 778
 779   unsigned SrcSize = Source.getSizeInBits();
 780   unsigned DestSize = Dest.getSizeInBits();
 781
 782   return DestSize < SrcSize && DestSize % 32 == 0 ;
 783 }
 784
 785 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
 786   // Truncate is just accessing a subregister.
 787
 788   unsigned SrcSize = Source->getScalarSizeInBits();
 789   unsigned DestSize = Dest->getScalarSizeInBits();
 790
 791   if (DestSize== 16 && Subtarget->has16BitInsts())
 792     return SrcSize >= 32;
 793
 794   return DestSize < SrcSize && DestSize % 32 == 0;
 795 }
 796
 797 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
 798   unsigned SrcSize = Src->getScalarSizeInBits();
 799   unsigned DestSize = Dest->getScalarSizeInBits();
 800
 801   if (SrcSize == 16 && Subtarget->has16BitInsts())
 802     return DestSize >= 32;
 803
 804   return SrcSize == 32 && DestSize == 64;
 805 }
 806
 807 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
 808   // Any register load of a 64-bit value really requires 2 32-bit moves. For all
 809   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
 810   // this will enable reducing 64-bit operations the 32-bit, which is always
 811   // good.
 812
 813   if (Src == MVT::i16)
 814     return Dest == MVT::i32 ||Dest == MVT::i64 ;
 815
 816   return Src == MVT::i32 && Dest == MVT::i64;
 817 }
 818
 819 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
 820   return isZExtFree(Val.getValueType(), VT2);
 821 }
 822
 823 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
 824   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
 825   // limited number of native 64-bit operations. Shrinking an operation to fit
 826   // in a single 32-bit register should always be helpful. As currently used,
 827   // this is much less general than the name suggests, and is only used in
 828   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
 829   // not profitable, and may actually be harmful.
 830   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
 831 }
 832
 833 //===---------------------------------------------------------------------===//
 834 // TargetLowering Callbacks
 835 //===---------------------------------------------------------------------===//
 836
 837 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
 838                                                   bool IsVarArg) {
 839   switch (CC) {
 840   case CallingConv::AMDGPU_VS:
 841   case CallingConv::AMDGPU_GS:
 842   case CallingConv::AMDGPU_PS:
 843   case CallingConv::AMDGPU_CS:
 844   case CallingConv::AMDGPU_HS:
 845   case CallingConv::AMDGPU_ES:
 846   case CallingConv::AMDGPU_LS:
 847     return CC_AMDGPU;
 848   case CallingConv::C:
 849   case CallingConv::Fast:
 850   case CallingConv::Cold:
 851     return CC_AMDGPU_Func;
 852   case CallingConv::AMDGPU_KERNEL:
 853   case CallingConv::SPIR_KERNEL:
 854   default:
 855     report_fatal_error("Unsupported calling convention for call");
 856   }
 857 }
 858
 859 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
 860                                                     bool IsVarArg) {
 861   switch (CC) {
 862   case CallingConv::AMDGPU_KERNEL:
 863   case CallingConv::SPIR_KERNEL:
 864     llvm_unreachable("kernels should not be handled here");
 865   case CallingConv::AMDGPU_VS:
 866   case CallingConv::AMDGPU_GS:
 867   case CallingConv::AMDGPU_PS:
 868   case CallingConv::AMDGPU_CS:
 869   case CallingConv::AMDGPU_HS:
 870   case CallingConv::AMDGPU_ES:
 871   case CallingConv::AMDGPU_LS:
 872     return RetCC_SI_Shader;
 873   case CallingConv::C:
 874   case CallingConv::Fast:
 875   case CallingConv::Cold:
 876     return RetCC_AMDGPU_Func;
 877   default:
 878     report_fatal_error("Unsupported calling convention.");
 879   }
 880 }
 881
 882 /// The SelectionDAGBuilder will automatically promote function arguments
 883 /// with illegal types.  However, this does not work for the AMDGPU targets
 884 /// since the function arguments are stored in memory as these illegal types.
 885 /// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fix up the ISD::InputArg values before
 887 /// passing them to AnalyzeFormalArguments()
 888
 889 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
 890 /// input values across multiple registers.  Each item in the Ins array
 891 /// represents a single value that will be stored in registers.  Ins[x].VT is
 892 /// the value type of the value that will be stored in the register, so
 893 /// whatever SDNode we lower the argument to needs to be this type.
 894 ///
 895 /// In order to correctly lower the arguments we need to know the size of each
 896 /// argument.  Since Ins[x].VT gives us the size of the register that will
 897 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
 899 /// type to use for Ins[x].  In most cases the correct memory type will be
 900 /// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
 901 /// we have a kernel argument of type v8i8, this argument will be split into
 902 /// 8 parts and each part will be represented by its own item in the Ins array.
 903 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
 904 /// the argument before it was split.  From this, we deduce that the memory type
 905 /// for each individual part is i8.  We pass the memory type as LocVT to the
 906 /// calling convention analysis function and the register type (Ins[x].VT) as
 907 /// the ValVT.
 908 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
 909   CCState &State,
 910   const SmallVectorImpl<ISD::InputArg> &Ins) const {
 911   const MachineFunction &MF = State.getMachineFunction();
 912   const Function &Fn = MF.getFunction();
 913   LLVMContext &Ctx = Fn.getParent()->getContext();
 914   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
 915   const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
 916   CallingConv::ID CC = Fn.getCallingConv();
 917
 918   unsigned MaxAlign = 1;
 919   uint64_t ExplicitArgOffset = 0;
 920   const DataLayout &DL = Fn.getParent()->getDataLayout();
 921
 922   unsigned InIndex = 0;
 923
 924   for (const Argument &Arg : Fn.args()) {
 925     Type *BaseArgTy = Arg.getType();
 926     unsigned Align = DL.getABITypeAlignment(BaseArgTy);
 927     MaxAlign = std::max(Align, MaxAlign);
 928     unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
 929
 930     uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
 931     ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
 932
 933     // We're basically throwing away everything passed into us and starting over
 934     // to get accurate in-memory offsets. The "PartOffset" is completely useless
 935     // to us as computed in Ins.
 936     //
 937     // We also need to figure out what type legalization is trying to do to get
 938     // the correct memory offsets.
 939
 940     SmallVector<EVT, 16> ValueVTs;
 941     SmallVector<uint64_t, 16> Offsets;
 942     ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
 943
 944     for (unsigned Value = 0, NumValues = ValueVTs.size();
 945          Value != NumValues; ++Value) {
 946       uint64_t BasePartOffset = Offsets[Value];
 947
 948       EVT ArgVT = ValueVTs[Value];
 949       EVT MemVT = ArgVT;
 950       MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
 951       unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
 952
 953       if (NumRegs == 1) {
 954         // This argument is not split, so the IR type is the memory type.
 955         if (ArgVT.isExtended()) {
 956           // We have an extended type, like i24, so we should just use the
 957           // register type.
 958           MemVT = RegisterVT;
 959         } else {
 960           MemVT = ArgVT;
 961         }
 962       } else if (ArgVT.isVector() && RegisterVT.isVector() &&
 963                  ArgVT.getScalarType() == RegisterVT.getScalarType()) {
 964         assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
 965         // We have a vector value which has been split into a vector with
 966         // the same scalar type, but fewer elements.  This should handle
 967         // all the floating-point vector types.
 968         MemVT = RegisterVT;
 969       } else if (ArgVT.isVector() &&
 970                  ArgVT.getVectorNumElements() == NumRegs) {
 971         // This arg has been split so that each element is stored in a separate
 972         // register.
 973         MemVT = ArgVT.getScalarType();
 974       } else if (ArgVT.isExtended()) {
 975         // We have an extended type, like i65.
 976         MemVT = RegisterVT;
 977       } else {
 978         unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
 979         assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
 980         if (RegisterVT.isInteger()) {
 981           MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
 982         } else if (RegisterVT.isVector()) {
 983           assert(!RegisterVT.getScalarType().isFloatingPoint());
 984           unsigned NumElements = RegisterVT.getVectorNumElements();
 985           assert(MemoryBits % NumElements == 0);
 986           // This vector type has been split into another vector type with
 987           // a different elements size.
 988           EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
 989                                            MemoryBits / NumElements);
 990           MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
 991         } else {
 992           llvm_unreachable("cannot deduce memory type.");
 993         }
 994       }
 995
 996       // Convert one element vectors to scalar.
 997       if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
 998         MemVT = MemVT.getScalarType();
 999
1000       // Round up vec3/vec5 argument.
1001       if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1002         assert(MemVT.getVectorNumElements() == 3 ||
1003                MemVT.getVectorNumElements() == 5);
1004         MemVT = MemVT.getPow2VectorType(State.getContext());
1005       }
1006
1007       unsigned PartOffset = 0;
1008       for (unsigned i = 0; i != NumRegs; ++i) {
1009         State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1010                                                BasePartOffset + PartOffset,
1011                                                MemVT.getSimpleVT(),
1012                                                CCValAssign::Full));
1013         PartOffset += MemVT.getStoreSize();
1014       }
1015     }
1016   }
1017 }
1018
1019 SDValue AMDGPUTargetLowering::LowerReturn(
1020   SDValue Chain, CallingConv::ID CallConv,
1021   bool isVarArg,
1022   const SmallVectorImpl<ISD::OutputArg> &Outs,
1023   const SmallVectorImpl<SDValue> &OutVals,
1024   const SDLoc &DL, SelectionDAG &DAG) const {
1025   // FIXME: Fails for r600 tests
1026   //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1027   // "wave terminate should not have return values");
1028   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1029 }
1030
1031 //===---------------------------------------------------------------------===//
1032 // Target specific lowering
1033 //===---------------------------------------------------------------------===//
1034
/// Selects the correct CCAssignFn for a given CallingConvention value.
///
/// Thin forwarder: the selection itself is done by
/// AMDGPUCallLowering::CCAssignFnForCall, to which \p CC and \p IsVarArg are
/// passed unchanged.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}
1040
/// Selects the CCAssignFn used for return values of calling convention
/// \p CC.
///
/// Thin forwarder: the selection itself is done by
/// AMDGPUCallLowering::CCAssignFnForReturn, to which \p CC and \p IsVarArg are
/// passed unchanged.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}
1045
1046 SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1047                                                   SelectionDAG &DAG,
1048                                                   MachineFrameInfo &MFI,
1049                                                   int ClobberedFI) const {
1050   SmallVector<SDValue, 8> ArgChains;
1051   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1052   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1053
1054   // Include the original chain at the beginning of the list. When this is
1055   // used by target LowerCall hooks, this helps legalize find the
1056   // CALLSEQ_BEGIN node.
1057   ArgChains.push_back(Chain);
1058
1059   // Add a chain value for each stack argument corresponding
1060   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1061                             UE = DAG.getEntryNode().getNode()->use_end();
1062        U != UE; ++U) {
1063     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
1064       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1065         if (FI->getIndex() < 0) {
1066           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1067           int64_t InLastByte = InFirstByte;
1068           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1069
1070           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1071               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1072             ArgChains.push_back(SDValue(L, 1));
1073         }
1074       }
1075     }
1076   }
1077
1078   // Build a tokenfactor for all the chains.
1079   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1080 }
1081
1082 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1083                                                  SmallVectorImpl<SDValue> &InVals,
1084                                                  StringRef Reason) const {
1085   SDValue Callee = CLI.Callee;
1086   SelectionDAG &DAG = CLI.DAG;
1087
1088   const Function &Fn = DAG.getMachineFunction().getFunction();
1089
1090   StringRef FuncName("<unknown>");
1091
1092   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1093     FuncName = G->getSymbol();
1094   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1095     FuncName = G->getGlobal()->getName();
1096
1097   DiagnosticInfoUnsupported NoCalls(
1098     Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1099   DAG.getContext()->diagnose(NoCalls);
1100
1101   if (!CLI.IsTailCall) {
1102     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1103       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1104   }
1105
1106   return DAG.getEntryNode();
1107 }
1108
/// Every call reaching this implementation is treated as unsupported: it is
/// forwarded to lowerUnhandledCall with a fixed reason prefix.
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}
1113
1114 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1115                                                       SelectionDAG &DAG) const {
1116   const Function &Fn = DAG.getMachineFunction().getFunction();
1117
1118   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1119                                             SDLoc(Op).getDebugLoc());
1120   DAG.getContext()->diagnose(NoDynamicAlloca);
1121   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1122   return DAG.getMergeValues(Ops, SDLoc());
1123 }
1124
1125 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1126                                              SelectionDAG &DAG) const {
1127   switch (Op.getOpcode()) {
1128   default:
1129     Op->print(errs(), &DAG);
1130     llvm_unreachable("Custom lowering code for this"
1131                      "instruction is not implemented yet!");
1132     break;
1133   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1134   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1135   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1136   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1137   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1138   case ISD::FREM: return LowerFREM(Op, DAG);
1139   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1140   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1141   case ISD::FRINT: return LowerFRINT(Op, DAG);
1142   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1143   case ISD::FROUND: return LowerFROUND(Op, DAG);
1144   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1145   case ISD::FLOG:
1146     return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
1147   case ISD::FLOG10:
1148     return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1149   case ISD::FEXP:
1150     return lowerFEXP(Op, DAG);
1151   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1152   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1153   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1154   case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
1155   case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
1156   case ISD::CTTZ:
1157   case ISD::CTTZ_ZERO_UNDEF:
1158   case ISD::CTLZ:
1159   case ISD::CTLZ_ZERO_UNDEF:
1160     return LowerCTLZ_CTTZ(Op, DAG);
1161   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1162   }
1163   return Op;
1164 }
1165
1166 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1167                                               SmallVectorImpl<SDValue> &Results,
1168                                               SelectionDAG &DAG) const {
1169   switch (N->getOpcode()) {
1170   case ISD::SIGN_EXTEND_INREG:
1171     // Different parts of legalization seem to interpret which type of
1172     // sign_extend_inreg is the one to check for custom lowering. The extended
1173     // from type is what really matters, but some places check for custom
1174     // lowering of the result type. This results in trying to use
1175     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1176     // nothing here and let the illegal result integer be handled normally.
1177     return;
1178   default:
1179     return;
1180   }
1181 }
1182
1183 bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
1184   const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1185   if (!GVar || !GVar->hasInitializer())
1186     return false;
1187
1188   return !isa<UndefValue>(GVar->getInitializer());
1189 }
1190
1191 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1192                                                  SDValue Op,
1193                                                  SelectionDAG &DAG) const {
1194
1195   const DataLayout &DL = DAG.getDataLayout();
1196   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1197   const GlobalValue *GV = G->getGlobal();
1198
1199   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1200       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1201     if (!MFI->isEntryFunction()) {
1202       const Function &Fn = DAG.getMachineFunction().getFunction();
1203       DiagnosticInfoUnsupported BadLDSDecl(
1204         Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
1205       DAG.getContext()->diagnose(BadLDSDecl);
1206     }
1207
1208     // XXX: What does the value of G->getOffset() mean?
1209     assert(G->getOffset() == 0 &&
1210          "Do not know what to do with an non-zero offset");
1211
1212     // TODO: We could emit code to handle the initialization somewhere.
1213     if (!hasDefinedInitializer(GV)) {
1214       unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
1215       return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1216     }
1217   }
1218
1219   const Function &Fn = DAG.getMachineFunction().getFunction();
1220   DiagnosticInfoUnsupported BadInit(
1221       Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1222   DAG.getContext()->diagnose(BadInit);
1223   return SDValue();
1224 }
1225
1226 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1227                                                   SelectionDAG &DAG) const {
1228   SmallVector<SDValue, 8> Args;
1229
1230   EVT VT = Op.getValueType();
1231   if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1232     SDLoc SL(Op);
1233     SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1234     SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1235
1236     SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1237     return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1238   }
1239
1240   for (const SDUse &U : Op->ops())
1241     DAG.ExtractVectorElements(U.get(), Args);
1242
1243   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1244 }
1245
1246 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1247                                                      SelectionDAG &DAG) const {
1248
1249   SmallVector<SDValue, 8> Args;
1250   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1251   EVT VT = Op.getValueType();
1252   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1253                             VT.getVectorNumElements());
1254
1255   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1256 }
1257
/// Generate Min/Max node
///
/// Tries to turn a select on a floating-point compare, select(LHS CC RHS,
/// True, False), into an FMIN_LEGACY / FMAX_LEGACY node.
///
/// \param LHS, RHS     operands of the compare.
/// \param True, False  values selected when the compare holds / fails.
/// \param CC           condition code node of the compare.
/// \returns the new node, or an empty SDValue when no combine applies.
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  // The selected values must be exactly the compared values, in either order.
  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  // Equality, constant and ordered/unordered-only predicates produce no node.
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  // Unordered less-than: no combine-level restriction is applied, and the
  // operand order is the reverse of the ordered less-than cases below.
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  // Unordered greater-than: mirror image of the unordered less-than cases.
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  // Ordered (or unspecified) greater-than: same combine-level gate as the
  // ordered less-than cases.
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  // Reached only through the `break` above: nothing to combine.
  return SDValue();
}
1331
1332 std::pair<SDValue, SDValue>
1333 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1334   SDLoc SL(Op);
1335
1336   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1337
1338   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1339   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1340
1341   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1342   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1343
1344   return std::make_pair(Lo, Hi);
1345 }
1346
1347 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1348   SDLoc SL(Op);
1349
1350   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1351   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1352   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1353 }
1354
1355 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1356   SDLoc SL(Op);
1357
1358   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1359   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1360   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1361 }
1362
1363 // Split a vector type into two parts. The first part is a power of two vector.
1364 // The second part is whatever is left over, and is a scalar if it would
1365 // otherwise be a 1-vector.
1366 std::pair<EVT, EVT>
1367 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1368   EVT LoVT, HiVT;
1369   EVT EltVT = VT.getVectorElementType();
1370   unsigned NumElts = VT.getVectorNumElements();
1371   unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1372   LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1373   HiVT = NumElts - LoNumElts == 1
1374              ? EltVT
1375              : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1376   return std::make_pair(LoVT, HiVT);
1377 }
1378
1379 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1380 // scalar.
1381 std::pair<SDValue, SDValue>
1382 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1383                                   const EVT &LoVT, const EVT &HiVT,
1384                                   SelectionDAG &DAG) const {
1385   assert(LoVT.getVectorNumElements() +
1386                  (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1387              N.getValueType().getVectorNumElements() &&
1388          "More vector elements requested than available!");
1389   auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
1390   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1391                            DAG.getConstant(0, DL, IdxTy));
1392   SDValue Hi = DAG.getNode(
1393       HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1394       HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy));
1395   return std::make_pair(Lo, Hi);
1396 }
1397
/// Split a vector load into a low and a high load and recombine the results.
///
/// Two-element vectors are handed to scalarizeVectorLoad instead. Otherwise
/// the result and memory types are split with getSplitDestVTs, the low part is
/// loaded from the base pointer and the high part from the base pointer plus
/// the store size of the low memory type.
///
/// \returns merged values: the recombined vector and a TokenFactor of the two
/// load chains.
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();


  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorLoad(Load, DAG);

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  // Split both the in-register type and the in-memory type.
  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  // NOTE(review): Lo and Hi are not referenced again in this function; the
  // call still creates the extract nodes and runs splitVector's assertion.
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  // The high half starts Size bytes after the base, so its alignment is
  // limited by both the base alignment and that offset.
  unsigned Size = LoMemVT.getStoreSize();
  unsigned BaseAlign = Load->getAlignment();
  unsigned HiAlign = MinAlign(BaseAlign, Size);

  // Both loads reuse the original extension type, chain and memory flags.
  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    // Uneven split: insert the low part into an undef vector at index 0, then
    // insert the high part (a subvector or a single element) right after it.
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
                       DAG.getConstant(0, SL, IdxTy));
    Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR
                                       : ISD::INSERT_VECTOR_ELT,
                       SL, VT, Join, HiLoad,
                       DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy));
  }

  // Result 0 is the joined vector; result 1 merges the chains of both loads.
  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}
1455
1456 // Widen a vector load from vec3 to vec4.
1457 SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
1458                                               SelectionDAG &DAG) const {
1459   LoadSDNode *Load = cast<LoadSDNode>(Op);
1460   EVT VT = Op.getValueType();
1461   assert(VT.getVectorNumElements() == 3);
1462   SDValue BasePtr = Load->getBasePtr();
1463   EVT MemVT = Load->getMemoryVT();
1464   SDLoc SL(Op);
1465   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1466   unsigned BaseAlign = Load->getAlignment();
1467
1468   EVT WideVT =
1469       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1470   EVT WideMemVT =
1471       EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1472   SDValue WideLoad = DAG.getExtLoad(
1473       Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1474       WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1475   return DAG.getMergeValues(
1476       {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1477                    DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))),
1478        WideLoad.getValue(1)},
1479       SL);
1480 }
1481
1482 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1483                                                SelectionDAG &DAG) const {
1484   StoreSDNode *Store = cast<StoreSDNode>(Op);
1485   SDValue Val = Store->getValue();
1486   EVT VT = Val.getValueType();
1487
1488   // If this is a 2 element vector, we really want to scalarize and not create
1489   // weird 1 element vectors.
1490   if (VT.getVectorNumElements() == 2)
1491     return scalarizeVectorStore(Store, DAG);
1492
1493   EVT MemVT = Store->getMemoryVT();
1494   SDValue Chain = Store->getChain();
1495   SDValue BasePtr = Store->getBasePtr();
1496   SDLoc SL(Op);
1497
1498   EVT LoVT, HiVT;
1499   EVT LoMemVT, HiMemVT;
1500   SDValue Lo, Hi;
1501
1502   std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1503   std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1504   std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1505
1506   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1507
1508   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1509   unsigned BaseAlign = Store->getAlignment();
1510   unsigned Size = LoMemVT.getStoreSize();
1511   unsigned HiAlign = MinAlign(BaseAlign, Size);
1512
1513   SDValue LoStore =
1514       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1515                         Store->getMemOperand()->getFlags());
1516   SDValue HiStore =
1517       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1518                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1519
1520   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1521 }
1522
1523 // This is a shortcut for integer division because we have fast i32<->f32
1524 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1525 // float is enough to accurately represent up to a 24-bit signed integer.
1526 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1527                                             bool Sign) const {
1528   SDLoc DL(Op);
1529   EVT VT = Op.getValueType();
1530   SDValue LHS = Op.getOperand(0);
1531   SDValue RHS = Op.getOperand(1);
1532   MVT IntVT = MVT::i32;
1533   MVT FltVT = MVT::f32;
1534
1535   unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1536   if (LHSSignBits < 9)
1537     return SDValue();
1538
1539   unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1540   if (RHSSignBits < 9)
1541     return SDValue();
1542
1543   unsigned BitSize = VT.getSizeInBits();
1544   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1545   unsigned DivBits = BitSize - SignBits;
1546   if (Sign)
1547     ++DivBits;
1548
1549   ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1550   ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1551
1552   SDValue jq = DAG.getConstant(1, DL, IntVT);
1553
1554   if (Sign) {
1555     // char|short jq = ia ^ ib;
1556     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1557
1558     // jq = jq >> (bitsize - 2)
1559     jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1560                      DAG.getConstant(BitSize - 2, DL, VT));
1561
1562     // jq = jq | 0x1
1563     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1564   }
1565
1566   // int ia = (int)LHS;
1567   SDValue ia = LHS;
1568
1569   // int ib, (int)RHS;
1570   SDValue ib = RHS;
1571
1572   // float fa = (float)ia;
1573   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1574
1575   // float fb = (float)ib;
1576   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1577
1578   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1579                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1580
1581   // fq = trunc(fq);
1582   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1583
1584   // float fqneg = -fq;
1585   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1586
1587   // float fr = mad(fqneg, fb, fa);
1588   unsigned OpCode = Subtarget->hasFP32Denormals() ?
1589                     (unsigned)AMDGPUISD::FMAD_FTZ :
1590                     (unsigned)ISD::FMAD;
1591   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1592
1593   // int iq = (int)fq;
1594   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1595
1596   // fr = fabs(fr);
1597   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1598
1599   // fb = fabs(fb);
1600   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1601
1602   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1603
1604   // int cv = fr >= fb;
1605   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1606
1607   // jq = (cv ? jq : 0);
1608   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1609
1610   // dst = iq + jq;
1611   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1612
1613   // Rem needs compensation, it's easier to recompute it
1614   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1615   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1616
1617   // Truncate to number of bits this divide really is.
1618   if (Sign) {
1619     SDValue InRegSize
1620       = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1621     Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1622     Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1623   } else {
1624     SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1625     Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1626     Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1627   }
1628
1629   return DAG.getMergeValues({ Div, Rem }, DL);
1630 }
1631
1632 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1633                                       SelectionDAG &DAG,
1634                                       SmallVectorImpl<SDValue> &Results) const {
1635   SDLoc DL(Op);
1636   EVT VT = Op.getValueType();
1637
1638   assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1639
1640   EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1641
1642   SDValue One = DAG.getConstant(1, DL, HalfVT);
1643   SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1644
1645   //HiLo split
1646   SDValue LHS = Op.getOperand(0);
1647   SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1648   SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1649
1650   SDValue RHS = Op.getOperand(1);
1651   SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1652   SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1653
1654   if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1655       DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1656
1657     SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1658                               LHS_Lo, RHS_Lo);
1659
1660     SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1661     SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1662
1663     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1664     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1665     return;
1666   }
1667
1668   if (isTypeLegal(MVT::i64)) {
1669     // Compute denominator reciprocal.
1670     unsigned FMAD = Subtarget->hasFP32Denormals() ?
1671                     (unsigned)AMDGPUISD::FMAD_FTZ :
1672                     (unsigned)ISD::FMAD;
1673
1674     SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1675     SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1676     SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1677       DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1678       Cvt_Lo);
1679     SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1680     SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1681       DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1682     SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1683       DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1684     SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1685     SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1686       DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1687       Mul1);
1688     SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1689     SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1690     SDValue Rcp64 = DAG.getBitcast(VT,
1691                         DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1692
1693     SDValue Zero64 = DAG.getConstant(0, DL, VT);
1694     SDValue One64  = DAG.getConstant(1, DL, VT);
1695     SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1696     SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1697
1698     SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1699     SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1700     SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1701     SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1702                                     Zero);
1703     SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1704                                     One);
1705
1706     SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1707                                   Mulhi1_Lo, Zero1);
1708     SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1709                                   Mulhi1_Hi, Add1_Lo.getValue(1));
1710     SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
1711     SDValue Add1 = DAG.getBitcast(VT,
1712                         DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1713
1714     SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1715     SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1716     SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1717                                     Zero);
1718     SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1719                                     One);
1720
1721     SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1722                                   Mulhi2_Lo, Zero1);
1723     SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
1724                                    Mulhi2_Hi, Add1_Lo.getValue(1));
1725     SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
1726                                   Zero, Add2_Lo.getValue(1));
1727     SDValue Add2 = DAG.getBitcast(VT,
1728                         DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1729     SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1730
1731     SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1732
1733     SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1734     SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1735     SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1736                                   Mul3_Lo, Zero1);
1737     SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1738                                   Mul3_Hi, Sub1_Lo.getValue(1));
1739     SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1740     SDValue Sub1 = DAG.getBitcast(VT,
1741                         DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1742
1743     SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1744     SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1745                                  ISD::SETUGE);
1746     SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1747                                  ISD::SETUGE);
1748     SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1749
1750     // TODO: Here and below portions of the code can be enclosed into if/endif.
1751     // Currently control flow is unconditional and we have 4 selects after
1752     // potential endif to substitute PHIs.
1753
1754     // if C3 != 0 ...
1755     SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1756                                   RHS_Lo, Zero1);
1757     SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1758                                   RHS_Hi, Sub1_Lo.getValue(1));
1759     SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1760                                   Zero, Sub2_Lo.getValue(1));
1761     SDValue Sub2 = DAG.getBitcast(VT,
1762                         DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1763
1764     SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1765
1766     SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1767                                  ISD::SETUGE);
1768     SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1769                                  ISD::SETUGE);
1770     SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1771
1772     // if (C6 != 0)
1773     SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1774
1775     SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1776                                   RHS_Lo, Zero1);
1777     SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1778                                   RHS_Hi, Sub2_Lo.getValue(1));
1779     SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1780                                   Zero, Sub3_Lo.getValue(1));
1781     SDValue Sub3 = DAG.getBitcast(VT,
1782                         DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1783
1784     // endif C6
1785     // endif C3
1786
1787     SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1788     SDValue Div  = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1789
1790     SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1791     SDValue Rem  = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1792
1793     Results.push_back(Div);
1794     Results.push_back(Rem);
1795
1796     return;
1797   }
1798
1799   // r600 expandion.
1800   // Get Speculative values
1801   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1802   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1803
1804   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1805   SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1806   REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1807
1808   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1809   SDValue DIV_Lo = Zero;
1810
1811   const unsigned halfBitWidth = HalfVT.getSizeInBits();
1812
1813   for (unsigned i = 0; i < halfBitWidth; ++i) {
1814     const unsigned bitPos = halfBitWidth - i - 1;
1815     SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1816     // Get value of high bit
1817     SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1818     HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
1819     HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1820
1821     // Shift
1822     REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1823     // Add LHS high bit
1824     REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1825
1826     SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1827     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
1828
1829     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1830
1831     // Update REM
1832     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1833     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1834   }
1835
1836   SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1837   DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1838   Results.push_back(DIV);
1839   Results.push_back(REM);
1840 }
1841
1842 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1843                                            SelectionDAG &DAG) const {
1844   SDLoc DL(Op);
1845   EVT VT = Op.getValueType();
1846
1847   if (VT == MVT::i64) {
1848     SmallVector<SDValue, 2> Results;
1849     LowerUDIVREM64(Op, DAG, Results);
1850     return DAG.getMergeValues(Results, DL);
1851   }
1852
1853   if (VT == MVT::i32) {
1854     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1855       return Res;
1856   }
1857
1858   SDValue Num = Op.getOperand(0);
1859   SDValue Den = Op.getOperand(1);
1860
1861   // RCP =  URECIP(Den) = 2^32 / Den + e
1862   // e is rounding error.
1863   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1864
1865   // RCP_LO = mul(RCP, Den) */
1866   SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1867
1868   // RCP_HI = mulhu (RCP, Den) */
1869   SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1870
1871   // NEG_RCP_LO = -RCP_LO
1872   SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1873                                                      RCP_LO);
1874
1875   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1876   SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1877                                            NEG_RCP_LO, RCP_LO,
1878                                            ISD::SETEQ);
1879   // Calculate the rounding error from the URECIP instruction
1880   // E = mulhu(ABS_RCP_LO, RCP)
1881   SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1882
1883   // RCP_A_E = RCP + E
1884   SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1885
1886   // RCP_S_E = RCP - E
1887   SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1888
1889   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
1890   SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1891                                      RCP_A_E, RCP_S_E,
1892                                      ISD::SETEQ);
1893   // Quotient = mulhu(Tmp0, Num)
1894   SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1895
1896   // Num_S_Remainder = Quotient * Den
1897   SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
1898
1899   // Remainder = Num - Num_S_Remainder
1900   SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1901
1902   // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1903   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1904                                                  DAG.getConstant(-1, DL, VT),
1905                                                  DAG.getConstant(0, DL, VT),
1906                                                  ISD::SETUGE);
1907   // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1908   SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1909                                                   Num_S_Remainder,
1910                                                   DAG.getConstant(-1, DL, VT),
1911                                                   DAG.getConstant(0, DL, VT),
1912                                                   ISD::SETUGE);
1913   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1914   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1915                                                Remainder_GE_Zero);
1916
1917   // Calculate Division result:
1918
1919   // Quotient_A_One = Quotient + 1
1920   SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1921                                        DAG.getConstant(1, DL, VT));
1922
1923   // Quotient_S_One = Quotient - 1
1924   SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1925                                        DAG.getConstant(1, DL, VT));
1926
1927   // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1928   SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1929                                      Quotient, Quotient_A_One, ISD::SETEQ);
1930
1931   // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1932   Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1933                             Quotient_S_One, Div, ISD::SETEQ);
1934
1935   // Calculate Rem result:
1936
1937   // Remainder_S_Den = Remainder - Den
1938   SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1939
1940   // Remainder_A_Den = Remainder + Den
1941   SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1942
1943   // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1944   SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1945                                     Remainder, Remainder_S_Den, ISD::SETEQ);
1946
1947   // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1948   Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1949                             Remainder_A_Den, Rem, ISD::SETEQ);
1950   SDValue Ops[2] = {
1951     Div,
1952     Rem
1953   };
1954   return DAG.getMergeValues(Ops, DL);
1955 }
1956
1957 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1958                                            SelectionDAG &DAG) const {
1959   SDLoc DL(Op);
1960   EVT VT = Op.getValueType();
1961
1962   SDValue LHS = Op.getOperand(0);
1963   SDValue RHS = Op.getOperand(1);
1964
1965   SDValue Zero = DAG.getConstant(0, DL, VT);
1966   SDValue NegOne = DAG.getConstant(-1, DL, VT);
1967
1968   if (VT == MVT::i32) {
1969     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1970       return Res;
1971   }
1972
1973   if (VT == MVT::i64 &&
1974       DAG.ComputeNumSignBits(LHS) > 32 &&
1975       DAG.ComputeNumSignBits(RHS) > 32) {
1976     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1977
1978     //HiLo split
1979     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1980     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1981     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1982                                  LHS_Lo, RHS_Lo);
1983     SDValue Res[2] = {
1984       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1985       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1986     };
1987     return DAG.getMergeValues(Res, DL);
1988   }
1989
1990   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1991   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1992   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1993   SDValue RSign = LHSign; // Remainder sign is the same as LHS
1994
1995   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1996   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1997
1998   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1999   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2000
2001   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2002   SDValue Rem = Div.getValue(1);
2003
2004   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2005   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2006
2007   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2008   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2009
2010   SDValue Res[2] = {
2011     Div,
2012     Rem
2013   };
2014   return DAG.getMergeValues(Res, DL);
2015 }
2016
2017 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
2018 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2019   SDLoc SL(Op);
2020   EVT VT = Op.getValueType();
2021   SDValue X = Op.getOperand(0);
2022   SDValue Y = Op.getOperand(1);
2023
2024   // TODO: Should this propagate fast-math-flags?
2025
2026   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
2027   SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
2028   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
2029
2030   return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
2031 }
2032
2033 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2034   SDLoc SL(Op);
2035   SDValue Src = Op.getOperand(0);
2036
2037   // result = trunc(src)
2038   // if (src > 0.0 && src != result)
2039   //   result += 1.0
2040
2041   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2042
2043   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2044   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2045
2046   EVT SetCCVT =
2047       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2048
2049   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2050   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2051   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2052
2053   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2054   // TODO: Should this propagate fast-math-flags?
2055   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2056 }
2057
2058 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2059                                   SelectionDAG &DAG) {
2060   const unsigned FractBits = 52;
2061   const unsigned ExpBits = 11;
2062
2063   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2064                                 Hi,
2065                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2066                                 DAG.getConstant(ExpBits, SL, MVT::i32));
2067   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2068                             DAG.getConstant(1023, SL, MVT::i32));
2069
2070   return Exp;
2071 }
2072
2073 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2074   SDLoc SL(Op);
2075   SDValue Src = Op.getOperand(0);
2076
2077   assert(Op.getValueType() == MVT::f64);
2078
2079   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2080   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2081
2082   SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2083
2084   // Extract the upper half, since this is where we will find the sign and
2085   // exponent.
2086   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
2087
2088   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2089
2090   const unsigned FractBits = 52;
2091
2092   // Extract the sign bit.
2093   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2094   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2095
2096   // Extend back to 64-bits.
2097   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2098   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2099
2100   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2101   const SDValue FractMask
2102     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2103
2104   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2105   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2106   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2107
2108   EVT SetCCVT =
2109       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2110
2111   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2112
2113   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2114   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2115
2116   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2117   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2118
2119   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2120 }
2121
2122 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2123   SDLoc SL(Op);
2124   SDValue Src = Op.getOperand(0);
2125
2126   assert(Op.getValueType() == MVT::f64);
2127
2128   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2129   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2130   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2131
2132   // TODO: Should this propagate fast-math-flags?
2133
2134   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2135   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2136
2137   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2138
2139   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2140   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2141
2142   EVT SetCCVT =
2143       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2144   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2145
2146   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2147 }
2148
2149 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2150   // FNEARBYINT and FRINT are the same, except in their handling of FP
2151   // exceptions. Those aren't really meaningful for us, and OpenCL only has
2152   // rint, so just treat them as equivalent.
2153   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2154 }
2155
2156 // XXX - May require not supporting f32 denormals?
2157
2158 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2159 // compare and vselect end up producing worse code than scalarizing the whole
2160 // operation.
2161 SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
2162   SDLoc SL(Op);
2163   SDValue X = Op.getOperand(0);
2164   EVT VT = Op.getValueType();
2165
2166   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2167
2168   // TODO: Should this propagate fast-math-flags?
2169
2170   SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2171
2172   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2173
2174   const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2175   const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2176   const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2177
2178   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2179
2180   EVT SetCCVT =
2181       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2182
2183   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2184
2185   SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2186
2187   return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2188 }
2189
2190 SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
2191   SDLoc SL(Op);
2192   SDValue X = Op.getOperand(0);
2193
2194   SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
2195
2196   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2197   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2198   const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
2199   const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
2200   EVT SetCCVT =
2201       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2202
2203   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
2204
2205   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
2206
2207   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2208
2209   const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
2210                                        MVT::i64);
2211
2212   SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
2213   SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
2214                           DAG.getConstant(INT64_C(0x0008000000000000), SL,
2215                                           MVT::i64),
2216                           Exp);
2217
2218   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
2219   SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
2220                               DAG.getConstant(0, SL, MVT::i64), Tmp0,
2221                               ISD::SETNE);
2222
2223   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
2224                              D, DAG.getConstant(0, SL, MVT::i64));
2225   SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
2226
2227   K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
2228   K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
2229
2230   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2231   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2232   SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
2233
2234   SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
2235                             ExpEqNegOne,
2236                             DAG.getConstantFP(1.0, SL, MVT::f64),
2237                             DAG.getConstantFP(0.0, SL, MVT::f64));
2238
2239   SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
2240
2241   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
2242   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
2243
2244   return K;
2245 }
2246
2247 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2248   EVT VT = Op.getValueType();
2249
2250   if (VT == MVT::f32 || VT == MVT::f16)
2251     return LowerFROUND32_16(Op, DAG);
2252
2253   if (VT == MVT::f64)
2254     return LowerFROUND64(Op, DAG);
2255
2256   llvm_unreachable("unhandled type");
2257 }
2258
2259 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2260   SDLoc SL(Op);
2261   SDValue Src = Op.getOperand(0);
2262
2263   // result = trunc(src);
2264   // if (src < 0.0 && src != result)
2265   //   result += -1.0.
2266
2267   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2268
2269   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2270   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2271
2272   EVT SetCCVT =
2273       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2274
2275   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2276   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2277   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2278
2279   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2280   // TODO: Should this propagate fast-math-flags?
2281   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2282 }
2283
2284 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2285                                         double Log2BaseInverted) const {
2286   EVT VT = Op.getValueType();
2287
2288   SDLoc SL(Op);
2289   SDValue Operand = Op.getOperand(0);
2290   SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2291   SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2292
2293   return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2294 }
2295
2296 // exp2(M_LOG2E_F * f);
2297 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2298   EVT VT = Op.getValueType();
2299   SDLoc SL(Op);
2300   SDValue Src = Op.getOperand(0);
2301
2302   const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2303   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2304   return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2305 }
2306
2307 static bool isCtlzOpc(unsigned Opc) {
2308   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2309 }
2310
2311 static bool isCttzOpc(unsigned Opc) {
2312   return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2313 }
2314
2315 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2316   SDLoc SL(Op);
2317   SDValue Src = Op.getOperand(0);
2318   bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
2319                    Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
2320
2321   unsigned ISDOpc, NewOpc;
2322   if (isCtlzOpc(Op.getOpcode())) {
2323     ISDOpc = ISD::CTLZ_ZERO_UNDEF;
2324     NewOpc = AMDGPUISD::FFBH_U32;
2325   } else if (isCttzOpc(Op.getOpcode())) {
2326     ISDOpc = ISD::CTTZ_ZERO_UNDEF;
2327     NewOpc = AMDGPUISD::FFBL_B32;
2328   } else
2329     llvm_unreachable("Unexpected OPCode!!!");
2330
2331
2332   if (ZeroUndef && Src.getValueType() == MVT::i32)
2333     return DAG.getNode(NewOpc, SL, MVT::i32, Src);
2334
2335   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2336
2337   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2338   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2339
2340   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
2341   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
2342
2343   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2344                                    *DAG.getContext(), MVT::i32);
2345
2346   SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
2347   SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
2348
2349   SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
2350   SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
2351
2352   const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
2353   SDValue Add, NewOpr;
2354   if (isCtlzOpc(Op.getOpcode())) {
2355     Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
2356     // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
2357     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
2358   } else {
2359     Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
2360     // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
2361     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
2362   }
2363
2364   if (!ZeroUndef) {
2365     // Test if the full 64-bit input is zero.
2366
2367     // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
2368     // which we probably don't want.
2369     SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
2370     SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
2371     SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
2372
2373     // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
2374     // with the same cycles, otherwise it is slower.
2375     // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
2376     // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
2377
2378     const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
2379
2380     // The instruction returns -1 for 0 input, but the defined intrinsic
2381     // behavior is to return the number of bits.
2382     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
2383                          SrcIsZero, Bits32, NewOpr);
2384   }
2385
2386   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2387 }
2388
2389 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2390                                                bool Signed) const {
2391   // Unsigned
2392   // cul2f(ulong u)
2393   //{
2394   //  uint lz = clz(u);
2395   //  uint e = (u != 0) ? 127U + 63U - lz : 0;
2396   //  u = (u << lz) & 0x7fffffffffffffffUL;
2397   //  ulong t = u & 0xffffffffffUL;
2398   //  uint v = (e << 23) | (uint)(u >> 40);
2399   //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
2400   //  return as_float(v + r);
2401   //}
2402   // Signed
2403   // cl2f(long l)
2404   //{
2405   //  long s = l >> 63;
2406   //  float r = cul2f((l + s) ^ s);
2407   //  return s ? -r : r;
2408   //}
2409
2410   SDLoc SL(Op);
2411   SDValue Src = Op.getOperand(0);
2412   SDValue L = Src;
2413
2414   SDValue S;
2415   if (Signed) {
2416     const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
2417     S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
2418
2419     SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
2420     L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
2421   }
2422
2423   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2424                                    *DAG.getContext(), MVT::f32);
2425
2426
2427   SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
2428   SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
2429   SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
2430   LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
2431
2432   SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
2433   SDValue E = DAG.getSelect(SL, MVT::i32,
2434     DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
2435     DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
2436     ZeroI32);
2437
2438   SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
2439     DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
2440     DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
2441
2442   SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
2443                           DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
2444
2445   SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
2446                              U, DAG.getConstant(40, SL, MVT::i64));
2447
2448   SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
2449     DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
2450     DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
2451
2452   SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
2453   SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
2454   SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
2455
2456   SDValue One = DAG.getConstant(1, SL, MVT::i32);
2457
2458   SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
2459
2460   SDValue R = DAG.getSelect(SL, MVT::i32,
2461     RCmp,
2462     One,
2463     DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
2464   R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
2465   R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
2466
2467   if (!Signed)
2468     return R;
2469
2470   SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
2471   return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
2472 }
2473
2474 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2475                                                bool Signed) const {
2476   SDLoc SL(Op);
2477   SDValue Src = Op.getOperand(0);
2478
2479   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2480
2481   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2482                            DAG.getConstant(0, SL, MVT::i32));
2483   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2484                            DAG.getConstant(1, SL, MVT::i32));
2485
2486   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2487                               SL, MVT::f64, Hi);
2488
2489   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2490
2491   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2492                               DAG.getConstant(32, SL, MVT::i32));
2493   // TODO: Should this propagate fast-math-flags?
2494   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2495 }
2496
2497 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2498                                                SelectionDAG &DAG) const {
2499   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2500          "operation should be legal");
2501
2502   // TODO: Factor out code common with LowerSINT_TO_FP.
2503
2504   EVT DestVT = Op.getValueType();
2505   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2506     SDLoc DL(Op);
2507     SDValue Src = Op.getOperand(0);
2508
2509     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2510     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2511     SDValue FPRound =
2512         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2513
2514     return FPRound;
2515   }
2516
2517   if (DestVT == MVT::f32)
2518     return LowerINT_TO_FP32(Op, DAG, false);
2519
2520   assert(DestVT == MVT::f64);
2521   return LowerINT_TO_FP64(Op, DAG, false);
2522 }
2523
2524 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2525                                               SelectionDAG &DAG) const {
2526   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2527          "operation should be legal");
2528
2529   // TODO: Factor out code common with LowerUINT_TO_FP.
2530
2531   EVT DestVT = Op.getValueType();
2532   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2533     SDLoc DL(Op);
2534     SDValue Src = Op.getOperand(0);
2535
2536     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2537     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2538     SDValue FPRound =
2539         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2540
2541     return FPRound;
2542   }
2543
2544   if (DestVT == MVT::f32)
2545     return LowerINT_TO_FP32(Op, DAG, true);
2546
2547   assert(DestVT == MVT::f64);
2548   return LowerINT_TO_FP64(Op, DAG, true);
2549 }
2550
2551 SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2552                                                bool Signed) const {
2553   SDLoc SL(Op);
2554
2555   SDValue Src = Op.getOperand(0);
2556
2557   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2558
2559   SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2560                                  MVT::f64);
2561   SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2562                                  MVT::f64);
2563   // TODO: Should this propagate fast-math-flags?
2564   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2565
2566   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2567
2568
2569   SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2570
2571   SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2572                            MVT::i32, FloorMul);
2573   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2574
2575   SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2576
2577   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2578 }
2579
2580 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2581   SDLoc DL(Op);
2582   SDValue N0 = Op.getOperand(0);
2583
2584   // Convert to target node to get known bits
2585   if (N0.getValueType() == MVT::f32)
2586     return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2587
2588   if (getTargetMachine().Options.UnsafeFPMath) {
2589     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2590     return SDValue();
2591   }
2592
2593   assert(N0.getSimpleValueType() == MVT::f64);
2594
2595   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2596   const unsigned ExpMask = 0x7ff;
2597   const unsigned ExpBiasf64 = 1023;
2598   const unsigned ExpBiasf16 = 15;
2599   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2600   SDValue One = DAG.getConstant(1, DL, MVT::i32);
2601   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2602   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2603                            DAG.getConstant(32, DL, MVT::i64));
2604   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2605   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2606   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2607                           DAG.getConstant(20, DL, MVT::i64));
2608   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2609                   DAG.getConstant(ExpMask, DL, MVT::i32));
2610   // Subtract the fp64 exponent bias (1023) to get the real exponent and
2611   // add the f16 bias (15) to get the biased exponent for the f16 format.
2612   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2613                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2614
2615   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2616                           DAG.getConstant(8, DL, MVT::i32));
2617   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2618                   DAG.getConstant(0xffe, DL, MVT::i32));
2619
2620   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2621                                   DAG.getConstant(0x1ff, DL, MVT::i32));
2622   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2623
2624   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2625   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2626
2627   // (M != 0 ? 0x0200 : 0) | 0x7c00;
2628   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2629       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2630                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2631
2632   // N = M | (E << 12);
2633   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2634       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2635                   DAG.getConstant(12, DL, MVT::i32)));
2636
2637   // B = clamp(1-E, 0, 13);
2638   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2639                                   One, E);
2640   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2641   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2642                   DAG.getConstant(13, DL, MVT::i32));
2643
2644   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2645                                    DAG.getConstant(0x1000, DL, MVT::i32));
2646
2647   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2648   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2649   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2650   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2651
2652   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2653   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2654                               DAG.getConstant(0x7, DL, MVT::i32));
2655   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2656                   DAG.getConstant(2, DL, MVT::i32));
2657   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2658                                One, Zero, ISD::SETEQ);
2659   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2660                                One, Zero, ISD::SETGT);
2661   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2662   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2663
2664   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2665                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2666   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2667                       I, V, ISD::SETEQ);
2668
2669   // Extract the sign bit.
2670   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2671                             DAG.getConstant(16, DL, MVT::i32));
2672   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2673                      DAG.getConstant(0x8000, DL, MVT::i32));
2674
2675   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2676   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2677 }
2678
2679 SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2680                                               SelectionDAG &DAG) const {
2681   SDValue Src = Op.getOperand(0);
2682
2683   // TODO: Factor out code common with LowerFP_TO_UINT.
2684
2685   EVT SrcVT = Src.getValueType();
2686   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2687     SDLoc DL(Op);
2688
2689     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2690     SDValue FpToInt32 =
2691         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2692
2693     return FpToInt32;
2694   }
2695
2696   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2697     return LowerFP64_TO_INT(Op, DAG, true);
2698
2699   return SDValue();
2700 }
2701
2702 SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2703                                               SelectionDAG &DAG) const {
2704   SDValue Src = Op.getOperand(0);
2705
2706   // TODO: Factor out code common with LowerFP_TO_SINT.
2707
2708   EVT SrcVT = Src.getValueType();
2709   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2710     SDLoc DL(Op);
2711
2712     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2713     SDValue FpToInt32 =
2714         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2715
2716     return FpToInt32;
2717   }
2718
2719   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2720     return LowerFP64_TO_INT(Op, DAG, false);
2721
2722   return SDValue();
2723 }
2724
2725 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2726                                                      SelectionDAG &DAG) const {
2727   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2728   MVT VT = Op.getSimpleValueType();
2729   MVT ScalarVT = VT.getScalarType();
2730
2731   assert(VT.isVector());
2732
2733   SDValue Src = Op.getOperand(0);
2734   SDLoc DL(Op);
2735
2736   // TODO: Don't scalarize on Evergreen?
2737   unsigned NElts = VT.getVectorNumElements();
2738   SmallVector<SDValue, 8> Args;
2739   DAG.ExtractVectorElements(Src, Args, 0, NElts);
2740
2741   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2742   for (unsigned I = 0; I < NElts; ++I)
2743     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2744
2745   return DAG.getBuildVector(VT, DL, Args);
2746 }
2747
2748 //===----------------------------------------------------------------------===//
2749 // Custom DAG optimizations
2750 //===----------------------------------------------------------------------===//
2751
2752 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2753   return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2754 }
2755
2756 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2757   EVT VT = Op.getValueType();
2758   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2759                                      // as unsigned 24-bit values.
2760     AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
2761 }
2762
2763 static SDValue simplifyI24(SDNode *Node24,
2764                            TargetLowering::DAGCombinerInfo &DCI) {
2765   SelectionDAG &DAG = DCI.DAG;
2766   bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2767
2768   SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2769   SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2770   unsigned NewOpcode = Node24->getOpcode();
2771   if (IsIntrin) {
2772     unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2773     NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
2774       AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2775   }
2776
2777   APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2778
2779   // First try to simplify using GetDemandedBits which allows the operands to
2780   // have other uses, but will only perform simplifications that involve
2781   // bypassing some nodes for this user.
2782   SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
2783   SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
2784   if (DemandedLHS || DemandedRHS)
2785     return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2786                        DemandedLHS ? DemandedLHS : LHS,
2787                        DemandedRHS ? DemandedRHS : RHS);
2788
2789   // Now try SimplifyDemandedBits which can simplify the nodes used by our
2790   // operands if this node is the only user.
2791   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2792   if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2793     return SDValue(Node24, 0);
2794   if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2795     return SDValue(Node24, 0);
2796
2797   return SDValue();
2798 }
2799
2800 template <typename IntTy>
2801 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2802                                uint32_t Width, const SDLoc &DL) {
2803   if (Width + Offset < 32) {
2804     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2805     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2806     return DAG.getConstant(Result, DL, MVT::i32);
2807   }
2808
2809   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2810 }
2811
2812 static bool hasVolatileUser(SDNode *Val) {
2813   for (SDNode *U : Val->uses()) {
2814     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2815       if (M->isVolatile())
2816         return true;
2817     }
2818   }
2819
2820   return false;
2821 }
2822
2823 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2824   // i32 vectors are the canonical memory type.
2825   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2826     return false;
2827
2828   if (!VT.isByteSized())
2829     return false;
2830
2831   unsigned Size = VT.getStoreSize();
2832
2833   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2834     return false;
2835
2836   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2837     return false;
2838
2839   return true;
2840 }
2841
2842 // Replace load of an illegal type with a store of a bitcast to a friendlier
2843 // type.
2844 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2845                                                  DAGCombinerInfo &DCI) const {
2846   if (!DCI.isBeforeLegalize())
2847     return SDValue();
2848
2849   LoadSDNode *LN = cast<LoadSDNode>(N);
2850   if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2851     return SDValue();
2852
2853   SDLoc SL(N);
2854   SelectionDAG &DAG = DCI.DAG;
2855   EVT VT = LN->getMemoryVT();
2856
2857   unsigned Size = VT.getStoreSize();
2858   unsigned Align = LN->getAlignment();
2859   if (Align < Size && isTypeLegal(VT)) {
2860     bool IsFast;
2861     unsigned AS = LN->getAddressSpace();
2862
2863     // Expand unaligned loads earlier than legalization. Due to visitation order
2864     // problems during legalization, the emitted instructions to pack and unpack
2865     // the bytes again are not eliminated in the case of an unaligned copy.
2866     if (!allowsMisalignedMemoryAccesses(
2867             VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) {
2868       if (VT.isVector())
2869         return scalarizeVectorLoad(LN, DAG);
2870
2871       SDValue Ops[2];
2872       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2873       return DAG.getMergeValues(Ops, SDLoc(N));
2874     }
2875
2876     if (!IsFast)
2877       return SDValue();
2878   }
2879
2880   if (!shouldCombineMemoryType(VT))
2881     return SDValue();
2882
2883   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2884
2885   SDValue NewLoad
2886     = DAG.getLoad(NewVT, SL, LN->getChain(),
2887                   LN->getBasePtr(), LN->getMemOperand());
2888
2889   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2890   DCI.CombineTo(N, BC, NewLoad.getValue(1));
2891   return SDValue(N, 0);
2892 }
2893
2894 // Replace store of an illegal type with a store of a bitcast to a friendlier
2895 // type.
2896 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2897                                                   DAGCombinerInfo &DCI) const {
2898   if (!DCI.isBeforeLegalize())
2899     return SDValue();
2900
2901   StoreSDNode *SN = cast<StoreSDNode>(N);
2902   if (SN->isVolatile() || !ISD::isNormalStore(SN))
2903     return SDValue();
2904
2905   EVT VT = SN->getMemoryVT();
2906   unsigned Size = VT.getStoreSize();
2907
2908   SDLoc SL(N);
2909   SelectionDAG &DAG = DCI.DAG;
2910   unsigned Align = SN->getAlignment();
2911   if (Align < Size && isTypeLegal(VT)) {
2912     bool IsFast;
2913     unsigned AS = SN->getAddressSpace();
2914
2915     // Expand unaligned stores earlier than legalization. Due to visitation
2916     // order problems during legalization, the emitted instructions to pack and
2917     // unpack the bytes again are not eliminated in the case of an unaligned
2918     // copy.
2919     if (!allowsMisalignedMemoryAccesses(
2920             VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) {
2921       if (VT.isVector())
2922         return scalarizeVectorStore(SN, DAG);
2923
2924       return expandUnalignedStore(SN, DAG);
2925     }
2926
2927     if (!IsFast)
2928       return SDValue();
2929   }
2930
2931   if (!shouldCombineMemoryType(VT))
2932     return SDValue();
2933
2934   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2935   SDValue Val = SN->getValue();
2936
2937   //DCI.AddToWorklist(Val.getNode());
2938
2939   bool OtherUses = !Val.hasOneUse();
2940   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2941   if (OtherUses) {
2942     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2943     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2944   }
2945
2946   return DAG.getStore(SN->getChain(), SL, CastVal,
2947                       SN->getBasePtr(), SN->getMemOperand());
2948 }
2949
2950 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
2951 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
2952 // issues.
2953 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
2954                                                         DAGCombinerInfo &DCI) const {
2955   SelectionDAG &DAG = DCI.DAG;
2956   SDValue N0 = N->getOperand(0);
2957
2958   // (vt2 (assertzext (truncate vt0:x), vt1)) ->
2959   //     (vt2 (truncate (assertzext vt0:x, vt1)))
2960   if (N0.getOpcode() == ISD::TRUNCATE) {
2961     SDValue N1 = N->getOperand(1);
2962     EVT ExtVT = cast<VTSDNode>(N1)->getVT();
2963     SDLoc SL(N);
2964
2965     SDValue Src = N0.getOperand(0);
2966     EVT SrcVT = Src.getValueType();
2967     if (SrcVT.bitsGE(ExtVT)) {
2968       SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
2969       return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
2970     }
2971   }
2972
2973   return SDValue();
2974 }
2975
2976 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
2977   SDNode *N, DAGCombinerInfo &DCI) const {
2978   unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2979   switch (IID) {
2980   case Intrinsic::amdgcn_mul_i24:
2981   case Intrinsic::amdgcn_mul_u24:
2982     return simplifyI24(N, DCI);
2983   default:
2984     return SDValue();
2985   }
2986 }
2987
2988 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
2989 /// binary operation \p Opc to it with the corresponding constant operands.
2990 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
2991   DAGCombinerInfo &DCI, const SDLoc &SL,
2992   unsigned Opc, SDValue LHS,
2993   uint32_t ValLo, uint32_t ValHi) const {
2994   SelectionDAG &DAG = DCI.DAG;
2995   SDValue Lo, Hi;
2996   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
2997
2998   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
2999   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3000
3001   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3002   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3003
3004   // Re-visit the ands. It's possible we eliminated one of them and it could
3005   // simplify the vector.
3006   DCI.AddToWorklist(Lo.getNode());
3007   DCI.AddToWorklist(Hi.getNode());
3008
3009   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3010   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3011 }
3012
3013 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3014                                                 DAGCombinerInfo &DCI) const {
3015   EVT VT = N->getValueType(0);
3016
3017   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3018   if (!RHS)
3019     return SDValue();
3020
3021   SDValue LHS = N->getOperand(0);
3022   unsigned RHSVal = RHS->getZExtValue();
3023   if (!RHSVal)
3024     return LHS;
3025
3026   SDLoc SL(N);
3027   SelectionDAG &DAG = DCI.DAG;
3028
3029   switch (LHS->getOpcode()) {
3030   default:
3031     break;
3032   case ISD::ZERO_EXTEND:
3033   case ISD::SIGN_EXTEND:
3034   case ISD::ANY_EXTEND: {
3035     SDValue X = LHS->getOperand(0);
3036
3037     if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3038         isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3039       // Prefer build_vector as the canonical form if packed types are legal.
3040       // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
3041       SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3042        { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3043       return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3044     }
3045
3046     // shl (ext x) => zext (shl x), if shift does not overflow int
3047     if (VT != MVT::i64)
3048       break;
3049     KnownBits Known = DAG.computeKnownBits(X);
3050     unsigned LZ = Known.countMinLeadingZeros();
3051     if (LZ < RHSVal)
3052       break;
3053     EVT XVT = X.getValueType();
3054     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3055     return DAG.getZExtOrTrunc(Shl, SL, VT);
3056   }
3057   }
3058
3059   if (VT != MVT::i64)
3060     return SDValue();
3061
3062   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3063
3064   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3065   // common case, splitting this into a move and a 32-bit shift is faster and
3066   // the same code size.
3067   if (RHSVal < 32)
3068     return SDValue();
3069
3070   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3071
3072   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3073   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3074
3075   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3076
3077   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3078   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3079 }
3080
3081 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3082                                                 DAGCombinerInfo &DCI) const {
3083   if (N->getValueType(0) != MVT::i64)
3084     return SDValue();
3085
3086   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3087   if (!RHS)
3088     return SDValue();
3089
3090   SelectionDAG &DAG = DCI.DAG;
3091   SDLoc SL(N);
3092   unsigned RHSVal = RHS->getZExtValue();
3093
3094   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3095   if (RHSVal == 32) {
3096     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3097     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3098                                    DAG.getConstant(31, SL, MVT::i32));
3099
3100     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3101     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3102   }
3103
3104   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3105   if (RHSVal == 63) {
3106     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3107     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3108                                    DAG.getConstant(31, SL, MVT::i32));
3109     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3110     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3111   }
3112
3113   return SDValue();
3114 }
3115
3116 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3117                                                 DAGCombinerInfo &DCI) const {
3118   auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3119   if (!RHS)
3120     return SDValue();
3121
3122   EVT VT = N->getValueType(0);
3123   SDValue LHS = N->getOperand(0);
3124   unsigned ShiftAmt = RHS->getZExtValue();
3125   SelectionDAG &DAG = DCI.DAG;
3126   SDLoc SL(N);
3127
3128   // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
3129   // this improves the ability to match BFE patterns in isel.
3130   if (LHS.getOpcode() == ISD::AND) {
3131     if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3132       if (Mask->getAPIntValue().isShiftedMask() &&
3133           Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3134         return DAG.getNode(
3135             ISD::AND, SL, VT,
3136             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3137             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3138       }
3139     }
3140   }
3141
3142   if (VT != MVT::i64)
3143     return SDValue();
3144
3145   if (ShiftAmt < 32)
3146     return SDValue();
3147
3148   // srl i64:x, C for C >= 32
3149   // =>
3150   //   build_pair (srl hi_32(x), C - 32), 0
3151   SDValue One = DAG.getConstant(1, SL, MVT::i32);
3152   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3153
3154   SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
3155   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
3156
3157   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3158   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3159
3160   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3161
3162   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3163 }
3164
3165 SDValue AMDGPUTargetLowering::performTruncateCombine(
3166   SDNode *N, DAGCombinerInfo &DCI) const {
3167   SDLoc SL(N);
3168   SelectionDAG &DAG = DCI.DAG;
3169   EVT VT = N->getValueType(0);
3170   SDValue Src = N->getOperand(0);
3171
3172   // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3173   if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3174     SDValue Vec = Src.getOperand(0);
3175     if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3176       SDValue Elt0 = Vec.getOperand(0);
3177       EVT EltVT = Elt0.getValueType();
3178       if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
3179         if (EltVT.isFloatingPoint()) {
3180           Elt0 = DAG.getNode(ISD::BITCAST, SL,
3181                              EltVT.changeTypeToInteger(), Elt0);
3182         }
3183
3184         return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3185       }
3186     }
3187   }
3188
3189   // Equivalent of above for accessing the high element of a vector as an
3190   // integer operation.
3191   // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3192   if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3193     if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3194       if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3195         SDValue BV = stripBitcast(Src.getOperand(0));
3196         if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3197             BV.getValueType().getVectorNumElements() == 2) {
3198           SDValue SrcElt = BV.getOperand(1);
3199           EVT SrcEltVT = SrcElt.getValueType();
3200           if (SrcEltVT.isFloatingPoint()) {
3201             SrcElt = DAG.getNode(ISD::BITCAST, SL,
3202                                  SrcEltVT.changeTypeToInteger(), SrcElt);
3203           }
3204
3205           return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3206         }
3207       }
3208     }
3209   }
3210
3211   // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3212   //
3213   // i16 (trunc (srl i64:x, K)), K <= 16 ->
3214   //     i16 (trunc (srl (i32 (trunc x), K)))
3215   if (VT.getScalarSizeInBits() < 32) {
3216     EVT SrcVT = Src.getValueType();
3217     if (SrcVT.getScalarSizeInBits() > 32 &&
3218         (Src.getOpcode() == ISD::SRL ||
3219          Src.getOpcode() == ISD::SRA ||
3220          Src.getOpcode() == ISD::SHL)) {
3221       SDValue Amt = Src.getOperand(1);
3222       KnownBits Known = DAG.computeKnownBits(Amt);
3223       unsigned Size = VT.getScalarSizeInBits();
3224       if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3225           (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
3226         EVT MidVT = VT.isVector() ?
3227           EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3228                            VT.getVectorNumElements()) : MVT::i32;
3229
3230         EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3231         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3232                                     Src.getOperand(0));
3233         DCI.AddToWorklist(Trunc.getNode());
3234
3235         if (Amt.getValueType() != NewShiftVT) {
3236           Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3237           DCI.AddToWorklist(Amt.getNode());
3238         }
3239
3240         SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3241                                           Trunc, Amt);
3242         return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3243       }
3244     }
3245   }
3246
3247   return SDValue();
3248 }
3249
3250 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3251 // instructions. If we only match on the legalized i64 mul expansion,
3252 // SimplifyDemandedBits will be unable to remove them because there will be
3253 // multiple uses due to the separate mul + mulh[su].
3254 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3255                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3256   if (Size <= 32) {
3257     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3258     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3259   }
3260
3261   // Because we want to eliminate extension instructions before the
3262   // operation, we need to create a single user here (i.e. not the separate
3263   // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
3264
3265   unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
3266
3267   SDValue Mul = DAG.getNode(MulOpc, SL,
3268                             DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
3269
3270   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
3271                      Mul.getValue(0), Mul.getValue(1));
3272 }
3273
3274 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3275                                                 DAGCombinerInfo &DCI) const {
3276   EVT VT = N->getValueType(0);
3277
3278   unsigned Size = VT.getSizeInBits();
3279   if (VT.isVector() || Size > 64)
3280     return SDValue();
3281
3282   // There are i16 integer mul/mad.
3283   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3284     return SDValue();
3285
3286   SelectionDAG &DAG = DCI.DAG;
3287   SDLoc DL(N);
3288
3289   SDValue N0 = N->getOperand(0);
3290   SDValue N1 = N->getOperand(1);
3291
3292   // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3293   // in the source into any_extends if the result of the mul is truncated. Since
3294   // we can assume the high bits are whatever we want, use the underlying value
3295   // to avoid the unknown high bits from interfering.
3296   if (N0.getOpcode() == ISD::ANY_EXTEND)
3297     N0 = N0.getOperand(0);
3298
3299   if (N1.getOpcode() == ISD::ANY_EXTEND)
3300     N1 = N1.getOperand(0);
3301
3302   SDValue Mul;
3303
3304   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3305     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3306     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3307     Mul = getMul24(DAG, DL, N0, N1, Size, false);
3308   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3309     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3310     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3311     Mul = getMul24(DAG, DL, N0, N1, Size, true);
3312   } else {
3313     return SDValue();
3314   }
3315
3316   // We need to use sext even for MUL_U24, because MUL_U24 is used
3317   // for signed multiply of 8 and 16-bit types.
3318   return DAG.getSExtOrTrunc(Mul, DL, VT);
3319 }
3320
3321 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3322                                                   DAGCombinerInfo &DCI) const {
3323   EVT VT = N->getValueType(0);
3324
3325   if (!Subtarget->hasMulI24() || VT.isVector())
3326     return SDValue();
3327
3328   SelectionDAG &DAG = DCI.DAG;
3329   SDLoc DL(N);
3330
3331   SDValue N0 = N->getOperand(0);
3332   SDValue N1 = N->getOperand(1);
3333
3334   if (!isI24(N0, DAG) || !isI24(N1, DAG))
3335     return SDValue();
3336
3337   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3338   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3339
3340   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3341   DCI.AddToWorklist(Mulhi.getNode());
3342   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3343 }
3344
3345 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3346                                                   DAGCombinerInfo &DCI) const {
3347   EVT VT = N->getValueType(0);
3348
3349   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3350     return SDValue();
3351
3352   SelectionDAG &DAG = DCI.DAG;
3353   SDLoc DL(N);
3354
3355   SDValue N0 = N->getOperand(0);
3356   SDValue N1 = N->getOperand(1);
3357
3358   if (!isU24(N0, DAG) || !isU24(N1, DAG))
3359     return SDValue();
3360
3361   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3362   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3363
3364   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3365   DCI.AddToWorklist(Mulhi.getNode());
3366   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3367 }
3368
3369 SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
3370   SDNode *N, DAGCombinerInfo &DCI) const {
3371   SelectionDAG &DAG = DCI.DAG;
3372
3373   // Simplify demanded bits before splitting into multiple users.
3374   if (SDValue V = simplifyI24(N, DCI))
3375     return V;
3376
3377   SDValue N0 = N->getOperand(0);
3378   SDValue N1 = N->getOperand(1);
3379
3380   bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
3381
3382   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3383   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3384
3385   SDLoc SL(N);
3386
3387   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3388   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3389   return DAG.getMergeValues({ MulLo, MulHi }, SL);
3390 }
3391
3392 static bool isNegativeOne(SDValue Val) {
3393   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3394     return C->isAllOnesValue();
3395   return false;
3396 }
3397
3398 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3399                                           SDValue Op,
3400                                           const SDLoc &DL,
3401                                           unsigned Opc) const {
3402   EVT VT = Op.getValueType();
3403   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3404   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3405                               LegalVT != MVT::i16))
3406     return SDValue();
3407
3408   if (VT != MVT::i32)
3409     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3410
3411   SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3412   if (VT != MVT::i32)
3413     FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3414
3415   return FFBX;
3416 }
3417
3418 // The native instructions return -1 on 0 input. Optimize out a select that
3419 // produces -1 on 0.
3420 //
3421 // TODO: If zero is not undef, we could also do this if the output is compared
3422 // against the bitwidth.
3423 //
3424 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3425 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3426                                                  SDValue LHS, SDValue RHS,
3427                                                  DAGCombinerInfo &DCI) const {
3428   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3429   if (!CmpRhs || !CmpRhs->isNullValue())
3430     return SDValue();
3431
3432   SelectionDAG &DAG = DCI.DAG;
3433   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3434   SDValue CmpLHS = Cond.getOperand(0);
3435
3436   unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
3437                                            AMDGPUISD::FFBH_U32;
3438
3439   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3440   // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3441   if (CCOpcode == ISD::SETEQ &&
3442       (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3443       RHS.getOperand(0) == CmpLHS &&
3444       isNegativeOne(LHS)) {
3445     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3446   }
3447
3448   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3449   // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3450   if (CCOpcode == ISD::SETNE &&
3451       (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3452       LHS.getOperand(0) == CmpLHS &&
3453       isNegativeOne(RHS)) {
3454     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3455   }
3456
3457   return SDValue();
3458 }
3459
3460 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3461                                          unsigned Op,
3462                                          const SDLoc &SL,
3463                                          SDValue Cond,
3464                                          SDValue N1,
3465                                          SDValue N2) {
3466   SelectionDAG &DAG = DCI.DAG;
3467   EVT VT = N1.getValueType();
3468
3469   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3470                                   N1.getOperand(0), N2.getOperand(0));
3471   DCI.AddToWorklist(NewSelect.getNode());
3472   return DAG.getNode(Op, SL, VT, NewSelect);
3473 }
3474
3475 // Pull a free FP operation out of a select so it may fold into uses.
3476 //
3477 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3478 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3479 //
3480 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3481 // select c, (fabs x), +k -> fabs (select c, x, k)
3482 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3483                                     SDValue N) {
3484   SelectionDAG &DAG = DCI.DAG;
3485   SDValue Cond = N.getOperand(0);
3486   SDValue LHS = N.getOperand(1);
3487   SDValue RHS = N.getOperand(2);
3488
3489   EVT VT = N.getValueType();
3490   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3491       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3492     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3493                                      SDLoc(N), Cond, LHS, RHS);
3494   }
3495
3496   bool Inv = false;
3497   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3498     std::swap(LHS, RHS);
3499     Inv = true;
3500   }
3501
3502   // TODO: Support vector constants.
3503   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3504   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3505     SDLoc SL(N);
3506     // If one side is an fneg/fabs and the other is a constant, we can push the
3507     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3508     SDValue NewLHS = LHS.getOperand(0);
3509     SDValue NewRHS = RHS;
3510
3511     // Careful: if the neg can be folded up, don't try to pull it back down.
3512     bool ShouldFoldNeg = true;
3513
3514     if (NewLHS.hasOneUse()) {
3515       unsigned Opc = NewLHS.getOpcode();
3516       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3517         ShouldFoldNeg = false;
3518       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3519         ShouldFoldNeg = false;
3520     }
3521
3522     if (ShouldFoldNeg) {
3523       if (LHS.getOpcode() == ISD::FNEG)
3524         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3525       else if (CRHS->isNegative())
3526         return SDValue();
3527
3528       if (Inv)
3529         std::swap(NewLHS, NewRHS);
3530
3531       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3532                                       Cond, NewLHS, NewRHS);
3533       DCI.AddToWorklist(NewSelect.getNode());
3534       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3535     }
3536   }
3537
3538   return SDValue();
3539 }
3540
3541
3542 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3543                                                    DAGCombinerInfo &DCI) const {
3544   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3545     return Folded;
3546
3547   SDValue Cond = N->getOperand(0);
3548   if (Cond.getOpcode() != ISD::SETCC)
3549     return SDValue();
3550
3551   EVT VT = N->getValueType(0);
3552   SDValue LHS = Cond.getOperand(0);
3553   SDValue RHS = Cond.getOperand(1);
3554   SDValue CC = Cond.getOperand(2);
3555
3556   SDValue True = N->getOperand(1);
3557   SDValue False = N->getOperand(2);
3558
3559   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3560     SelectionDAG &DAG = DCI.DAG;
3561     if (DAG.isConstantValueOfAnyType(True) &&
3562         !DAG.isConstantValueOfAnyType(False)) {
3563       // Swap cmp + select pair to move constant to false input.
3564       // This will allow using VOPC cndmasks more often.
3565       // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3566
3567       SDLoc SL(N);
3568       ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
3569                                             LHS.getValueType().isInteger());
3570
3571       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3572       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3573     }
3574
3575     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3576       SDValue MinMax
3577         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3578       // Revisit this node so we can catch min3/max3/med3 patterns.
3579       //DCI.AddToWorklist(MinMax.getNode());
3580       return MinMax;
3581     }
3582   }
3583
3584   // There's no reason to not do this if the condition has other uses.
3585   return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3586 }
3587
3588 static bool isInv2Pi(const APFloat &APF) {
3589   static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3590   static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3591   static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3592
3593   return APF.bitwiseIsEqual(KF16) ||
3594          APF.bitwiseIsEqual(KF32) ||
3595          APF.bitwiseIsEqual(KF64);
3596 }
3597
3598 // 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an
3599 // additional cost to negate them.
3600 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3601   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3602     if (C->isZero() && !C->isNegative())
3603       return true;
3604
3605     if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3606       return true;
3607   }
3608
3609   return false;
3610 }
3611
3612 static unsigned inverseMinMax(unsigned Opc) {
3613   switch (Opc) {
3614   case ISD::FMAXNUM:
3615     return ISD::FMINNUM;
3616   case ISD::FMINNUM:
3617     return ISD::FMAXNUM;
3618   case ISD::FMAXNUM_IEEE:
3619     return ISD::FMINNUM_IEEE;
3620   case ISD::FMINNUM_IEEE:
3621     return ISD::FMAXNUM_IEEE;
3622   case AMDGPUISD::FMAX_LEGACY:
3623     return AMDGPUISD::FMIN_LEGACY;
3624   case AMDGPUISD::FMIN_LEGACY:
3625     return  AMDGPUISD::FMAX_LEGACY;
3626   default:
3627     llvm_unreachable("invalid min/max opcode");
3628   }
3629 }
3630
3631 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3632                                                  DAGCombinerInfo &DCI) const {
3633   SelectionDAG &DAG = DCI.DAG;
3634   SDValue N0 = N->getOperand(0);
3635   EVT VT = N->getValueType(0);
3636
3637   unsigned Opc = N0.getOpcode();
3638
3639   // If the input has multiple uses and we can either fold the negate down, or
3640   // the other uses cannot, give up. This both prevents unprofitable
3641   // transformations and infinite loops: we won't repeatedly try to fold around
3642   // a negate that has no 'good' form.
3643   if (N0.hasOneUse()) {
3644     // This may be able to fold into the source, but at a code size cost. Don't
3645     // fold if the fold into the user is free.
3646     if (allUsesHaveSourceMods(N, 0))
3647       return SDValue();
3648   } else {
3649     if (fnegFoldsIntoOp(Opc) &&
3650         (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3651       return SDValue();
3652   }
3653
3654   SDLoc SL(N);
3655   switch (Opc) {
3656   case ISD::FADD: {
3657     if (!mayIgnoreSignedZero(N0))
3658       return SDValue();
3659
3660     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3661     SDValue LHS = N0.getOperand(0);
3662     SDValue RHS = N0.getOperand(1);
3663
3664     if (LHS.getOpcode() != ISD::FNEG)
3665       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3666     else
3667       LHS = LHS.getOperand(0);
3668
3669     if (RHS.getOpcode() != ISD::FNEG)
3670       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3671     else
3672       RHS = RHS.getOperand(0);
3673
3674     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3675     if (Res.getOpcode() != ISD::FADD)
3676       return SDValue(); // Op got folded away.
3677     if (!N0.hasOneUse())
3678       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3679     return Res;
3680   }
3681   case ISD::FMUL:
3682   case AMDGPUISD::FMUL_LEGACY: {
3683     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3684     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3685     SDValue LHS = N0.getOperand(0);
3686     SDValue RHS = N0.getOperand(1);
3687
3688     if (LHS.getOpcode() == ISD::FNEG)
3689       LHS = LHS.getOperand(0);
3690     else if (RHS.getOpcode() == ISD::FNEG)
3691       RHS = RHS.getOperand(0);
3692     else
3693       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3694
3695     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3696     if (Res.getOpcode() != Opc)
3697       return SDValue(); // Op got folded away.
3698     if (!N0.hasOneUse())
3699       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3700     return Res;
3701   }
3702   case ISD::FMA:
3703   case ISD::FMAD: {
3704     if (!mayIgnoreSignedZero(N0))
3705       return SDValue();
3706
3707     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3708     SDValue LHS = N0.getOperand(0);
3709     SDValue MHS = N0.getOperand(1);
3710     SDValue RHS = N0.getOperand(2);
3711
3712     if (LHS.getOpcode() == ISD::FNEG)
3713       LHS = LHS.getOperand(0);
3714     else if (MHS.getOpcode() == ISD::FNEG)
3715       MHS = MHS.getOperand(0);
3716     else
3717       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3718
3719     if (RHS.getOpcode() != ISD::FNEG)
3720       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3721     else
3722       RHS = RHS.getOperand(0);
3723
3724     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3725     if (Res.getOpcode() != Opc)
3726       return SDValue(); // Op got folded away.
3727     if (!N0.hasOneUse())
3728       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3729     return Res;
3730   }
3731   case ISD::FMAXNUM:
3732   case ISD::FMINNUM:
3733   case ISD::FMAXNUM_IEEE:
3734   case ISD::FMINNUM_IEEE:
3735   case AMDGPUISD::FMAX_LEGACY:
3736   case AMDGPUISD::FMIN_LEGACY: {
3737     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3738     // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3739     // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3740     // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3741
3742     SDValue LHS = N0.getOperand(0);
3743     SDValue RHS = N0.getOperand(1);
3744
3745     // 0 doesn't have a negated inline immediate.
3746     // TODO: This constant check should be generalized to other operations.
3747     if (isConstantCostlierToNegate(RHS))
3748       return SDValue();
3749
3750     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3751     SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3752     unsigned Opposite = inverseMinMax(Opc);
3753
3754     SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3755     if (Res.getOpcode() != Opposite)
3756       return SDValue(); // Op got folded away.
3757     if (!N0.hasOneUse())
3758       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3759     return Res;
3760   }
3761   case AMDGPUISD::FMED3: {
3762     SDValue Ops[3];
3763     for (unsigned I = 0; I < 3; ++I)
3764       Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3765
3766     SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3767     if (Res.getOpcode() != AMDGPUISD::FMED3)
3768       return SDValue(); // Op got folded away.
3769     if (!N0.hasOneUse())
3770       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3771     return Res;
3772   }
3773   case ISD::FP_EXTEND:
3774   case ISD::FTRUNC:
3775   case ISD::FRINT:
3776   case ISD::FNEARBYINT: // XXX - Should fround be handled?
3777   case ISD::FSIN:
3778   case ISD::FCANONICALIZE:
3779   case AMDGPUISD::RCP:
3780   case AMDGPUISD::RCP_LEGACY:
3781   case AMDGPUISD::RCP_IFLAG:
3782   case AMDGPUISD::SIN_HW: {
3783     SDValue CvtSrc = N0.getOperand(0);
3784     if (CvtSrc.getOpcode() == ISD::FNEG) {
3785       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3786       // (fneg (rcp (fneg x))) -> (rcp x)
3787       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3788     }
3789
3790     if (!N0.hasOneUse())
3791       return SDValue();
3792
3793     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3794     // (fneg (rcp x)) -> (rcp (fneg x))
3795     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3796     return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3797   }
3798   case ISD::FP_ROUND: {
3799     SDValue CvtSrc = N0.getOperand(0);
3800
3801     if (CvtSrc.getOpcode() == ISD::FNEG) {
3802       // (fneg (fp_round (fneg x))) -> (fp_round x)
3803       return DAG.getNode(ISD::FP_ROUND, SL, VT,
3804                          CvtSrc.getOperand(0), N0.getOperand(1));
3805     }
3806
3807     if (!N0.hasOneUse())
3808       return SDValue();
3809
3810     // (fneg (fp_round x)) -> (fp_round (fneg x))
3811     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3812     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3813   }
3814   case ISD::FP16_TO_FP: {
3815     // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3816     // f16, but legalization of f16 fneg ends up pulling it out of the source.
3817     // Put the fneg back as a legal source operation that can be matched later.
3818     SDLoc SL(N);
3819
3820     SDValue Src = N0.getOperand(0);
3821     EVT SrcVT = Src.getValueType();
3822
3823     // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3824     SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3825                                   DAG.getConstant(0x8000, SL, SrcVT));
3826     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3827   }
3828   default:
3829     return SDValue();
3830   }
3831 }
3832
3833 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3834                                                  DAGCombinerInfo &DCI) const {
3835   SelectionDAG &DAG = DCI.DAG;
3836   SDValue N0 = N->getOperand(0);
3837
3838   if (!N0.hasOneUse())
3839     return SDValue();
3840
3841   switch (N0.getOpcode()) {
3842   case ISD::FP16_TO_FP: {
3843     assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3844     SDLoc SL(N);
3845     SDValue Src = N0.getOperand(0);
3846     EVT SrcVT = Src.getValueType();
3847
3848     // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3849     SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3850                                   DAG.getConstant(0x7fff, SL, SrcVT));
3851     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3852   }
3853   default:
3854     return SDValue();
3855   }
3856 }
3857
3858 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
3859                                                 DAGCombinerInfo &DCI) const {
3860   const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
3861   if (!CFP)
3862     return SDValue();
3863
3864   // XXX - Should this flush denormals?
3865   const APFloat &Val = CFP->getValueAPF();
3866   APFloat One(Val.getSemantics(), "1.0");
3867   return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3868 }
3869
3870 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
3871                                                 DAGCombinerInfo &DCI) const {
3872   SelectionDAG &DAG = DCI.DAG;
3873   SDLoc DL(N);
3874
3875   switch(N->getOpcode()) {
3876   default:
3877     break;
3878   case ISD::BITCAST: {
3879     EVT DestVT = N->getValueType(0);
3880
3881     // Push casts through vector builds. This helps avoid emitting a large
3882     // number of copies when materializing floating point vector constants.
3883     //
3884     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
3885     //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3886     if (DestVT.isVector()) {
3887       SDValue Src = N->getOperand(0);
3888       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3889         EVT SrcVT = Src.getValueType();
3890         unsigned NElts = DestVT.getVectorNumElements();
3891
3892         if (SrcVT.getVectorNumElements() == NElts) {
3893           EVT DestEltVT = DestVT.getVectorElementType();
3894
3895           SmallVector<SDValue, 8> CastedElts;
3896           SDLoc SL(N);
3897           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3898             SDValue Elt = Src.getOperand(I);
3899             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3900           }
3901
3902           return DAG.getBuildVector(DestVT, SL, CastedElts);
3903         }
3904       }
3905     }
3906
3907     if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
3908       break;
3909
3910     // Fold bitcasts of constants.
3911     //
3912     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3913     // TODO: Generalize and move to DAGCombiner
3914     SDValue Src = N->getOperand(0);
3915     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3916       if (Src.getValueType() == MVT::i64) {
3917         SDLoc SL(N);
3918         uint64_t CVal = C->getZExtValue();
3919         SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3920                                  DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3921                                  DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3922         return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
3923       }
3924     }
3925
3926     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3927       const APInt &Val = C->getValueAPF().bitcastToAPInt();
3928       SDLoc SL(N);
3929       uint64_t CVal = Val.getZExtValue();
3930       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3931                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3932                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3933
3934       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
3935     }
3936
3937     break;
3938   }
3939   case ISD::SHL: {
3940     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3941       break;
3942
3943     return performShlCombine(N, DCI);
3944   }
3945   case ISD::SRL: {
3946     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3947       break;
3948
3949     return performSrlCombine(N, DCI);
3950   }
3951   case ISD::SRA: {
3952     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3953       break;
3954
3955     return performSraCombine(N, DCI);
3956   }
3957   case ISD::TRUNCATE:
3958     return performTruncateCombine(N, DCI);
3959   case ISD::MUL:
3960     return performMulCombine(N, DCI);
3961   case ISD::MULHS:
3962     return performMulhsCombine(N, DCI);
3963   case ISD::MULHU:
3964     return performMulhuCombine(N, DCI);
3965   case AMDGPUISD::MUL_I24:
3966   case AMDGPUISD::MUL_U24:
3967   case AMDGPUISD::MULHI_I24:
3968   case AMDGPUISD::MULHI_U24: {
3969     if (SDValue V = simplifyI24(N, DCI))
3970       return V;
3971     return SDValue();
3972   }
3973   case AMDGPUISD::MUL_LOHI_I24:
3974   case AMDGPUISD::MUL_LOHI_U24:
3975     return performMulLoHi24Combine(N, DCI);
3976   case ISD::SELECT:
3977     return performSelectCombine(N, DCI);
3978   case ISD::FNEG:
3979     return performFNegCombine(N, DCI);
3980   case ISD::FABS:
3981     return performFAbsCombine(N, DCI);
3982   case AMDGPUISD::BFE_I32:
3983   case AMDGPUISD::BFE_U32: {
3984     assert(!N->getValueType(0).isVector() &&
3985            "Vector handling of BFE not implemented");
3986     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
3987     if (!Width)
3988       break;
3989
3990     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
3991     if (WidthVal == 0)
3992       return DAG.getConstant(0, DL, MVT::i32);
3993
3994     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
3995     if (!Offset)
3996       break;
3997
3998     SDValue BitsFrom = N->getOperand(0);
3999     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4000
4001     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4002
4003     if (OffsetVal == 0) {
4004       // This is already sign / zero extended, so try to fold away extra BFEs.
4005       unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4006
4007       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4008       if (OpSignBits >= SignBits)
4009         return BitsFrom;
4010
4011       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4012       if (Signed) {
4013         // This is a sign_extend_inreg. Replace it to take advantage of existing
4014         // DAG Combines. If not eliminated, we will match back to BFE during
4015         // selection.
4016
4017         // TODO: The sext_inreg of extended types ends, although we can could
4018         // handle them in a single BFE.
4019         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4020                            DAG.getValueType(SmallVT));
4021       }
4022
4023       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4024     }
4025
4026     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4027       if (Signed) {
4028         return constantFoldBFE<int32_t>(DAG,
4029                                         CVal->getSExtValue(),
4030                                         OffsetVal,
4031                                         WidthVal,
4032                                         DL);
4033       }
4034
4035       return constantFoldBFE<uint32_t>(DAG,
4036                                        CVal->getZExtValue(),
4037                                        OffsetVal,
4038                                        WidthVal,
4039                                        DL);
4040     }
4041
4042     if ((OffsetVal + WidthVal) >= 32 &&
4043         !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4044       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4045       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4046                          BitsFrom, ShiftVal);
4047     }
4048
4049     if (BitsFrom.hasOneUse()) {
4050       APInt Demanded = APInt::getBitsSet(32,
4051                                          OffsetVal,
4052                                          OffsetVal + WidthVal);
4053
4054       KnownBits Known;
4055       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4056                                             !DCI.isBeforeLegalizeOps());
4057       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4058       if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4059           TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4060         DCI.CommitTargetLoweringOpt(TLO);
4061       }
4062     }
4063
4064     break;
4065   }
4066   case ISD::LOAD:
4067     return performLoadCombine(N, DCI);
4068   case ISD::STORE:
4069     return performStoreCombine(N, DCI);
4070   case AMDGPUISD::RCP:
4071   case AMDGPUISD::RCP_IFLAG:
4072     return performRcpCombine(N, DCI);
4073   case ISD::AssertZext:
4074   case ISD::AssertSext:
4075     return performAssertSZExtCombine(N, DCI);
4076   case ISD::INTRINSIC_WO_CHAIN:
4077     return performIntrinsicWOChainCombine(N, DCI);
4078   }
4079   return SDValue();
4080 }
4081
4082 //===----------------------------------------------------------------------===//
4083 // Helper functions
4084 //===----------------------------------------------------------------------===//
4085
4086 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4087                                                    const TargetRegisterClass *RC,
4088                                                    unsigned Reg, EVT VT,
4089                                                    const SDLoc &SL,
4090                                                    bool RawReg) const {
4091   MachineFunction &MF = DAG.getMachineFunction();
4092   MachineRegisterInfo &MRI = MF.getRegInfo();
4093   unsigned VReg;
4094
4095   if (!MRI.isLiveIn(Reg)) {
4096     VReg = MRI.createVirtualRegister(RC);
4097     MRI.addLiveIn(Reg, VReg);
4098   } else {
4099     VReg = MRI.getLiveInVirtReg(Reg);
4100   }
4101
4102   if (RawReg)
4103     return DAG.getRegister(VReg, VT);
4104
4105   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4106 }
4107
4108 // This may be called multiple times, and nothing prevents creating multiple
4109 // objects at the same offset. See if we already defined this object.
4110 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4111                                        int64_t Offset) {
4112   for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4113     if (MFI.getObjectOffset(I) == Offset) {
4114       assert(MFI.getObjectSize(I) == Size);
4115       return I;
4116     }
4117   }
4118
4119   return MFI.CreateFixedObject(Size, Offset, true);
4120 }
4121
4122 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4123                                                   EVT VT,
4124                                                   const SDLoc &SL,
4125                                                   int64_t Offset) const {
4126   MachineFunction &MF = DAG.getMachineFunction();
4127   MachineFrameInfo &MFI = MF.getFrameInfo();
4128   int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4129
4130   auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4131   SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4132
4133   return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
4134                      MachineMemOperand::MODereferenceable |
4135                      MachineMemOperand::MOInvariant);
4136 }
4137
4138 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4139                                                    const SDLoc &SL,
4140                                                    SDValue Chain,
4141                                                    SDValue ArgVal,
4142                                                    int64_t Offset) const {
4143   MachineFunction &MF = DAG.getMachineFunction();
4144   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4145
4146   SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4147   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
4148                                MachineMemOperand::MODereferenceable);
4149   return Store;
4150 }
4151
4152 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4153                                              const TargetRegisterClass *RC,
4154                                              EVT VT, const SDLoc &SL,
4155                                              const ArgDescriptor &Arg) const {
4156   assert(Arg && "Attempting to load missing argument");
4157
4158   SDValue V = Arg.isRegister() ?
4159     CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4160     loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4161
4162   if (!Arg.isMasked())
4163     return V;
4164
4165   unsigned Mask = Arg.getMask();
4166   unsigned Shift = countTrailingZeros<unsigned>(Mask);
4167   V = DAG.getNode(ISD::SRL, SL, VT, V,
4168                   DAG.getShiftAmountConstant(Shift, VT, SL));
4169   return DAG.getNode(ISD::AND, SL, VT, V,
4170                      DAG.getConstant(Mask >> Shift, SL, VT));
4171 }
4172
4173 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4174     const MachineFunction &MF, const ImplicitParameter Param) const {
4175   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4176   const AMDGPUSubtarget &ST =
4177       AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4178   unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4179   const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4180   uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4181                        ExplicitArgOffset;
4182   switch (Param) {
4183   case GRID_DIM:
4184     return ArgOffset;
4185   case GRID_OFFSET:
4186     return ArgOffset + 4;
4187   }
4188   llvm_unreachable("unexpected implicit parameter type");
4189 }
4190
// Expands to a switch case that returns the stringified name of the given
// AMDGPUISD enumerator.
#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;

// Returns the enumerator name of an AMDGPUISD opcode as a string literal.
// The range-marker enumerators (FIRST_NUMBER, FIRST_MEM_OPCODE_NUMBER,
// LAST_AMDGPU_ISD_NUMBER) and any opcode without a case below yield nullptr.
const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((AMDGPUISD::NodeType)Opcode) {
  case AMDGPUISD::FIRST_NUMBER: break;
  // AMDIL DAG nodes
  NODE_NAME_CASE(UMUL);
  NODE_NAME_CASE(BRANCH_COND);

  // AMDGPU DAG nodes
  NODE_NAME_CASE(IF)
  NODE_NAME_CASE(ELSE)
  NODE_NAME_CASE(LOOP)
  NODE_NAME_CASE(CALL)
  NODE_NAME_CASE(TC_RETURN)
  NODE_NAME_CASE(TRAP)
  NODE_NAME_CASE(RET_FLAG)
  NODE_NAME_CASE(RETURN_TO_EPILOG)
  NODE_NAME_CASE(ENDPGM)
  NODE_NAME_CASE(DWORDADDR)
  NODE_NAME_CASE(FRACT)
  NODE_NAME_CASE(SETCC)
  NODE_NAME_CASE(SETREG)
  NODE_NAME_CASE(DENORM_MODE)
  NODE_NAME_CASE(FMA_W_CHAIN)
  NODE_NAME_CASE(FMUL_W_CHAIN)
  NODE_NAME_CASE(CLAMP)
  NODE_NAME_CASE(COS_HW)
  NODE_NAME_CASE(SIN_HW)
  NODE_NAME_CASE(FMAX_LEGACY)
  NODE_NAME_CASE(FMIN_LEGACY)
  NODE_NAME_CASE(FMAX3)
  NODE_NAME_CASE(SMAX3)
  NODE_NAME_CASE(UMAX3)
  NODE_NAME_CASE(FMIN3)
  NODE_NAME_CASE(SMIN3)
  NODE_NAME_CASE(UMIN3)
  NODE_NAME_CASE(FMED3)
  NODE_NAME_CASE(SMED3)
  NODE_NAME_CASE(UMED3)
  NODE_NAME_CASE(FDOT2)
  NODE_NAME_CASE(URECIP)
  NODE_NAME_CASE(DIV_SCALE)
  NODE_NAME_CASE(DIV_FMAS)
  NODE_NAME_CASE(DIV_FIXUP)
  NODE_NAME_CASE(FMAD_FTZ)
  NODE_NAME_CASE(TRIG_PREOP)
  NODE_NAME_CASE(RCP)
  NODE_NAME_CASE(RSQ)
  NODE_NAME_CASE(RCP_LEGACY)
  NODE_NAME_CASE(RSQ_LEGACY)
  NODE_NAME_CASE(RCP_IFLAG)
  NODE_NAME_CASE(FMUL_LEGACY)
  NODE_NAME_CASE(RSQ_CLAMP)
  NODE_NAME_CASE(LDEXP)
  NODE_NAME_CASE(FP_CLASS)
  NODE_NAME_CASE(DOT4)
  NODE_NAME_CASE(CARRY)
  NODE_NAME_CASE(BORROW)
  NODE_NAME_CASE(BFE_U32)
  NODE_NAME_CASE(BFE_I32)
  NODE_NAME_CASE(BFI)
  NODE_NAME_CASE(BFM)
  NODE_NAME_CASE(FFBH_U32)
  NODE_NAME_CASE(FFBH_I32)
  NODE_NAME_CASE(FFBL_B32)
  NODE_NAME_CASE(MUL_U24)
  NODE_NAME_CASE(MUL_I24)
  NODE_NAME_CASE(MULHI_U24)
  NODE_NAME_CASE(MULHI_I24)
  NODE_NAME_CASE(MUL_LOHI_U24)
  NODE_NAME_CASE(MUL_LOHI_I24)
  NODE_NAME_CASE(MAD_U24)
  NODE_NAME_CASE(MAD_I24)
  NODE_NAME_CASE(MAD_I64_I32)
  NODE_NAME_CASE(MAD_U64_U32)
  NODE_NAME_CASE(PERM)
  NODE_NAME_CASE(TEXTURE_FETCH)
  NODE_NAME_CASE(EXPORT)
  NODE_NAME_CASE(EXPORT_DONE)
  NODE_NAME_CASE(R600_EXPORT)
  NODE_NAME_CASE(CONST_ADDRESS)
  NODE_NAME_CASE(REGISTER_LOAD)
  NODE_NAME_CASE(REGISTER_STORE)
  NODE_NAME_CASE(SAMPLE)
  NODE_NAME_CASE(SAMPLEB)
  NODE_NAME_CASE(SAMPLED)
  NODE_NAME_CASE(SAMPLEL)
  NODE_NAME_CASE(CVT_F32_UBYTE0)
  NODE_NAME_CASE(CVT_F32_UBYTE1)
  NODE_NAME_CASE(CVT_F32_UBYTE2)
  NODE_NAME_CASE(CVT_F32_UBYTE3)
  NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
  NODE_NAME_CASE(CVT_PKNORM_I16_F32)
  NODE_NAME_CASE(CVT_PKNORM_U16_F32)
  NODE_NAME_CASE(CVT_PK_I16_I32)
  NODE_NAME_CASE(CVT_PK_U16_U32)
  NODE_NAME_CASE(FP_TO_FP16)
  NODE_NAME_CASE(FP16_ZEXT)
  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
  NODE_NAME_CASE(CONST_DATA_PTR)
  NODE_NAME_CASE(PC_ADD_REL_OFFSET)
  NODE_NAME_CASE(LDS)
  NODE_NAME_CASE(KILL)
  NODE_NAME_CASE(DUMMY_CHAIN)
  // Marker enumerator with no name of its own; the cases after it follow it
  // in this list.
  case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
  NODE_NAME_CASE(INTERP_P1LL_F16)
  NODE_NAME_CASE(INTERP_P1LV_F16)
  NODE_NAME_CASE(INTERP_P2_F16)
  NODE_NAME_CASE(LOAD_D16_HI)
  NODE_NAME_CASE(LOAD_D16_LO)
  NODE_NAME_CASE(LOAD_D16_HI_I8)
  NODE_NAME_CASE(LOAD_D16_HI_U8)
  NODE_NAME_CASE(LOAD_D16_LO_I8)
  NODE_NAME_CASE(LOAD_D16_LO_U8)
  NODE_NAME_CASE(STORE_MSKOR)
  NODE_NAME_CASE(LOAD_CONSTANT)
  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
  NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
  NODE_NAME_CASE(DS_ORDERED_COUNT)
  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
  NODE_NAME_CASE(ATOMIC_INC)
  NODE_NAME_CASE(ATOMIC_DEC)
  NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
  NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
  NODE_NAME_CASE(BUFFER_LOAD)
  NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
  NODE_NAME_CASE(BUFFER_LOAD_USHORT)
  NODE_NAME_CASE(BUFFER_LOAD_BYTE)
  NODE_NAME_CASE(BUFFER_LOAD_SHORT)
  NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
  NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
  NODE_NAME_CASE(SBUFFER_LOAD)
  NODE_NAME_CASE(BUFFER_STORE)
  NODE_NAME_CASE(BUFFER_STORE_BYTE)
  NODE_NAME_CASE(BUFFER_STORE_SHORT)
  NODE_NAME_CASE(BUFFER_STORE_FORMAT)
  NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
  NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
  NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
  NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
  NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
  NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
  NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
  NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
  NODE_NAME_CASE(BUFFER_ATOMIC_AND)
  NODE_NAME_CASE(BUFFER_ATOMIC_OR)
  NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
  NODE_NAME_CASE(BUFFER_ATOMIC_INC)
  NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
  NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
  NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
  NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
  NODE_NAME_CASE(ATOMIC_FADD)
  NODE_NAME_CASE(ATOMIC_PK_FADD)

  case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
  }
  // Reached for the marker enumerators above and for any opcode that has no
  // case in the switch.
  return nullptr;
}
4353
4354 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4355                                               SelectionDAG &DAG, int Enabled,
4356                                               int &RefinementSteps,
4357                                               bool &UseOneConstNR,
4358                                               bool Reciprocal) const {
4359   EVT VT = Operand.getValueType();
4360
4361   if (VT == MVT::f32) {
4362     RefinementSteps = 0;
4363     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4364   }
4365
4366   // TODO: There is also f64 rsq instruction, but the documentation is less
4367   // clear on its precision.
4368
4369   return SDValue();
4370 }
4371
4372 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4373                                                SelectionDAG &DAG, int Enabled,
4374                                                int &RefinementSteps) const {
4375   EVT VT = Operand.getValueType();
4376
4377   if (VT == MVT::f32) {
4378     // Reciprocal, < 1 ulp error.
4379     //
4380     // This reciprocal approximation converges to < 0.5 ulp error with one
4381     // newton rhapson performed with two fused multiple adds (FMAs).
4382
4383     RefinementSteps = 0;
4384     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4385   }
4386
4387   // TODO: There is also f64 rcp instruction, but the documentation is less
4388   // clear on its precision.
4389
4390   return SDValue();
4391 }
4392
// Computes the bits of Op that are known to be zero or one, for the
// target-specific nodes handled below. Known is cleared first, so any node
// that is not handled reports nothing as known. DemandedElts is not consulted.
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known,
    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {

  Known.resetAll(); // Don't know anything.

  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  default:
    break;
  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW: {
    // Everything above bit 0 is zero.
    Known.Zero = APInt::getHighBitsSet(32, 31);
    break;
  }

  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // Nothing is known without a constant width operand.
    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!CWidth)
      return;

    uint32_t Width = CWidth->getZExtValue() & 0x1f;

    // An unsigned extract leaves the bits above the field zero. Nothing is
    // recorded for the signed form.
    if (Opc == AMDGPUISD::BFE_U32)
      Known.Zero = APInt::getHighBitsSet(32, 32 - Width);

    break;
  }
  case AMDGPUISD::FP_TO_FP16:
  case AMDGPUISD::FP16_ZEXT: {
    unsigned BitWidth = Known.getBitWidth();

    // High bits are zero.
    Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
    break;
  }
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    // The product has at least as many trailing zeros as both operands
    // combined, capped at the 32-bit result width.
    unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
                      RHSKnown.countMinTrailingZeros();
    Known.Zero.setLowBits(std::min(TrailZ, 32u));

    // Truncate to 24 bits.
    LHSKnown = LHSKnown.trunc(24);
    RHSKnown = RHSKnown.trunc(24);

    bool Negative = false;
    if (Opc == AMDGPUISD::MUL_I24) {
      // Signed: bound the number of significant bits of the product by the
      // sum of the operands' significant bits. High bits are only recorded
      // when that bound is below 32 and both operand signs are known.
      unsigned LHSValBits = 24 - LHSKnown.countMinSignBits();
      unsigned RHSValBits = 24 - RHSKnown.countMinSignBits();
      unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
      if (MaxValBits >= 32)
        break;
      bool LHSNegative = LHSKnown.isNegative();
      bool LHSPositive = LHSKnown.isNonNegative();
      bool RHSNegative = RHSKnown.isNegative();
      bool RHSPositive = RHSKnown.isNonNegative();
      if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive))
        break;
      // Operands of opposite sign give a negative product (high bits one);
      // otherwise the high bits are zero.
      Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative);
      if (Negative)
        Known.One.setHighBits(32 - MaxValBits);
      else
        Known.Zero.setHighBits(32 - MaxValBits);
    } else {
      // Unsigned: same bound using leading zeros; the high bits are zero.
      unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros();
      unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros();
      unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
      if (MaxValBits >= 32)
        break;
      Known.Zero.setHighBits(32 - MaxValBits);
    }
    break;
  }
  case AMDGPUISD::PERM: {
    // Nothing is known without a constant selector operand.
    ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!CMask)
      return;

    KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    unsigned Sel = CMask->getZExtValue();

    // Each byte of the selector controls one byte of the result, starting
    // with the lowest byte.
    for (unsigned I = 0; I < 32; I += 8) {
      unsigned SelBits = Sel & 0xff;
      if (SelBits < 4) {
        // Selectors 0-3: copy the known bits of that byte of operand 1.
        SelBits *= 8;
        Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits < 7) {
        // Selectors 4-6: copy the known bits of byte (selector & 3) of
        // operand 0.
        // NOTE(review): selector 7 matches none of these branches, so that
        // byte is left unknown - confirm whether "< 8" was intended.
        SelBits = (SelBits & 3) * 8;
        Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits == 0x0c) {
        // Selector 0x0c: the byte is all zeros.
        Known.Zero |= 0xFFull << I;
      } else if (SelBits > 0x0c) {
        // Selectors above 0x0c: the byte is all ones.
        Known.One |= 0xFFull << I;
      }
      Sel >>= 8;
    }
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_UBYTE:  {
    // Only the low 8 bits can be set.
    Known.Zero.setHighBits(24);
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_USHORT: {
    // Only the low 16 bits can be set.
    Known.Zero.setHighBits(16);
    break;
  }
  case AMDGPUISD::LDS: {
    auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
    unsigned Align = GA->getGlobal()->getAlignment();

    // The high 16 bits are treated as zero, and a non-zero alignment of the
    // global gives known-zero low bits.
    Known.Zero.setHighBits(16);
    if (Align)
      Known.Zero.setLowBits(Log2_32(Align));
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    // Operand 0 holds the intrinsic ID.
    unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (IID) {
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      const GCNSubtarget &ST =
          DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
      // These return at most the wavefront size - 1.
      unsigned Size = Op.getValueType().getSizeInBits();
      Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
      break;
    }
    default:
      break;
    }
  }
  }
}
4534
4535 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4536     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4537     unsigned Depth) const {
4538   switch (Op.getOpcode()) {
4539   case AMDGPUISD::BFE_I32: {
4540     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4541     if (!Width)
4542       return 1;
4543
4544     unsigned SignBits = 32 - Width->getZExtValue() + 1;
4545     if (!isNullConstant(Op.getOperand(1)))
4546       return SignBits;
4547
4548     // TODO: Could probably figure something out with non-0 offsets.
4549     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4550     return std::max(SignBits, Op0SignBits);
4551   }
4552
4553   case AMDGPUISD::BFE_U32: {
4554     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4555     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4556   }
4557
4558   case AMDGPUISD::CARRY:
4559   case AMDGPUISD::BORROW:
4560     return 31;
4561   case AMDGPUISD::BUFFER_LOAD_BYTE:
4562     return 25;
4563   case AMDGPUISD::BUFFER_LOAD_SHORT:
4564     return 17;
4565   case AMDGPUISD::BUFFER_LOAD_UBYTE:
4566     return 24;
4567   case AMDGPUISD::BUFFER_LOAD_USHORT:
4568     return 16;
4569   case AMDGPUISD::FP_TO_FP16:
4570   case AMDGPUISD::FP16_ZEXT:
4571     return 16;
4572   default:
4573     return 1;
4574   }
4575 }
4576
4577 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4578                                                         const SelectionDAG &DAG,
4579                                                         bool SNaN,
4580                                                         unsigned Depth) const {
4581   unsigned Opcode = Op.getOpcode();
4582   switch (Opcode) {
4583   case AMDGPUISD::FMIN_LEGACY:
4584   case AMDGPUISD::FMAX_LEGACY: {
4585     if (SNaN)
4586       return true;
4587
4588     // TODO: Can check no nans on one of the operands for each one, but which
4589     // one?
4590     return false;
4591   }
4592   case AMDGPUISD::FMUL_LEGACY:
4593   case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4594     if (SNaN)
4595       return true;
4596     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4597            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4598   }
4599   case AMDGPUISD::FMED3:
4600   case AMDGPUISD::FMIN3:
4601   case AMDGPUISD::FMAX3:
4602   case AMDGPUISD::FMAD_FTZ: {
4603     if (SNaN)
4604       return true;
4605     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4606            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4607            DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4608   }
4609   case AMDGPUISD::CVT_F32_UBYTE0:
4610   case AMDGPUISD::CVT_F32_UBYTE1:
4611   case AMDGPUISD::CVT_F32_UBYTE2:
4612   case AMDGPUISD::CVT_F32_UBYTE3:
4613     return true;
4614
4615   case AMDGPUISD::RCP:
4616   case AMDGPUISD::RSQ:
4617   case AMDGPUISD::RCP_LEGACY:
4618   case AMDGPUISD::RSQ_LEGACY:
4619   case AMDGPUISD::RSQ_CLAMP: {
4620     if (SNaN)
4621       return true;
4622
4623     // TODO: Need is known positive check.
4624     return false;
4625   }
4626   case AMDGPUISD::LDEXP:
4627   case AMDGPUISD::FRACT: {
4628     if (SNaN)
4629       return true;
4630     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4631   }
4632   case AMDGPUISD::DIV_SCALE:
4633   case AMDGPUISD::DIV_FMAS:
4634   case AMDGPUISD::DIV_FIXUP:
4635   case AMDGPUISD::TRIG_PREOP:
4636     // TODO: Refine on operands.
4637     return SNaN;
4638   case AMDGPUISD::SIN_HW:
4639   case AMDGPUISD::COS_HW: {
4640     // TODO: Need check for infinity
4641     return SNaN;
4642   }
4643   case ISD::INTRINSIC_WO_CHAIN: {
4644     unsigned IntrinsicID
4645       = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4646     // TODO: Handle more intrinsics
4647     switch (IntrinsicID) {
4648     case Intrinsic::amdgcn_cubeid:
4649       return true;
4650
4651     case Intrinsic::amdgcn_frexp_mant: {
4652       if (SNaN)
4653         return true;
4654       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4655     }
4656     case Intrinsic::amdgcn_cvt_pkrtz: {
4657       if (SNaN)
4658         return true;
4659       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4660              DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4661     }
4662     case Intrinsic::amdgcn_fdot2:
4663       // TODO: Refine on operand
4664       return SNaN;
4665     default:
4666       return false;
4667     }
4668   }
4669   default:
4670     return false;
4671   }
4672 }
4673
4674 TargetLowering::AtomicExpansionKind
4675 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
4676   switch (RMW->getOperation()) {
4677   case AtomicRMWInst::Nand:
4678   case AtomicRMWInst::FAdd:
4679   case AtomicRMWInst::FSub:
4680     return AtomicExpansionKind::CmpXChg;
4681   default:
4682     return AtomicExpansionKind::None;
4683   }
4684 }