//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "R600Subtarget.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/Transforms/InstCombine/InstCombiner.h"
26 #define DEBUG_TYPE "AMDGPUtti"

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
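//
// For example, fmed3(1.0, 3.0, 2.0): Max3 is 3.0 and equals Src1, so the
// result is maxnum(1.0, 2.0) = 2.0, the median of the three inputs.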
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the index down to a half, we do not lose
    // precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

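// Rewrite image intrinsics so that address operands that are provably
// representable in 16 bits use the 16-bit (A16/G16) overloads.
// Illustrative example: an image sample whose float coordinates are all
// fpext from half values is re-declared with half-typed coordinate arguments
// and is fed the original half values directly.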
static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // Only supports G16
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

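    // For example, llvm.amdgcn.class.f32(float %x, i32 3), i.e. a mask of
    // S_NAN | Q_NAN, is just an "is NaN" test and becomes the fcmp below.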
    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // value since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
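    // For example, llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 16) extracts bits
    // [8, 23] of %x and becomes (lshr (shl %x, 8), 16) via the code below.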
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
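    // For example, fmed3(nan, %x, %y) folds to minnum(%x, %y) below.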
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
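        // For example, llvm.amdgcn.icmp(i32 7, i32 7, eq) with a 64-bit result
        // type becomes: call i64 @llvm.read_register.i64(metadata !"exec").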
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
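        // For example, i32 @llvm.amdgcn.ballot.i32(i1 true) becomes
        //   call i32 @llvm.read_register.i32(metadata !"exec_lo").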
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec may not be the same between
    // the def and use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
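    // For example, llvm.amdgcn.fmul.legacy(float %x, float 0.0) folds to +0.0
    // below even if %x is NaN or infinity.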
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
///       struct returns.
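///
/// For example (illustrative), a <4 x float> buffer load of which only the
/// first two elements are used is shrunk to a <2 x float> load, and the result
/// is shuffled back to the original <4 x float> type.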
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

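    // For example (illustrative), if only elements 2 and 3 of a <4 x float>
    // load are demanded, the two leading components are dropped and the byte
    // offset below is increased by 2 * sizeof(float) = 8.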
    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

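    // For example (illustrative), a load with dmask = 0xf of which only the
    // first result component is demanded ends up with NewDMaskVal = 0x1.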
    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}