llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

   1 //===- AMDGPInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // \file
  10 // This file implements a TargetTransformInfo analysis pass specific to the
  11 // AMDGPU target machine. It uses the target's detailed information to provide
  12 // more precise answers to certain TTI queries, while letting the target
  13 // independent and default TTI implementations handle the rest.
  14 //
  15 //===----------------------------------------------------------------------===//
  16
  17 #include "AMDGPUInstrInfo.h"
  18 #include "AMDGPUTargetTransformInfo.h"
  19 #include "GCNSubtarget.h"
  20 #include "llvm/ADT/FloatingPointMode.h"
  21 #include "llvm/IR/IntrinsicsAMDGPU.h"
  22 #include "llvm/Transforms/InstCombine/InstCombiner.h"
  23 #include <optional>
  24
  25 using namespace llvm;
  26 using namespace llvm::PatternMatch;
  27
  28 #define DEBUG_TYPE "AMDGPUtti"
  29
  30 namespace {
  31
  32 struct AMDGPUImageDMaskIntrinsic {
  33   unsigned Intr;
  34 };
  35
  36 #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
  37 #include "InstCombineTables.inc"
  38
  39 } // end anonymous namespace
  40
  41 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
  42 //
  43 // A single NaN input is folded to minnum, so we rely on that folding for
  44 // handling NaNs.
  45 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
  46                            const APFloat &Src2) {
  47   APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
  48
  49   APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  50   assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  51   if (Cmp0 == APFloat::cmpEqual)
  52     return maxnum(Src1, Src2);
  53
  54   APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  55   assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  56   if (Cmp1 == APFloat::cmpEqual)
  57     return maxnum(Src0, Src2);
  58
  59   return maxnum(Src0, Src1);
  60 }
  61
  62 // Check if a value can be converted to a 16-bit value without losing
  63 // precision.
  64 // The value is expected to be either a float (IsFloat = true) or an unsigned
  65 // integer (IsFloat = false).
  66 static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
  67   Type *VTy = V.getType();
  68   if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
  69     // The value is already 16-bit, so we don't want to convert to 16-bit again!
  70     return false;
  71   }
  72   if (IsFloat) {
  73     if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
  74       // We need to check that if we cast the index down to a half, we do not
  75       // lose precision.
  76       APFloat FloatValue(ConstFloat->getValueAPF());
  77       bool LosesInfo = true;
  78       FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
  79                          &LosesInfo);
  80       return !LosesInfo;
  81     }
  82   } else {
  83     if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
  84       // We need to check that if we cast the index down to an i16, we do not
  85       // lose precision.
  86       APInt IntValue(ConstInt->getValue());
  87       return IntValue.getActiveBits() <= 16;
  88     }
  89   }
  90
  91   Value *CastSrc;
  92   bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
  93                        : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
  94   if (IsExt) {
  95     Type *CastSrcTy = CastSrc->getType();
  96     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
  97       return true;
  98   }
  99
 100   return false;
 101 }
 102
 103 // Convert a value to 16-bit.
 104 static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
 105   Type *VTy = V.getType();
 106   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
 107     return cast<Instruction>(&V)->getOperand(0);
 108   if (VTy->isIntegerTy())
 109     return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
 110   if (VTy->isFloatingPointTy())
 111     return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
 112
 113   llvm_unreachable("Should never be called!");
 114 }
 115
 116 /// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
 117 /// modified arguments (based on OldIntr) and replaces InstToReplace with
 118 /// this newly created intrinsic call.
 119 static std::optional<Instruction *> modifyIntrinsicCall(
 120     IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
 121     InstCombiner &IC,
 122     std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
 123         Func) {
 124   SmallVector<Type *, 4> ArgTys;
 125   if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
 126     return std::nullopt;
 127
 128   SmallVector<Value *, 8> Args(OldIntr.args());
 129
 130   // Modify arguments and types
 131   Func(Args, ArgTys);
 132
 133   Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);
 134
 135   CallInst *NewCall = IC.Builder.CreateCall(I, Args);
 136   NewCall->takeName(&OldIntr);
 137   NewCall->copyMetadata(OldIntr);
 138   if (isa<FPMathOperator>(NewCall))
 139     NewCall->copyFastMathFlags(&OldIntr);
 140
 141   // Erase and replace uses
 142   if (!InstToReplace.getType()->isVoidTy())
 143     IC.replaceInstUsesWith(InstToReplace, NewCall);
 144
 145   bool RemoveOldIntr = &OldIntr != &InstToReplace;
 146
 147   auto RetValue = IC.eraseInstFromFunction(InstToReplace);
 148   if (RemoveOldIntr)
 149     IC.eraseInstFromFunction(OldIntr);
 150
 151   return RetValue;
 152 }
 153
 154 static std::optional<Instruction *>
 155 simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
 156                              const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
 157                              IntrinsicInst &II, InstCombiner &IC) {
 158   // Optimize _L to _LZ when _L is zero
 159   if (const auto *LZMappingInfo =
 160           AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
 161     if (auto *ConstantLod =
 162             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
 163       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
 164         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
 165             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
 166                                                      ImageDimIntr->Dim);
 167         return modifyIntrinsicCall(
 168             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
 169               Args.erase(Args.begin() + ImageDimIntr->LodIndex);
 170             });
 171       }
 172     }
 173   }
 174
 175   // Optimize _mip away, when 'lod' is zero
 176   if (const auto *MIPMappingInfo =
 177           AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
 178     if (auto *ConstantMip =
 179             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
 180       if (ConstantMip->isZero()) {
 181         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
 182             AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
 183                                                      ImageDimIntr->Dim);
 184         return modifyIntrinsicCall(
 185             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
 186               Args.erase(Args.begin() + ImageDimIntr->MipIndex);
 187             });
 188       }
 189     }
 190   }
 191
 192   // Optimize _bias away when 'bias' is zero
 193   if (const auto *BiasMappingInfo =
 194           AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
 195     if (auto *ConstantBias =
 196             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
 197       if (ConstantBias->isZero()) {
 198         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
 199             AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
 200                                                      ImageDimIntr->Dim);
 201         return modifyIntrinsicCall(
 202             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
 203               Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
 204               ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
 205             });
 206       }
 207     }
 208   }
 209
 210   // Optimize _offset away when 'offset' is zero
 211   if (const auto *OffsetMappingInfo =
 212           AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
 213     if (auto *ConstantOffset =
 214             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
 215       if (ConstantOffset->isZero()) {
 216         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
 217             AMDGPU::getImageDimIntrinsicByBaseOpcode(
 218                 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
 219         return modifyIntrinsicCall(
 220             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
 221               Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
 222             });
 223       }
 224     }
 225   }
 226
 227   // Try to use D16
 228   if (ST->hasD16Images()) {
 229
 230     const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
 231         AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
 232
 233     if (BaseOpcode->HasD16) {
 234
 235       // If the only use of image intrinsic is a fptrunc (with conversion to
 236       // half) then both fptrunc and image intrinsic will be replaced with image
 237       // intrinsic with D16 flag.
 238       if (II.hasOneUse()) {
 239         Instruction *User = II.user_back();
 240
 241         if (User->getOpcode() == Instruction::FPTrunc &&
 242             User->getType()->getScalarType()->isHalfTy()) {
 243
 244           return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
 245                                      [&](auto &Args, auto &ArgTys) {
 246                                        // Change return type of image intrinsic.
 247                                        // Set it to return type of fptrunc.
 248                                        ArgTys[0] = User->getType();
 249                                      });
 250         }
 251       }
 252     }
 253   }
 254
 255   // Try to use A16 or G16
 256   if (!ST->hasA16() && !ST->hasG16())
 257     return std::nullopt;
 258
 259   // Address is interpreted as float if the instruction has a sampler or as
 260   // unsigned int if there is no sampler.
 261   bool HasSampler =
 262       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
 263   bool FloatCoord = false;
 264   // true means derivatives can be converted to 16 bit, coordinates not
 265   bool OnlyDerivatives = false;
 266
 267   for (unsigned OperandIndex = ImageDimIntr->GradientStart;
 268        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
 269     Value *Coord = II.getOperand(OperandIndex);
 270     // If the values are not derived from 16-bit values, we cannot optimize.
 271     if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
 272       if (OperandIndex < ImageDimIntr->CoordStart ||
 273           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
 274         return std::nullopt;
 275       }
 276       // All gradients can be converted, so convert only them
 277       OnlyDerivatives = true;
 278       break;
 279     }
 280
 281     assert(OperandIndex == ImageDimIntr->GradientStart ||
 282            FloatCoord == Coord->getType()->isFloatingPointTy());
 283     FloatCoord = Coord->getType()->isFloatingPointTy();
 284   }
 285
 286   if (!OnlyDerivatives && !ST->hasA16())
 287     OnlyDerivatives = true; // Only supports G16
 288
 289   // Check if there is a bias parameter and if it can be converted to f16
 290   if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
 291     Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
 292     assert(HasSampler &&
 293            "Only image instructions with a sampler can have a bias");
 294     if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
 295       OnlyDerivatives = true;
 296   }
 297
 298   if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
 299                                                ImageDimIntr->CoordStart))
 300     return std::nullopt;
 301
 302   Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
 303                                : Type::getInt16Ty(II.getContext());
 304
 305   return modifyIntrinsicCall(
 306       II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
 307         ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
 308         if (!OnlyDerivatives) {
 309           ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
 310
 311           // Change the bias type
 312           if (ImageDimIntr->NumBiasArgs != 0)
 313             ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
 314         }
 315
 316         unsigned EndIndex =
 317             OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
 318         for (unsigned OperandIndex = ImageDimIntr->GradientStart;
 319              OperandIndex < EndIndex; OperandIndex++) {
 320           Args[OperandIndex] =
 321               convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
 322         }
 323
 324         // Convert the bias
 325         if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
 326           Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
 327           Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
 328         }
 329       });
 330 }
 331
 332 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
 333                                            const Value *Op0, const Value *Op1,
 334                                            InstCombiner &IC) const {
 335   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
 336   // infinity, gives +0.0. If we can prove we don't have one of the special
 337   // cases then we can use a normal multiply instead.
 338   // TODO: Create and use isKnownFiniteNonZero instead of just matching
 339   // constants here.
 340   if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
 341       match(Op1, PatternMatch::m_FiniteNonZero())) {
 342     // One operand is not zero or infinity or NaN.
 343     return true;
 344   }
 345
 346   SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(&I);
 347   if (isKnownNeverInfOrNaN(Op0, /*Depth=*/0, SQ) &&
 348       isKnownNeverInfOrNaN(Op1, /*Depth=*/0, SQ)) {
 349     // Neither operand is infinity or NaN.
 350     return true;
 351   }
 352   return false;
 353 }
 354
 355 /// Match an fpext from half to float, or a constant we can convert.
 356 static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
 357   if (match(Arg, m_OneUse(m_FPExt(m_Value(FPExtSrc)))))
 358     return FPExtSrc->getType()->isHalfTy();
 359
 360   ConstantFP *CFP;
 361   if (match(Arg, m_ConstantFP(CFP))) {
 362     bool LosesInfo;
 363     APFloat Val(CFP->getValueAPF());
 364     Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
 365     if (LosesInfo)
 366       return false;
 367
 368     FPExtSrc = ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
 369     return true;
 370   }
 371
 372   return false;
 373 }
 374
 375 // Trim all zero components from the end of the vector \p UseV and return
 376 // an appropriate bitset with known elements.
 377 static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
 378                                        Instruction *I) {
 379   auto *VTy = cast<FixedVectorType>(UseV->getType());
 380   unsigned VWidth = VTy->getNumElements();
 381   APInt DemandedElts = APInt::getAllOnes(VWidth);
 382
 383   for (int i = VWidth - 1; i > 0; --i) {
 384     auto *Elt = findScalarElement(UseV, i);
 385     if (!Elt)
 386       break;
 387
 388     if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
 389       if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
 390         break;
 391     } else {
 392       break;
 393     }
 394
 395     DemandedElts.clearBit(i);
 396   }
 397
 398   return DemandedElts;
 399 }
 400
 401 // Trim elements of the end of the vector \p V, if they are
 402 // equal to the first element of the vector.
 403 static APInt defaultComponentBroadcast(Value *V) {
 404   auto *VTy = cast<FixedVectorType>(V->getType());
 405   unsigned VWidth = VTy->getNumElements();
 406   APInt DemandedElts = APInt::getAllOnes(VWidth);
 407   Value *FirstComponent = findScalarElement(V, 0);
 408
 409   SmallVector<int> ShuffleMask;
 410   if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
 411     SVI->getShuffleMask(ShuffleMask);
 412
 413   for (int I = VWidth - 1; I > 0; --I) {
 414     if (ShuffleMask.empty()) {
 415       auto *Elt = findScalarElement(V, I);
 416       if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
 417         break;
 418     } else {
 419       // Detect identical elements in the shufflevector result, even though
 420       // findScalarElement cannot tell us what that element is.
 421       if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
 422         break;
 423     }
 424     DemandedElts.clearBit(I);
 425   }
 426
 427   return DemandedElts;
 428 }
 429
 430 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
 431                                                     IntrinsicInst &II,
 432                                                     APInt DemandedElts,
 433                                                     int DMaskIdx = -1,
 434                                                     bool IsLoad = true);
 435
 436 /// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
 437 static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
 438   return (SqrtOp->getType()->isFloatTy() &&
 439           (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
 440          SqrtOp->getType()->isHalfTy();
 441 }
 442
 443 std::optional<Instruction *>
 444 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 445   Intrinsic::ID IID = II.getIntrinsicID();
 446   switch (IID) {
 447   case Intrinsic::amdgcn_rcp: {
 448     Value *Src = II.getArgOperand(0);
 449
 450     // TODO: Move to ConstantFolding/InstSimplify?
 451     if (isa<UndefValue>(Src)) {
 452       Type *Ty = II.getType();
 453       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
 454       return IC.replaceInstUsesWith(II, QNaN);
 455     }
 456
 457     if (II.isStrictFP())
 458       break;
 459
 460     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
 461       const APFloat &ArgVal = C->getValueAPF();
 462       APFloat Val(ArgVal.getSemantics(), 1);
 463       Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
 464
 465       // This is more precise than the instruction may give.
 466       //
 467       // TODO: The instruction always flushes denormal results (except for f16),
 468       // should this also?
 469       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
 470     }
 471
 472     FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
 473     if (!FMF.allowContract())
 474       break;
 475     auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
 476     if (!SrcCI)
 477       break;
 478
 479     auto IID = SrcCI->getIntrinsicID();
 480     // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
 481     //
 482     // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
 483     // relaxed.
 484     if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
 485       const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
 486       FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
 487       if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
 488         break;
 489
 490       if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
 491         break;
 492
 493       Function *NewDecl = Intrinsic::getDeclaration(
 494           SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
 495
 496       InnerFMF |= FMF;
 497       II.setFastMathFlags(InnerFMF);
 498
 499       II.setCalledFunction(NewDecl);
 500       return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
 501     }
 502
 503     break;
 504   }
 505   case Intrinsic::amdgcn_sqrt:
 506   case Intrinsic::amdgcn_rsq: {
 507     Value *Src = II.getArgOperand(0);
 508
 509     // TODO: Move to ConstantFolding/InstSimplify?
 510     if (isa<UndefValue>(Src)) {
 511       Type *Ty = II.getType();
 512       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
 513       return IC.replaceInstUsesWith(II, QNaN);
 514     }
 515
 516     // f16 amdgcn.sqrt is identical to regular sqrt.
 517     if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
 518       Function *NewDecl = Intrinsic::getDeclaration(
 519           II.getModule(), Intrinsic::sqrt, {II.getType()});
 520       II.setCalledFunction(NewDecl);
 521       return &II;
 522     }
 523
 524     break;
 525   }
 526   case Intrinsic::amdgcn_log:
 527   case Intrinsic::amdgcn_exp2: {
 528     const bool IsLog = IID == Intrinsic::amdgcn_log;
 529     const bool IsExp = IID == Intrinsic::amdgcn_exp2;
 530     Value *Src = II.getArgOperand(0);
 531     Type *Ty = II.getType();
 532
 533     if (isa<PoisonValue>(Src))
 534       return IC.replaceInstUsesWith(II, Src);
 535
 536     if (IC.getSimplifyQuery().isUndefValue(Src))
 537       return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
 538
 539     if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
 540       if (C->isInfinity()) {
 541         // exp2(+inf) -> +inf
 542         // log2(+inf) -> +inf
 543         if (!C->isNegative())
 544           return IC.replaceInstUsesWith(II, C);
 545
 546         // exp2(-inf) -> 0
 547         if (IsExp && C->isNegative())
 548           return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
 549       }
 550
 551       if (II.isStrictFP())
 552         break;
 553
 554       if (C->isNaN()) {
 555         Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
 556         return IC.replaceInstUsesWith(II, Quieted);
 557       }
 558
 559       // f32 instruction doesn't handle denormals, f16 does.
 560       if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
 561         Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
 562                                       : ConstantFP::get(Ty, 1.0);
 563         return IC.replaceInstUsesWith(II, FoldedValue);
 564       }
 565
 566       if (IsLog && C->isNegative())
 567         return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
 568
 569       // TODO: Full constant folding matching hardware behavior.
 570     }
 571
 572     break;
 573   }
 574   case Intrinsic::amdgcn_frexp_mant:
 575   case Intrinsic::amdgcn_frexp_exp: {
 576     Value *Src = II.getArgOperand(0);
 577     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
 578       int Exp;
 579       APFloat Significand =
 580           frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
 581
 582       if (IID == Intrinsic::amdgcn_frexp_mant) {
 583         return IC.replaceInstUsesWith(
 584             II, ConstantFP::get(II.getContext(), Significand));
 585       }
 586
 587       // Match instruction special case behavior.
 588       if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
 589         Exp = 0;
 590
 591       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
 592     }
 593
 594     if (isa<UndefValue>(Src)) {
 595       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
 596     }
 597
 598     break;
 599   }
 600   case Intrinsic::amdgcn_class: {
 601     Value *Src0 = II.getArgOperand(0);
 602     Value *Src1 = II.getArgOperand(1);
 603     const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
 604     if (CMask) {
 605       II.setCalledOperand(Intrinsic::getDeclaration(
 606           II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
 607
 608       // Clamp any excess bits, as they're illegal for the generic intrinsic.
 609       II.setArgOperand(1, ConstantInt::get(Src1->getType(),
 610                                            CMask->getZExtValue() & fcAllFlags));
 611       return &II;
 612     }
 613
 614     // Propagate poison.
 615     if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
 616       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
 617
 618     // llvm.amdgcn.class(_, undef) -> false
 619     if (IC.getSimplifyQuery().isUndefValue(Src1))
 620       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
 621
 622     // llvm.amdgcn.class(undef, mask) -> mask != 0
 623     if (IC.getSimplifyQuery().isUndefValue(Src0)) {
 624       Value *CmpMask = IC.Builder.CreateICmpNE(
 625           Src1, ConstantInt::getNullValue(Src1->getType()));
 626       return IC.replaceInstUsesWith(II, CmpMask);
 627     }
 628     break;
 629   }
 630   case Intrinsic::amdgcn_cvt_pkrtz: {
 631     Value *Src0 = II.getArgOperand(0);
 632     Value *Src1 = II.getArgOperand(1);
 633     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
 634       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
 635         const fltSemantics &HalfSem =
 636             II.getType()->getScalarType()->getFltSemantics();
 637         bool LosesInfo;
 638         APFloat Val0 = C0->getValueAPF();
 639         APFloat Val1 = C1->getValueAPF();
 640         Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
 641         Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
 642
 643         Constant *Folded =
 644             ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
 645                                  ConstantFP::get(II.getContext(), Val1)});
 646         return IC.replaceInstUsesWith(II, Folded);
 647       }
 648     }
 649
 650     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
 651       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
 652     }
 653
 654     break;
 655   }
 656   case Intrinsic::amdgcn_cvt_pknorm_i16:
 657   case Intrinsic::amdgcn_cvt_pknorm_u16:
 658   case Intrinsic::amdgcn_cvt_pk_i16:
 659   case Intrinsic::amdgcn_cvt_pk_u16: {
 660     Value *Src0 = II.getArgOperand(0);
 661     Value *Src1 = II.getArgOperand(1);
 662
 663     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
 664       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
 665     }
 666
 667     break;
 668   }
 669   case Intrinsic::amdgcn_ubfe:
 670   case Intrinsic::amdgcn_sbfe: {
 671     // Decompose simple cases into standard shifts.
 672     Value *Src = II.getArgOperand(0);
 673     if (isa<UndefValue>(Src)) {
 674       return IC.replaceInstUsesWith(II, Src);
 675     }
 676
 677     unsigned Width;
 678     Type *Ty = II.getType();
 679     unsigned IntSize = Ty->getIntegerBitWidth();
 680
 681     ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
 682     if (CWidth) {
 683       Width = CWidth->getZExtValue();
 684       if ((Width & (IntSize - 1)) == 0) {
 685         return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
 686       }
 687
 688       // Hardware ignores high bits, so remove those.
 689       if (Width >= IntSize) {
 690         return IC.replaceOperand(
 691             II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
 692       }
 693     }
 694
 695     unsigned Offset;
 696     ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
 697     if (COffset) {
 698       Offset = COffset->getZExtValue();
 699       if (Offset >= IntSize) {
 700         return IC.replaceOperand(
 701             II, 1,
 702             ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
 703       }
 704     }
 705
 706     bool Signed = IID == Intrinsic::amdgcn_sbfe;
 707
 708     if (!CWidth || !COffset)
 709       break;
 710
 711     // The case of Width == 0 is handled above, which makes this transformation
 712     // safe.  If Width == 0, then the ashr and lshr instructions become poison
 713     // value since the shift amount would be equal to the bit size.
 714     assert(Width != 0);
 715
 716     // TODO: This allows folding to undef when the hardware has specific
 717     // behavior?
 718     if (Offset + Width < IntSize) {
 719       Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
 720       Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
 721                                  : IC.Builder.CreateLShr(Shl, IntSize - Width);
 722       RightShift->takeName(&II);
 723       return IC.replaceInstUsesWith(II, RightShift);
 724     }
 725
 726     Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
 727                                : IC.Builder.CreateLShr(Src, Offset);
 728
 729     RightShift->takeName(&II);
 730     return IC.replaceInstUsesWith(II, RightShift);
 731   }
 732   case Intrinsic::amdgcn_exp:
 733   case Intrinsic::amdgcn_exp_row:
 734   case Intrinsic::amdgcn_exp_compr: {
 735     ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
 736     unsigned EnBits = En->getZExtValue();
 737     if (EnBits == 0xf)
 738       break; // All inputs enabled.
 739
 740     bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
 741     bool Changed = false;
 742     for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
 743       if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
 744           (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
 745         Value *Src = II.getArgOperand(I + 2);
 746         if (!isa<UndefValue>(Src)) {
 747           IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
 748           Changed = true;
 749         }
 750       }
 751     }
 752
 753     if (Changed) {
 754       return &II;
 755     }
 756
 757     break;
 758   }
 759   case Intrinsic::amdgcn_fmed3: {
 760     // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
 761     // for the shader.
 762
 763     Value *Src0 = II.getArgOperand(0);
 764     Value *Src1 = II.getArgOperand(1);
 765     Value *Src2 = II.getArgOperand(2);
 766
 767     // Checking for NaN before canonicalization provides better fidelity when
 768     // mapping other operations onto fmed3 since the order of operands is
 769     // unchanged.
 770     Value *V = nullptr;
 771     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
 772       V = IC.Builder.CreateMinNum(Src1, Src2);
 773     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
 774       V = IC.Builder.CreateMinNum(Src0, Src2);
 775     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
 776       V = IC.Builder.CreateMaxNum(Src0, Src1);
 777     }
 778
 779     if (V) {
 780       if (auto *CI = dyn_cast<CallInst>(V)) {
 781         CI->copyFastMathFlags(&II);
 782         CI->takeName(&II);
 783       }
 784       return IC.replaceInstUsesWith(II, V);
 785     }
 786
 787     bool Swap = false;
 788     // Canonicalize constants to RHS operands.
 789     //
 790     // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
 791     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
 792       std::swap(Src0, Src1);
 793       Swap = true;
 794     }
 795
 796     if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
 797       std::swap(Src1, Src2);
 798       Swap = true;
 799     }
 800
 801     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
 802       std::swap(Src0, Src1);
 803       Swap = true;
 804     }
 805
 806     if (Swap) {
 807       II.setArgOperand(0, Src0);
 808       II.setArgOperand(1, Src1);
 809       II.setArgOperand(2, Src2);
 810       return &II;
 811     }
 812
 813     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
 814       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
 815         if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
 816           APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
 817                                        C2->getValueAPF());
 818           return IC.replaceInstUsesWith(
 819               II, ConstantFP::get(IC.Builder.getContext(), Result));
 820         }
 821       }
 822     }
 823
 824     if (!ST->hasMed3_16())
 825       break;
 826
 827     Value *X, *Y, *Z;
 828
 829     // Repeat floating-point width reduction done for minnum/maxnum.
 830     // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
 831     if (matchFPExtFromF16(Src0, X) && matchFPExtFromF16(Src1, Y) &&
 832         matchFPExtFromF16(Src2, Z)) {
 833       Value *NewCall = IC.Builder.CreateIntrinsic(IID, {X->getType()},
 834                                                   {X, Y, Z}, &II, II.getName());
 835       return new FPExtInst(NewCall, II.getType());
 836     }
 837
 838     break;
 839   }
 840   case Intrinsic::amdgcn_icmp:
 841   case Intrinsic::amdgcn_fcmp: {
 842     const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
 843     // Guard against invalid arguments.
 844     int64_t CCVal = CC->getZExtValue();
 845     bool IsInteger = IID == Intrinsic::amdgcn_icmp;
 846     if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
 847                        CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
 848         (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
 849                         CCVal > CmpInst::LAST_FCMP_PREDICATE)))
 850       break;
 851
 852     Value *Src0 = II.getArgOperand(0);
 853     Value *Src1 = II.getArgOperand(1);
 854
 855     if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
 856       if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
 857         Constant *CCmp = ConstantFoldCompareInstOperands(
 858             (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
 859         if (CCmp && CCmp->isNullValue()) {
 860           return IC.replaceInstUsesWith(
 861               II, IC.Builder.CreateSExt(CCmp, II.getType()));
 862         }
 863
 864         // The result of V_ICMP/V_FCMP assembly instructions (which this
 865         // intrinsic exposes) is one bit per thread, masked with the EXEC
 866         // register (which contains the bitmask of live threads). So a
 867         // comparison that always returns true is the same as a read of the
 868         // EXEC register.
 869         Function *NewF = Intrinsic::getDeclaration(
 870             II.getModule(), Intrinsic::read_register, II.getType());
 871         Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
 872         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
 873         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
 874         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
 875         NewCall->addFnAttr(Attribute::Convergent);
 876         NewCall->takeName(&II);
 877         return IC.replaceInstUsesWith(II, NewCall);
 878       }
 879
 880       // Canonicalize constants to RHS.
 881       CmpInst::Predicate SwapPred =
 882           CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
 883       II.setArgOperand(0, Src1);
 884       II.setArgOperand(1, Src0);
 885       II.setArgOperand(
 886           2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
 887       return &II;
 888     }
 889
 890     if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
 891       break;
 892
 893     // Canonicalize compare eq with true value to compare != 0
 894     // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
 895     //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
 896     // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
 897     //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
 898     Value *ExtSrc;
 899     if (CCVal == CmpInst::ICMP_EQ &&
 900         ((match(Src1, PatternMatch::m_One()) &&
 901           match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
 902          (match(Src1, PatternMatch::m_AllOnes()) &&
 903           match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
 904         ExtSrc->getType()->isIntegerTy(1)) {
 905       IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
 906       IC.replaceOperand(II, 2,
 907                         ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
 908       return &II;
 909     }
 910
 911     CmpInst::Predicate SrcPred;
 912     Value *SrcLHS;
 913     Value *SrcRHS;
 914
 915     // Fold compare eq/ne with 0 from a compare result as the predicate to the
 916     // intrinsic. The typical use is a wave vote function in the library, which
 917     // will be fed from a user code condition compared with 0. Fold in the
 918     // redundant compare.
 919
 920     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
 921     //   -> llvm.amdgcn.[if]cmp(a, b, pred)
 922     //
 923     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
 924     //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
 925     if (match(Src1, PatternMatch::m_Zero()) &&
 926         match(Src0, PatternMatch::m_ZExtOrSExt(
 927                         m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
 928                               PatternMatch::m_Value(SrcRHS))))) {
 929       if (CCVal == CmpInst::ICMP_EQ)
 930         SrcPred = CmpInst::getInversePredicate(SrcPred);
 931
 932       Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
 933                                  ? Intrinsic::amdgcn_fcmp
 934                                  : Intrinsic::amdgcn_icmp;
 935
 936       Type *Ty = SrcLHS->getType();
 937       if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
 938         // Promote to next legal integer type.
 939         unsigned Width = CmpType->getBitWidth();
 940         unsigned NewWidth = Width;
 941
 942         // Don't do anything for i1 comparisons.
 943         if (Width == 1)
 944           break;
 945
 946         if (Width <= 16)
 947           NewWidth = 16;
 948         else if (Width <= 32)
 949           NewWidth = 32;
 950         else if (Width <= 64)
 951           NewWidth = 64;
 952         else
 953           break; // Can't handle this.
 954
 955         if (Width != NewWidth) {
 956           IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
 957           if (CmpInst::isSigned(SrcPred)) {
 958             SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
 959             SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
 960           } else {
 961             SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
 962             SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
 963           }
 964         }
 965       } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
 966         break;
 967
 968       Function *NewF = Intrinsic::getDeclaration(
 969           II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
 970       Value *Args[] = {SrcLHS, SrcRHS,
 971                        ConstantInt::get(CC->getType(), SrcPred)};
 972       CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
 973       NewCall->takeName(&II);
 974       return IC.replaceInstUsesWith(II, NewCall);
 975     }
 976
 977     break;
 978   }
 979   case Intrinsic::amdgcn_mbcnt_hi: {
 980     // exec_hi is all 0, so this is just a copy.
 981     if (ST->isWave32())
 982       return IC.replaceInstUsesWith(II, II.getArgOperand(1));
 983     break;
 984   }
 985   case Intrinsic::amdgcn_ballot: {
 986     if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
 987       if (Src->isZero()) {
 988         // amdgcn.ballot(i1 0) is zero.
 989         return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
 990       }
 991     }
 992     if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
 993       // %b64 = call i64 ballot.i64(...)
 994       // =>
 995       // %b32 = call i32 ballot.i32(...)
 996       // %b64 = zext i32 %b32 to i64
 997       Value *Call = IC.Builder.CreateZExt(
 998           IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
 999                                      {IC.Builder.getInt32Ty()},
1000                                      {II.getArgOperand(0)}),
1001           II.getType());
1002       Call->takeName(&II);
1003       return IC.replaceInstUsesWith(II, Call);
1004     }
1005     break;
1006   }
1007   case Intrinsic::amdgcn_wqm_vote: {
1008     // wqm_vote is identity when the argument is constant.
1009     if (!isa<Constant>(II.getArgOperand(0)))
1010       break;
1011
1012     return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1013   }
1014   case Intrinsic::amdgcn_kill: {
1015     const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1016     if (!C || !C->getZExtValue())
1017       break;
1018
1019     // amdgcn.kill(i1 1) is a no-op
1020     return IC.eraseInstFromFunction(II);
1021   }
1022   case Intrinsic::amdgcn_update_dpp: {
1023     Value *Old = II.getArgOperand(0);
1024
1025     auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1026     auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1027     auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1028     if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
1029         BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
1030       break;
1031
1032     // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1033     return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
1034   }
1035   case Intrinsic::amdgcn_permlane16:
1036   case Intrinsic::amdgcn_permlane16_var:
1037   case Intrinsic::amdgcn_permlanex16:
1038   case Intrinsic::amdgcn_permlanex16_var: {
1039     // Discard vdst_in if it's not going to be read.
1040     Value *VDstIn = II.getArgOperand(0);
1041     if (isa<UndefValue>(VDstIn))
1042       break;
1043
1044     // FetchInvalid operand idx.
1045     unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1046                           IID == Intrinsic::amdgcn_permlanex16)
1047                              ? 4  /* for permlane16 and permlanex16 */
1048                              : 3; /* for permlane16_var and permlanex16_var */
1049
1050     // BoundCtrl operand idx.
1051     // For permlane16 and permlanex16 it should be 5
1052     // For Permlane16_var and permlanex16_var it should be 4
1053     unsigned int BcIdx = FiIdx + 1;
1054
1055     ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1056     ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1057     if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1058       break;
1059
1060     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
1061   }
1062   case Intrinsic::amdgcn_permlane64:
1063     // A constant value is trivially uniform.
1064     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
1065       return IC.replaceInstUsesWith(II, C);
1066     }
1067     break;
1068   case Intrinsic::amdgcn_readfirstlane:
1069   case Intrinsic::amdgcn_readlane: {
1070     // A constant value is trivially uniform.
1071     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
1072       return IC.replaceInstUsesWith(II, C);
1073     }
1074
1075     // The rest of these may not be safe if the exec may not be the same between
1076     // the def and use.
1077     Value *Src = II.getArgOperand(0);
1078     Instruction *SrcInst = dyn_cast<Instruction>(Src);
1079     if (SrcInst && SrcInst->getParent() != II.getParent())
1080       break;
1081
1082     // readfirstlane (readfirstlane x) -> readfirstlane x
1083     // readlane (readfirstlane x), y -> readfirstlane x
1084     if (match(Src,
1085               PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
1086       return IC.replaceInstUsesWith(II, Src);
1087     }
1088
1089     if (IID == Intrinsic::amdgcn_readfirstlane) {
1090       // readfirstlane (readlane x, y) -> readlane x, y
1091       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
1092         return IC.replaceInstUsesWith(II, Src);
1093       }
1094     } else {
1095       // readlane (readlane x, y), y -> readlane x, y
1096       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
1097                          PatternMatch::m_Value(),
1098                          PatternMatch::m_Specific(II.getArgOperand(1))))) {
1099         return IC.replaceInstUsesWith(II, Src);
1100       }
1101     }
1102
1103     break;
1104   }
1105   case Intrinsic::amdgcn_trig_preop: {
1106     // The intrinsic is declared with name mangling, but currently the
1107     // instruction only exists for f64
1108     if (!II.getType()->isDoubleTy())
1109       break;
1110
1111     Value *Src = II.getArgOperand(0);
1112     Value *Segment = II.getArgOperand(1);
1113     if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1114       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1115
1116     if (isa<UndefValue>(Src)) {
1117       auto *QNaN = ConstantFP::get(
1118           II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
1119       return IC.replaceInstUsesWith(II, QNaN);
1120     }
1121
1122     const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
1123     if (!Csrc)
1124       break;
1125
1126     if (II.isStrictFP())
1127       break;
1128
1129     const APFloat &Fsrc = Csrc->getValueAPF();
1130     if (Fsrc.isNaN()) {
1131       auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet());
1132       return IC.replaceInstUsesWith(II, Quieted);
1133     }
1134
1135     const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
1136     if (!Cseg)
1137       break;
1138
1139     unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
1140     unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
1141     unsigned Shift = SegmentVal * 53;
1142     if (Exponent > 1077)
1143       Shift += Exponent - 1077;
1144
1145     // 2.0/PI table.
1146     static const uint32_t TwoByPi[] = {
1147         0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1148         0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1149         0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1150         0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1151         0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1152         0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1153         0x56033046};
1154
1155     // Return 0 for outbound segment (hardware behavior).
1156     unsigned Idx = Shift >> 5;
1157     if (Idx + 2 >= std::size(TwoByPi)) {
1158       APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
1159       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
1160     }
1161
1162     unsigned BShift = Shift & 0x1f;
1163     uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
1164     uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
1165     if (BShift)
1166       Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
1167     Thi = Thi >> 11;
1168     APFloat Result = APFloat((double)Thi);
1169
1170     int Scale = -53 - Shift;
1171     if (Exponent >= 1968)
1172       Scale += 128;
1173
1174     Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
1175     return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
1176   }
1177   case Intrinsic::amdgcn_fmul_legacy: {
1178     Value *Op0 = II.getArgOperand(0);
1179     Value *Op1 = II.getArgOperand(1);
1180
1181     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1182     // infinity, gives +0.0.
1183     // TODO: Move to InstSimplify?
1184     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1185         match(Op1, PatternMatch::m_AnyZeroFP()))
1186       return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1187
1188     // If we can prove we don't have one of the special cases then we can use a
1189     // normal fmul instruction instead.
1190     if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1191       auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1192       FMul->takeName(&II);
1193       return IC.replaceInstUsesWith(II, FMul);
1194     }
1195     break;
1196   }
1197   case Intrinsic::amdgcn_fma_legacy: {
1198     Value *Op0 = II.getArgOperand(0);
1199     Value *Op1 = II.getArgOperand(1);
1200     Value *Op2 = II.getArgOperand(2);
1201
1202     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1203     // infinity, gives +0.0.
1204     // TODO: Move to InstSimplify?
1205     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1206         match(Op1, PatternMatch::m_AnyZeroFP())) {
1207       // It's tempting to just return Op2 here, but that would give the wrong
1208       // result if Op2 was -0.0.
1209       auto *Zero = ConstantFP::getZero(II.getType());
1210       auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1211       FAdd->takeName(&II);
1212       return IC.replaceInstUsesWith(II, FAdd);
1213     }
1214
1215     // If we can prove we don't have one of the special cases then we can use a
1216     // normal fma instead.
1217     if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1218       II.setCalledOperand(Intrinsic::getDeclaration(
1219           II.getModule(), Intrinsic::fma, II.getType()));
1220       return &II;
1221     }
1222     break;
1223   }
1224   case Intrinsic::amdgcn_is_shared:
1225   case Intrinsic::amdgcn_is_private: {
1226     if (isa<UndefValue>(II.getArgOperand(0)))
1227       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1228
1229     if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1230       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1231     break;
1232   }
1233   case Intrinsic::amdgcn_raw_buffer_store_format:
1234   case Intrinsic::amdgcn_struct_buffer_store_format:
1235   case Intrinsic::amdgcn_raw_tbuffer_store:
1236   case Intrinsic::amdgcn_struct_tbuffer_store:
1237   case Intrinsic::amdgcn_image_store_1d:
1238   case Intrinsic::amdgcn_image_store_1darray:
1239   case Intrinsic::amdgcn_image_store_2d:
1240   case Intrinsic::amdgcn_image_store_2darray:
1241   case Intrinsic::amdgcn_image_store_2darraymsaa:
1242   case Intrinsic::amdgcn_image_store_2dmsaa:
1243   case Intrinsic::amdgcn_image_store_3d:
1244   case Intrinsic::amdgcn_image_store_cube:
1245   case Intrinsic::amdgcn_image_store_mip_1d:
1246   case Intrinsic::amdgcn_image_store_mip_1darray:
1247   case Intrinsic::amdgcn_image_store_mip_2d:
1248   case Intrinsic::amdgcn_image_store_mip_2darray:
1249   case Intrinsic::amdgcn_image_store_mip_3d:
1250   case Intrinsic::amdgcn_image_store_mip_cube: {
1251     if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
1252       break;
1253
1254     APInt DemandedElts;
1255     if (ST->hasDefaultComponentBroadcast())
1256       DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
1257     else if (ST->hasDefaultComponentZero())
1258       DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
1259     else
1260       break;
1261
1262     int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
1263     if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1264                                               false)) {
1265       return IC.eraseInstFromFunction(II);
1266     }
1267
1268     break;
1269   }
1270   }
1271   if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1272             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1273     return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1274   }
1275   return std::nullopt;
1276 }
1277
1278 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1279 ///
1280 /// The result of simplifying amdgcn image and buffer store intrinsics is updating
1281 /// definitions of the intrinsics vector argument, not Uses of the result like
1282 /// image and buffer loads.
1283 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1284 ///       struct returns.
1285 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1286                                                     IntrinsicInst &II,
1287                                                     APInt DemandedElts,
1288                                                     int DMaskIdx, bool IsLoad) {
1289
1290   auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
1291                                              : II.getOperand(0)->getType());
1292   unsigned VWidth = IIVTy->getNumElements();
1293   if (VWidth == 1)
1294     return nullptr;
1295   Type *EltTy = IIVTy->getElementType();
1296
1297   IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1298   IC.Builder.SetInsertPoint(&II);
1299
1300   // Assume the arguments are unchanged and later override them, if needed.
1301   SmallVector<Value *, 16> Args(II.args());
1302
1303   if (DMaskIdx < 0) {
1304     // Buffer case.
1305
1306     const unsigned ActiveBits = DemandedElts.getActiveBits();
1307     const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1308
1309     // Start assuming the prefix of elements is demanded, but possibly clear
1310     // some other bits if there are trailing zeros (unused components at front)
1311     // and update offset.
1312     DemandedElts = (1 << ActiveBits) - 1;
1313
1314     if (UnusedComponentsAtFront > 0) {
1315       static const unsigned InvalidOffsetIdx = 0xf;
1316
1317       unsigned OffsetIdx;
1318       switch (II.getIntrinsicID()) {
1319       case Intrinsic::amdgcn_raw_buffer_load:
1320       case Intrinsic::amdgcn_raw_ptr_buffer_load:
1321         OffsetIdx = 1;
1322         break;
1323       case Intrinsic::amdgcn_s_buffer_load:
1324         // If resulting type is vec3, there is no point in trimming the
1325         // load with updated offset, as the vec3 would most likely be widened to
1326         // vec4 anyway during lowering.
1327         if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1328           OffsetIdx = InvalidOffsetIdx;
1329         else
1330           OffsetIdx = 1;
1331         break;
1332       case Intrinsic::amdgcn_struct_buffer_load:
1333       case Intrinsic::amdgcn_struct_ptr_buffer_load:
1334         OffsetIdx = 2;
1335         break;
1336       default:
1337         // TODO: handle tbuffer* intrinsics.
1338         OffsetIdx = InvalidOffsetIdx;
1339         break;
1340       }
1341
1342       if (OffsetIdx != InvalidOffsetIdx) {
1343         // Clear demanded bits and update the offset.
1344         DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1345         auto *Offset = Args[OffsetIdx];
1346         unsigned SingleComponentSizeInBits =
1347             IC.getDataLayout().getTypeSizeInBits(EltTy);
1348         unsigned OffsetAdd =
1349             UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1350         auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1351         Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1352       }
1353     }
1354   } else {
1355     // Image case.
1356
1357     ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
1358     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1359
1360     // dmask 0 has special semantics, do not simplify.
1361     if (DMaskVal == 0)
1362       return nullptr;
1363
1364     // Mask off values that are undefined because the dmask doesn't cover them
1365     DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
1366
1367     unsigned NewDMaskVal = 0;
1368     unsigned OrigLdStIdx = 0;
1369     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1370       const unsigned Bit = 1 << SrcIdx;
1371       if (!!(DMaskVal & Bit)) {
1372         if (!!DemandedElts[OrigLdStIdx])
1373           NewDMaskVal |= Bit;
1374         OrigLdStIdx++;
1375       }
1376     }
1377
1378     if (DMaskVal != NewDMaskVal)
1379       Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1380   }
1381
1382   unsigned NewNumElts = DemandedElts.popcount();
1383   if (!NewNumElts)
1384     return PoisonValue::get(IIVTy);
1385
1386   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1387     if (DMaskIdx >= 0)
1388       II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1389     return nullptr;
1390   }
1391
1392   // Validate function argument and return types, extracting overloaded types
1393   // along the way.
1394   SmallVector<Type *, 6> OverloadTys;
1395   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1396     return nullptr;
1397
1398   Type *NewTy =
1399       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1400   OverloadTys[0] = NewTy;
1401
1402   if (!IsLoad) {
1403     SmallVector<int, 8> EltMask;
1404     for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1405       if (DemandedElts[OrigStoreIdx])
1406         EltMask.push_back(OrigStoreIdx);
1407
1408     if (NewNumElts == 1)
1409       Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
1410     else
1411       Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
1412   }
1413
1414   Function *NewIntrin = Intrinsic::getDeclaration(
1415       II.getModule(), II.getIntrinsicID(), OverloadTys);
1416   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
1417   NewCall->takeName(&II);
1418   NewCall->copyMetadata(II);
1419
1420   if (IsLoad) {
1421     if (NewNumElts == 1) {
1422       return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
1423                                             DemandedElts.countr_zero());
1424     }
1425
1426     SmallVector<int, 8> EltMask;
1427     unsigned NewLoadIdx = 0;
1428     for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1429       if (!!DemandedElts[OrigLoadIdx])
1430         EltMask.push_back(NewLoadIdx++);
1431       else
1432         EltMask.push_back(NewNumElts);
1433     }
1434
1435     auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1436
1437     return Shuffle;
1438   }
1439
1440   return NewCall;
1441 }
1442
1443 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1444     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1445     APInt &UndefElts2, APInt &UndefElts3,
1446     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1447         SimplifyAndSetOp) const {
1448   switch (II.getIntrinsicID()) {
1449   case Intrinsic::amdgcn_raw_buffer_load:
1450   case Intrinsic::amdgcn_raw_ptr_buffer_load:
1451   case Intrinsic::amdgcn_raw_buffer_load_format:
1452   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
1453   case Intrinsic::amdgcn_raw_tbuffer_load:
1454   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
1455   case Intrinsic::amdgcn_s_buffer_load:
1456   case Intrinsic::amdgcn_struct_buffer_load:
1457   case Intrinsic::amdgcn_struct_ptr_buffer_load:
1458   case Intrinsic::amdgcn_struct_buffer_load_format:
1459   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
1460   case Intrinsic::amdgcn_struct_tbuffer_load:
1461   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
1462     return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1463   default: {
1464     if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1465       return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1466     }
1467     break;
1468   }
1469   }
1470   return std::nullopt;
1471 }