//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"
/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(Constant *V) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
                            V);
  return V;
}
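// Illustrative example (not from the original source): for a constant mask
// such as <4 x i32> <i32 -1, i32 7, i32 -8, i32 0>, only elements 0 and 2
// have their sign bit set, so the helper above yields
// <4 x i1> <i1 true, i1 false, i1 true, i1 false>.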
/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
  if (PatternMatch::match(
          Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}
// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    // First, cast the x86 intrinsic scalar pointer to a vector pointer to
    // match the LLVM intrinsic definition for the pointer argument.
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    // The pass-through vector for an x86 masked load is a zero vector.
    CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
        II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
    return IC.replaceInstUsesWith(II, NewMaskedLoad);
  }

  return nullptr;
}
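// Sketch of the rewrite above (illustrative IR, not from the original file):
//   %v = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %p, <4 x i32> %m)
// becomes, once %m is known to be a sign-extended bool vector %b,
//   %v = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 1,
//                                                    <4 x i1> %b,
//                                                    <4 x float> zeroinitializer)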
// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Zero Mask - this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

    // 'Replace uses' doesn't work for stores. Erase the original masked store.
    IC.eraseInstFromFunction(II);
    return true;
  }

  return false;
}
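// Sketch of the store rewrite above (illustrative IR, not from the original
// file): with a sign-extended bool-vector mask %b,
//   call void @llvm.x86.avx.maskstore.ps(ptr %p, <4 x i32> %m, <4 x float> %v)
// is replaced by
//   call void @llvm.masked.store.v4f32.p0(<4 x float> %v, ptr %p, i32 1,
//                                         <4 x i1> %b)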
static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(Vec->getType());
  Type *SVT = VT->getElementType();
  Type *AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift. If it's guaranteed to be out of range, logical shifts
  // combine to zero and arithmetic shifts are clamped to (BitWidth - 1).
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits =
        llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits = llvm::computeKnownBits(
        Amt, DemandedLower, II.getModule()->getDataLayout());
    KnownBits KnownUpperBits = llvm::computeKnownBits(
        Amt, DemandedUpper, II.getModule()->getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
  auto *CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
  // operand to compute the shift amount.
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isZero())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);
  return Builder.CreateAShr(Vec, ShiftVec);
}
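// Worked example of the folds above (illustrative, not from the original
// source): for a 32-bit element type, psrli.d by 7 becomes an `lshr` by a
// splat of 7, psrli.d by 33 folds to zeroinitializer, and psrai.d by 33 is
// clamped to an `ashr` by 31 (splatting the sign bit).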
// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(II.getType());
  Type *SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
  KnownBits KnownAmt =
      llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
  if (KnownAmt.getMaxValue().ult(BitWidth)) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);
  return Builder.CreateAShr(Vec, ShiftVec);
}
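// Worked example (illustrative, not from the original source): psrlv.d with a
// constant amount vector <i32 3, i32 8, i32 undef, i32 1> folds to a plain
//   lshr <4 x i32> %v, <i32 3, i32 8, i32 undef, i32 1>
// whereas <i32 3, i32 40, i32 undef, i32 1> is left alone, because lane 40 is
// out of range for i32 and only some lanes would need the zeroing behaviour.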
static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getZero(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}
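// Worked example of the clamping above (illustrative, not from the original
// source): packing i16 constants with PACKSSWB saturates 300 -> 127 and
// -200 -> -128 before the truncate to i8, while PACKUSWB clamps to [0, 255],
// so -200 -> 0 and 300 -> 255.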
static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
  // We can't easily peek through x86_mmx types.
  if (!ArgTy)
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  // %int = bitcast <16 x i1> %cmp to i16
  // %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
  Res = Builder.CreateIsNeg(Res);
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}
static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, PatternMatch::m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = PoisonValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}
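// Sketch of the rewrite above (illustrative IR, not from the original file):
//   %r = call { i8, i64 } @llvm.x86.addcarry.64(i8 0, i64 %a, i64 %b)
// becomes
//   %u   = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
//   %sum = extractvalue { i64, i1 } %u, 0
//   %o   = extractvalue { i64, i1 } %u, 1
//   %ov  = zext i1 %o to i8
// and the pair { i8 %ov, i64 %sum } is rebuilt with insertvalue.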
static Value *simplifyTernarylogic(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {
  auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
  if (!ArgImm || ArgImm->getValue().uge(256))
    return nullptr;

  Value *ArgA = II.getArgOperand(0);
  Value *ArgB = II.getArgOperand(1);
  Value *ArgC = II.getArgOperand(2);

  Type *Ty = II.getType();

  auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
  };
  auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
  };
  auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
  };
  auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateNot(V.first), ~V.second};
  };
  auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
  auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
  auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };

  bool AIsConst = match(ArgA, PatternMatch::m_ImmConstant());
  bool BIsConst = match(ArgB, PatternMatch::m_ImmConstant());
  bool CIsConst = match(ArgC, PatternMatch::m_ImmConstant());

  bool ABIsConst = AIsConst && BIsConst;
  bool ACIsConst = AIsConst && CIsConst;
  bool BCIsConst = BIsConst && CIsConst;
  bool ABCIsConst = AIsConst && BIsConst && CIsConst;

  // Use for verification. It's a big table. It's difficult to go from Imm ->
  // logic ops, but easy to verify that a set of logic ops is correct. We track
  // the logic ops through the second value in the pair. At the end it should
  // equal Imm.
  std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
  std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
  std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
  std::pair<Value *, uint8_t> Res = {nullptr, 0};
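  // Illustrative note (not from the original source): 0xf0, 0xcc and 0xaa are
  // the three columns of the 8-row truth table over (A, B, C), so evaluating a
  // candidate expression on these bytes reproduces its immediate. For example
  // the majority function (A & B) | (A & C) | (B & C) evaluates to
  // (0xf0 & 0xcc) | (0xf0 & 0xaa) | (0xcc & 0xaa) == 0xc0 | 0xa0 | 0x88 ==
  // 0xe8, which is exactly the vpternlog immediate for "majority".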
  // Currently we only handle cases that convert directly to another
  // instruction or cases where all the ops are constant. This is because we
  // don't properly handle creating ternary ops in the backend, so splitting
  // them here may cause regressions. As the backend improves, uncomment more
  // cases.

  uint8_t Imm = ArgImm->getValue().getZExtValue();

  Res = {Constant::getNullValue(Ty), 0};
  Res = Nor(Or(A, B), C);
  Res = And(Nor(A, B), C);
  Res = And(Nor(A, C), B);
  Res = Nor(A, Xnor(B, C));
  Res = Nor(A, And(B, C));
  Res = Nor(A, Nand(B, C));
  Res = Nor(A, Xor(B, C));
  Res = Nor(A, Not(C));
  Res = Nor(A, Nor(C, Not(B)));
  Res = Nor(A, Not(B));
  Res = Nor(A, Nor(B, Not(C)));
  Res = Nor(A, Nor(B, C));
  Res = And(A, Nor(B, C));
  Res = Nor(Xnor(A, C), B);
  Res = Nor(And(A, C), B);
  Res = Nor(Xnor(A, B), C);
  Res = Nor(And(A, B), C);
  Res = Xor(Xor(A, B), And(Nand(A, B), C));
  Res = Xor(Or(A, B), Or(Xnor(A, B), C));
  Res = Nor(Xnor(A, B), Xnor(A, C));
  Res = And(Nand(A, B), Xnor(B, C));
  Res = Xor(A, Or(And(A, B), C));
  Res = Xor(A, Or(Xnor(A, B), C));
  Res = Xor(A, Or(And(A, C), B));
  Res = Xor(A, Or(Xnor(A, C), B));
  Res = Xor(A, Or(B, C));
  Res = Nand(A, Or(B, C));
  Res = Nor(Nand(A, C), B);
  Res = Nor(Xor(A, C), B);
  Res = Nor(B, Not(C));
  Res = Nor(B, Nor(C, Not(A)));
  Res = Nor(Xnor(A, B), Xor(A, C));
  Res = Xor(A, Nand(Nand(A, B), C));
  Res = And(Nand(A, B), Xor(B, C));
  Res = Xor(Or(Xnor(A, B), C), B);
  Res = And(Xor(A, B), C);
  Res = Xor(Xor(A, B), Nor(And(A, B), C));
  Res = And(Nand(A, B), C);
  Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
  Res = Nor(Xnor(A, B), Nor(B, C));
  Res = Xor(A, Or(B, Not(C)));
  Res = Xor(A, Or(Xor(A, C), B));
  Res = Nand(A, Or(B, Not(C)));
  Res = Nor(B, Not(A));
  Res = Nor(Nor(A, Not(C)), B);
  Res = Nor(Nor(A, C), B);
  Res = And(Xor(A, B), Nand(B, C));
  Res = Xor(B, Or(A, Xnor(B, C)));
  Res = Xor(Or(A, C), B);
  Res = Nand(Or(A, C), B);
  Res = Nor(Xnor(A, B), Nor(A, C));
  Res = Xor(Or(A, Not(C)), B);
  Res = Xor(B, Or(A, Xor(B, C)));
  Res = Nand(Or(A, Not(C)), B);
  Res = Xor(A, Or(Nor(A, C), B));
  Res = Xor(A, Or(Nor(A, Not(C)), B));
  Res = Nor(Nand(A, B), C);
  Res = Nor(Xor(A, B), C);
  Res = Nor(Xor(A, B), Xnor(A, C));
  Res = Xor(A, Nand(Nand(A, C), B));
  Res = Nor(C, Not(B));
  Res = Nor(Nor(B, Not(A)), C);
  Res = Xor(Or(And(A, C), B), C);
  Res = Xor(Or(Xnor(A, C), B), C);
  Res = And(Xor(A, C), B);
  Res = Xor(Or(Xnor(A, B), And(A, C)), C);
  Res = Nor(Xnor(A, C), Nor(B, C));
  Res = Xor(A, Or(C, Not(B)));
  Res = And(Nand(A, C), B);
  Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
  Res = Xor(A, Or(Xor(A, B), C));
  Res = Nand(A, Nand(B, Not(C)));
  Res = Nor(C, Not(A));
  Res = Nor(Nor(A, Not(B)), C);
  Res = And(Xor(A, C), Nand(B, C));
  Res = Xor(Or(Xnor(B, C), A), C);
  Res = Nor(Nor(A, B), C);
  Res = Xor(Or(A, B), C);
  Res = Nand(Or(A, B), C);
  Res = Nor(Nor(A, B), Xnor(A, C));
  Res = Xor(Or(A, Not(B)), C);
  Res = Xor(A, Or(Nor(A, B), C));
  Res = Xor(Or(Xor(B, C), A), C);
  Res = Nand(Or(A, Not(B)), C);
  Res = Xor(A, Or(Nor(A, Not(B)), C));
  Res = And(A, Xor(B, C));
  Res = Xor(Or(Xnor(A, B), And(B, C)), C);
  Res = Nor(Nor(A, C), Xnor(B, C));
  Res = Xor(B, Or(C, Not(A)));
  Res = Nor(Nor(A, B), Xnor(B, C));
  Res = Xor(Or(B, Not(A)), C);
  Res = Or(Nor(A, B), Xor(B, C));
  Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
  Res = Xor(Xnor(A, B), C);
  Res = Xor(And(A, B), C);
  Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
  Res = Xor(And(A, C), B);
  Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
  Res = Or(Nor(A, Not(B)), Xor(B, C));
  Res = Nand(A, Xnor(B, C));
  Res = And(A, Nand(B, C));
  Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
  Res = Xor(Or(Xor(A, B), C), B);
  Res = Nand(Nand(A, Not(C)), B);
  Res = Xor(Or(Xor(A, C), B), C);
  Res = Nand(Nand(A, Not(B)), C);
  Res = Xor(B, Or(Nor(B, Not(A)), C));
  Res = Xor(A, And(B, C));
  Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
  Res = Or(Xor(A, C), Nor(B, Not(A)));
  Res = Nand(Xnor(A, C), B);
  Res = Or(Xor(A, B), Nor(C, Not(A)));
  Res = Nand(Xnor(A, B), C);
  Res = Or(Xor(A, B), Xor(A, C));
  Res = Nand(And(A, B), C);
  Res = And(And(A, B), C);
  Res = Nor(Xor(A, B), Xor(A, C));
  Res = And(Xnor(A, B), C);
  Res = Nor(Xor(A, B), Nor(C, Not(A)));
  Res = And(Xnor(A, C), B);
  Res = Nor(Xor(A, C), Nor(B, Not(A)));
  Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
  Res = Xor(A, Nand(B, C));
  Res = Xor(B, Nor(Nor(B, Not(A)), C));
  Res = And(Nand(A, Not(B)), C);
  Res = Xor(Nor(Xor(A, C), B), C);
  Res = And(Nand(A, Not(C)), B);
  Res = Xor(Nor(Xor(A, B), C), B);
  Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
  Res = Nand(A, Nand(B, C));
  Res = And(A, Xnor(B, C));
  Res = Nor(Nor(A, Not(B)), Xor(B, C));
  Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
  Res = Xor(Nand(A, C), B);
  Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
  Res = Xor(Nand(A, B), C);
  Res = Xor(Xor(A, B), C);
  Res = Xor(Xor(A, B), Or(Nor(A, B), C));
  Res = Nor(Nor(A, B), Xor(B, C));
  Res = Xor(Nor(B, Not(A)), C);
  Res = Or(Nor(A, B), Xnor(B, C));
  Res = Xor(B, Nor(C, Not(A)));
  Res = Or(Nor(A, C), Xnor(B, C));
  Res = Xor(And(Xor(A, B), Nand(B, C)), C);
  Res = Nand(A, Xor(B, C));
  Res = Xor(A, Nor(Nor(A, Not(B)), C));
  Res = And(Or(A, Not(B)), C);
  Res = Xor(Nor(Xor(B, C), A), C);
  Res = Xor(A, Nor(Nor(A, B), C));
  Res = Xor(Nor(A, Not(B)), C);
  Res = Or(Nor(A, B), Xnor(A, C));
  Res = And(Or(A, B), C);
  Res = Xor(Nor(A, B), C);
  Res = Or(Nor(A, B), C);
  Res = Xor(Nor(Xnor(B, C), A), C);
  Res = Or(Xnor(A, C), And(B, C));
  Res = Or(Nor(A, Not(B)), C);
  Res = Or(C, Not(A));
  Res = And(A, Nand(B, Not(C)));
  Res = Xor(A, Nor(Xor(A, B), C));
  Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
  Res = Nand(Nand(A, C), B);
  Res = Xor(A, Nor(C, Not(B)));
  Res = Or(Xnor(A, C), Nor(B, C));
  Res = Xor(And(Xor(A, B), Nand(A, C)), C);
  Res = Nand(Xor(A, C), B);
  Res = Xor(Nor(Xnor(A, C), B), C);
  Res = Xor(Nor(And(A, C), B), C);
  Res = Or(Nor(B, Not(A)), C);
  Res = Or(C, Not(B));
  Res = Xor(A, And(Nand(A, C), B));
  Res = Or(Xor(A, B), Xnor(A, C));
  Res = Or(Xor(A, B), C);
  Res = Or(Nand(A, B), C);
  Res = Xor(A, Nor(Nor(A, Not(C)), B));
  Res = Xor(A, Nor(Nor(A, C), B));
  Res = And(Or(A, Not(C)), B);
  Res = Xor(B, Nor(A, Xor(B, C)));
  Res = Xor(Nor(A, Not(C)), B);
  Res = Or(Xnor(A, B), Nor(A, C));
  Res = And(Or(A, C), B);
  Res = Xor(Nor(A, C), B);
  Res = Xor(B, Nor(A, Xnor(B, C)));
  Res = Or(Xnor(A, B), And(B, C));
  Res = Or(Nor(A, C), B);
  Res = Or(Nor(A, Not(C)), B);
  Res = Or(B, Not(A));
  Res = And(A, Or(B, Not(C)));
  Res = Xor(A, Nor(Xor(A, C), B));
  Res = Xor(A, Nor(B, Not(C)));
  Res = Or(Xnor(A, B), Nor(B, C));
  Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
  Res = Nand(Nand(A, B), C);
  Res = Xor(Xor(A, B), Or(And(A, B), C));
  Res = Nand(Xor(A, B), C);
  Res = Xor(Nor(Xnor(A, B), C), B);
  Res = Or(And(A, B), Xnor(B, C));
  Res = Xor(A, And(Nand(A, B), C));
  Res = Or(Xnor(A, B), Xor(A, C));
  Res = Or(B, Nor(C, Not(A)));
  Res = Or(B, Not(C));
  Res = Or(Xor(A, C), B);
  Res = Or(Nand(A, C), B);
  Res = And(A, Or(B, C));
  Res = Xor(A, Nor(B, C));
  Res = Xor(A, Nor(Xnor(A, C), B));
  Res = Xor(A, Nor(And(A, C), B));
  Res = Xor(A, Nor(Xnor(A, B), C));
  Res = Xor(A, Nor(And(A, B), C));
  Res = Or(And(A, B), Xor(B, C));
  Res = Or(Xnor(A, B), Xnor(A, C));
  Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
  Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
  Res = Or(And(A, B), C);
  Res = Or(Xnor(A, B), C);
  Res = Or(And(A, C), B);
  Res = Or(Xnor(A, C), B);
  Res = Nand(A, Nor(B, C));
  Res = Or(A, Nor(B, C));
  Res = Or(A, Nor(B, Not(C)));
  Res = Or(A, Not(B));
  Res = Or(A, Nor(C, Not(B)));
  Res = Or(A, Not(C));
  Res = Or(A, Xor(B, C));
  Res = Or(A, Nand(B, C));
  Res = Or(A, And(B, C));
  Res = Or(A, Xnor(B, C));
  Res = Nand(Nor(A, C), B);
  Res = Nand(Nor(A, B), C);
  Res = Or(Or(A, B), C);
  Res = {Constant::getAllOnesValue(Ty), 0xff};

  assert((Res.first == nullptr || Res.second == Imm) &&
         "Simplification of ternary logic does not verify!");
  return Res.first;
}
static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //    [3:0] - zero mask for each 32-bit lane
  //    [5:4] - select one 32-bit destination lane
  //    [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;
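  // Worked example of the decode above (illustrative, not from the original
  // source): Imm == 0x4A == 0b01'00'1010 gives SourceLane = 1, DestLane = 0
  // and ZMask = 0b1010, i.e. insert source lane 1 into destination lane 0 and
  // zero destination lanes 1 and 3.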
  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  int ShuffleMask[4] = {0, 1, 2, 3};

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}
/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are inserting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      auto *ShufTy = FixedVectorType::get(IntTy8, 16);

      SmallVector<int, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ShuffleMask);
      return Builder.CreateBitCast(SV, II.getType());
    }
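    // Worked example of the shuffle conversion above (illustrative, not from
    // the original source): EXTRQI with Length = 16 and Index = 8 extracts
    // bytes 1-2 of the source, so the v16i8 shuffle mask is
    // <1, 2, 18, 19, 20, 21, 22, 23, undef x 8>, where indices >= 16 select
    // bytes from the zero vector.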
    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}
/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    auto *ShufTy = FixedVectorType::get(IntTy8, 16);

    SmallVector<int, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(i);
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(i + 16);
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(i);
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(-1);

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ShuffleMask);
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *C1 = dyn_cast<Constant>(Op1);
  auto *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  auto *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}
/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shuffle.

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
}
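// Worked example (illustrative, not from the original source): for pshufb on
// <16 x i8> with a constant control byte 0x05 in lane 2, result byte 2 takes
// source byte 5; a control byte with bit 7 set (e.g. 0x83) maps to an index
// >= 16, i.e. a byte from the all-zero second shuffle operand.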
/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = Index.getZExtValue();
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
}
/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
}
std::optional<Instruction *>
X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
                                             unsigned DemandedWidth) {
    APInt UndefElts(Width, 0);
    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
    return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
  };

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::x86_bmi_bextr_32:
  case Intrinsic::x86_bmi_bextr_64:
  case Intrinsic::x86_tbm_bextri_u32:
  case Intrinsic::x86_tbm_bextri_u64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Shift = C->getZExtValue();
      uint64_t Length = (Shift >> 8) & 0xff;
      Shift &= 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      // If the length is 0 or the shift is out of range, replace with zero.
      if (Length == 0 || Shift >= BitWidth) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue() >> Shift;
        if (Length > BitWidth)
          Length = BitWidth;
        Result &= maskTrailingOnes<uint64_t>(Length);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
      // are only masking bits that a shift already cleared?
    }
    break;
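  // Worked example of the constant fold above (illustrative, not from the
  // original source): bextr(0x12345678, ctrl) with start = 4 and length = 8
  // (ctrl = 0x0804) yields (0x12345678 >> 4) & 0xff == 0x67.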
  case Intrinsic::x86_bmi_bzhi_32:
  case Intrinsic::x86_bmi_bzhi_64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Index = C->getZExtValue() & 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      if (Index >= BitWidth) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (Index == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue();
        Result &= maskTrailingOnes<uint64_t>(Index);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we convert this to an AND if the RHS is constant?
    }
    break;
  case Intrinsic::x86_bmi_pext_32:
  case Intrinsic::x86_bmi_pext_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
        return IC.replaceInstUsesWith(II, Shifted);
      }
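      // Worked example of the shifted-mask fold above (illustrative, not from
      // the original source): pext(x, 0x00ff0000) is a single contiguous mask
      // at bit 16, so it simplifies to (x & 0x00ff0000) >> 16.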
      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToSet = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToTest = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;
          BitToSet <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_bmi_pdep_32:
  case Intrinsic::x86_bmi_pdep_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
        return IC.replaceInstUsesWith(II, Masked);
      }
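      // Likewise for pdep (illustrative, not from the original source):
      // pdep(x, 0x00ff0000) simplifies to (x << 16) & 0x00ff0000.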
      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToTest = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToSet = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;
          BitToTest <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_sse_cvtss2si:
  case Intrinsic::x86_sse_cvtss2si64:
  case Intrinsic::x86_sse_cvttss2si:
  case Intrinsic::x86_sse_cvttss2si64:
  case Intrinsic::x86_sse2_cvtsd2si:
  case Intrinsic::x86_sse2_cvtsd2si64:
  case Intrinsic::x86_sse2_cvttsd2si:
  case Intrinsic::x86_sse2_cvttsd2si64:
  case Intrinsic::x86_avx512_vcvtss2si32:
  case Intrinsic::x86_avx512_vcvtss2si64:
  case Intrinsic::x86_avx512_vcvtss2usi32:
  case Intrinsic::x86_avx512_vcvtss2usi64:
  case Intrinsic::x86_avx512_vcvtsd2si32:
  case Intrinsic::x86_avx512_vcvtsd2si64:
  case Intrinsic::x86_avx512_vcvtsd2usi32:
  case Intrinsic::x86_avx512_vcvtsd2usi64:
  case Intrinsic::x86_avx512_cvttss2si:
  case Intrinsic::x86_avx512_cvttss2si64:
  case Intrinsic::x86_avx512_cvttss2usi:
  case Intrinsic::x86_avx512_cvttss2usi64:
  case Intrinsic::x86_avx512_cvttsd2si:
  case Intrinsic::x86_avx512_cvttsd2si64:
  case Intrinsic::x86_avx512_cvttsd2usi:
  case Intrinsic::x86_avx512_cvttsd2usi64: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    Value *Arg = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }
  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx2_pmovmskb:
    if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomineq_sd:
  case Intrinsic::x86_avx512_vcomi_ss:
  case Intrinsic::x86_avx512_vcomi_sd:
  case Intrinsic::x86_avx512_mask_cmp_ss:
  case Intrinsic::x86_avx512_mask_cmp_sd: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    bool MadeChange = false;
    Value *Arg0 = II.getArgOperand(0);
    Value *Arg1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }
  case Intrinsic::x86_avx512_add_ps_512:
  case Intrinsic::x86_avx512_div_ps_512:
  case Intrinsic::x86_avx512_mul_ps_512:
  case Intrinsic::x86_avx512_sub_ps_512:
  case Intrinsic::x86_avx512_add_pd_512:
  case Intrinsic::x86_avx512_div_pd_512:
  case Intrinsic::x86_avx512_mul_pd_512:
  case Intrinsic::x86_avx512_sub_pd_512:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // FP math ops.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      if (R->getValue() == 4) {
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_add_ps_512:
        case Intrinsic::x86_avx512_add_pd_512:
          V = IC.Builder.CreateFAdd(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_sub_ps_512:
        case Intrinsic::x86_avx512_sub_pd_512:
          V = IC.Builder.CreateFSub(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_mul_ps_512:
        case Intrinsic::x86_avx512_mul_pd_512:
          V = IC.Builder.CreateFMul(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_div_ps_512:
        case Intrinsic::x86_avx512_div_pd_512:
          V = IC.Builder.CreateFDiv(Arg0, Arg1);
          break;
        }
        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;
  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // FP math ops.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
      if (R->getValue() == 4) {
        // Extract the element as scalars.
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);
        Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
        Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_mask_add_ss_round:
        case Intrinsic::x86_avx512_mask_add_sd_round:
          V = IC.Builder.CreateFAdd(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_sub_ss_round:
        case Intrinsic::x86_avx512_mask_sub_sd_round:
          V = IC.Builder.CreateFSub(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_mul_ss_round:
        case Intrinsic::x86_avx512_mask_mul_sd_round:
          V = IC.Builder.CreateFMul(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_div_ss_round:
        case Intrinsic::x86_avx512_mask_div_sd_round:
          V = IC.Builder.CreateFDiv(LHS, RHS);
          break;
        }

        // Handle the masking aspect of the intrinsic.
        Value *Mask = II.getArgOperand(3);
        auto *C = dyn_cast<ConstantInt>(Mask);
        // We don't need a select if we know the mask bit is a 1.
        if (!C || !C->getValue()[0]) {
          // Cast the mask to an i1 vector and then extract the lowest element.
          auto *MaskTy = FixedVectorType::get(
              IC.Builder.getInt1Ty(),
              cast<IntegerType>(Mask->getType())->getBitWidth());
          Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
          Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
          // Extract the lowest element from the passthru operand.
          Value *Passthru =
              IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
          V = IC.Builder.CreateSelect(Mask, V, Passthru);
        }

        // Insert the result back into the original argument 0.
        V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  // Constant fold ashr( <A x Bi>, Ci ).
  // Constant fold lshr( <A x Bi>, Ci ).
  // Constant fold shl( <A x Bi>, Ci ).
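  // e.g. (sketch):
  //   @llvm.x86.sse2.psrai.d(%v, i32 3) --> ashr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>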
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512: {
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
    // operand to compute the shift amount.
    Value *Arg1 = II.getArgOperand(1);
    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
           "Unexpected packed shift size");
    unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();

    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
      return IC.replaceOperand(II, 1, V);
    }
    break;
  }
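
  // Per-element (variable) shifts fold to plain IR shifts when the shift
  // amounts are known to be in range, e.g. (sketch):
  //   @llvm.x86.avx2.psllv.d(%v, <i32 1, i32 2, i32 3, i32 4>)
  //     --> shl <4 x i32> %v, <i32 1, i32 2, i32 3, i32 4>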
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    if (Value *V = simplifyX86varShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_pclmulqdq:
  case Intrinsic::x86_pclmulqdq_256:
  case Intrinsic::x86_pclmulqdq_512: {
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      unsigned Imm = C->getZExtValue();
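
      // PCLMULQDQ multiplies one 64-bit half of each source: Imm bit 0 picks
      // the high (1) or low (0) half of operand 0 and Imm bit 4 picks the half
      // of operand 1 (per 128-bit lane), so only that element of each pair is
      // demanded below.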
      bool MadeChange = false;
      Value *Arg0 = II.getArgOperand(0);
      Value *Arg1 = II.getArgOperand(1);
      unsigned VWidth =
          cast<FixedVectorType>(Arg0->getType())->getNumElements();

      APInt UndefElts1(VWidth, 0);
      APInt DemandedElts1 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
        IC.replaceOperand(II, 0, V);
        MadeChange = true;
      }

      APInt UndefElts2(VWidth, 0);
      APInt DemandedElts2 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
        IC.replaceOperand(II, 1, V);
        MadeChange = true;
      }

      // If either input elements are undef, the result is zero.
      if (DemandedElts1.isSubsetOf(UndefElts1) ||
          DemandedElts2.isSubsetOf(UndefElts2)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantAggregateZero::get(II.getType()));
      }

      if (MadeChange) {
        return &II;
      }
    }
    break;
  }

  case Intrinsic::x86_sse41_insertps:
    if (Value *V = simplifyX86insertps(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse4a_extrq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 16 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CILength =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
           : nullptr;
    auto *CIIndex =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
    // operand and the lowest 16-bits of the second.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse4a_extrqi: {
    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
    // bits of the lower 64-bits. The upper 64-bits are undefined.
    Value *Op0 = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));

    // Attempt to simplify to a constant or shuffle vector.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CI11 =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
    if (CI11) {
      const APInt &V11 = CI11->getValue();
      APInt Len = V11.zextOrTrunc(6);
      APInt Idx = V11.lshr(8).zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertqi: {
    // INSERTQI: Extract lowest Length bits from lower half of second source and
    // insert over first source starting at Index bit. The upper 64-bits are
    // undefined.
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 2 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));

    // Attempt to simplify to a constant or shuffle vector.
    if (CILength && CIIndex) {
      APInt Len = CILength->getValue().zextOrTrunc(6);
      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
    // operands.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }
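
  // BLENDV picks between the two sources per element based on the sign bit of
  // the mask, so with a sign-extended bool mask it is just a select,
  // e.g. (sketch):
  //   blendvps(%a, %b, sext(<4 x i1> %c)) --> select <4 x i1> %c, %b, %a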
  case Intrinsic::x86_sse41_pblendvb:
  case Intrinsic::x86_sse41_blendvps:
  case Intrinsic::x86_sse41_blendvpd:
  case Intrinsic::x86_avx_blendv_ps_256:
  case Intrinsic::x86_avx_blendv_pd_256:
  case Intrinsic::x86_avx2_pblendvb: {
    // fold (blend A, A, Mask) -> A
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Mask = II.getArgOperand(2);
    if (Op0 == Op1) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Zero Mask - select 1st argument.
    if (isa<ConstantAggregateZero>(Mask)) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
      Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
    }

    // Convert to a vector select if we can bypass casts and find a boolean
    // vector condition value.
    Value *BoolVec;
    Mask = InstCombiner::peekThroughBitcast(Mask);
    if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
        BoolVec->getType()->isVectorTy() &&
        BoolVec->getType()->getScalarSizeInBits() == 1) {
      assert(Mask->getType()->getPrimitiveSizeInBits() ==
                 II.getType()->getPrimitiveSizeInBits() &&
             "Not expecting mask and operands with different sizes");

      unsigned NumMaskElts =
          cast<FixedVectorType>(Mask->getType())->getNumElements();
      unsigned NumOperandElts =
          cast<FixedVectorType>(II.getType())->getNumElements();
      if (NumMaskElts == NumOperandElts) {
        return SelectInst::Create(BoolVec, Op1, Op0);
      }

      // If the mask has fewer elements than the operands, each mask bit maps to
      // multiple elements of the operands. Bitcast back and forth.
      if (NumMaskElts < NumOperandElts) {
        Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
        Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
        Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
        return new BitCastInst(Sel, II.getType());
      }
    }
    break;
  }
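
  // A constant PSHUFB mask can be lowered to a target-independent
  // shufflevector; mask bytes with the top bit set select zero.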
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
    if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:
    if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_maskload_ps:
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
  case Intrinsic::x86_avx_maskload_pd_256:
  case Intrinsic::x86_avx2_maskload_d:
  case Intrinsic::x86_avx2_maskload_q:
  case Intrinsic::x86_avx2_maskload_d_256:
  case Intrinsic::x86_avx2_maskload_q_256:
    if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
      return I;
    }
    break;

  case Intrinsic::x86_sse2_maskmov_dqu:
  case Intrinsic::x86_avx_maskstore_ps:
  case Intrinsic::x86_avx_maskstore_pd:
  case Intrinsic::x86_avx_maskstore_ps_256:
  case Intrinsic::x86_avx_maskstore_pd_256:
  case Intrinsic::x86_avx2_maskstore_d:
  case Intrinsic::x86_avx2_maskstore_q:
  case Intrinsic::x86_avx2_maskstore_d_256:
  case Intrinsic::x86_avx2_maskstore_q_256:
    if (simplifyX86MaskedStore(II, IC)) {
      return nullptr;
    }
    break;
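
  // ADDCARRY with a known-zero carry-in is just an unsigned add plus its
  // overflow bit, e.g. (sketch):
  //   @llvm.x86.addcarry.32(i8 0, %x, %y) --> @llvm.uadd.with.overflow.i32(%x, %y)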
  case Intrinsic::x86_addcarry_32:
  case Intrinsic::x86_addcarry_64:
    if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
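
  // The 8-bit truth-table immediate can often be rewritten as plain logic ops,
  // e.g. (sketch): pternlog(%a, %b, %c, i32 0x80) --> and %a, (and %b, %c)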
  case Intrinsic::x86_avx512_pternlog_d_128:
  case Intrinsic::x86_avx512_pternlog_d_256:
  case Intrinsic::x86_avx512_pternlog_d_512:
  case Intrinsic::x86_avx512_pternlog_q_128:
  case Intrinsic::x86_avx512_pternlog_q_256:
  case Intrinsic::x86_avx512_pternlog_q_512:
    if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }
  return std::nullopt;
}

std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
    bool &KnownBitsComputed) const {
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx2_pmovmskb: {
    // MOVMSK copies the vector elements' sign bits to the low bits
    // and zeros the high bits.
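    // e.g. (sketch) for a <4 x float> movmsk only bits 0..3 of the result can
    // be non-zero, so a user that demands only higher bits folds to zero.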
    unsigned ArgWidth;
    if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
      ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
    } else {
      auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
      ArgWidth = ArgType->getNumElements();
    }

    // If we don't need any of the low bits then return zero;
    // we know that DemandedMask is non-zero already.
    APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
    Type *VTy = II.getType();
    if (DemandedElts.isZero()) {
      return ConstantInt::getNullValue(VTy);
    }

    // We know that the upper bits are set to zero.
    Known.Zero.setBitsFrom(ArgWidth);
    KnownBitsComputed = true;
    break;
  }
  }
  return std::nullopt;
}

std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        simplifyAndSetOp) const {
  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_xop_vfrcz_ss:
  case Intrinsic::x86_xop_vfrcz_sd:
    // The instructions for these intrinsics are specified to zero the upper
    // bits rather than pass them through like other scalar intrinsics. So we
    // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for other
    // intrinsics. Instead we should return a zero vector.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return ConstantAggregateZero::get(II.getType());
    }

    // Only the lower element is used.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // Only the lower element is undefined. The high elements are zero.
    UndefElts = UndefElts[0];
    break;

  // Unary scalar-as-vector operations that work column-wise.
  case Intrinsic::x86_sse_rcp_ss:
  case Intrinsic::x86_sse_rsqrt_ss:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
    // checks).
    break;

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0. The low element is a function of both
  // operands.
  case Intrinsic::x86_sse_min_ss:
  case Intrinsic::x86_sse_max_ss:
  case Intrinsic::x86_sse_cmp_ss:
  case Intrinsic::x86_sse2_min_sd:
  case Intrinsic::x86_sse2_max_sd:
  case Intrinsic::x86_sse2_cmp_sd: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // Lower element is undefined if both lower elements are undefined.
    // Consider things like undef & 0. The result is known zero, not undef.
    if (!UndefElts2[0])
      UndefElts.clearBit(0);
    break;
  }

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element comes from operand 1.
  case Intrinsic::x86_sse41_round_ss:
  case Intrinsic::x86_sse41_round_sd: {
    // Don't use the low element of operand 0.
    APInt DemandedElts2 = DemandedElts;
    DemandedElts2.clearBit(0);
    simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // Take the high undef elements from operand 0 and take the lower element
    // from operand 1.
    UndefElts.clearBit(0);
    UndefElts |= UndefElts2[0];
    break;
  }

  // Three input scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element is a function of all
  // three inputs.
  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_max_ss_round:
  case Intrinsic::x86_avx512_mask_min_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
  case Intrinsic::x86_avx512_mask_max_sd_round:
  case Intrinsic::x86_avx512_mask_min_sd_round:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only lower element is used for operands 1 and 2.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);

    // Lower element is undefined if all three lower elements are undefined.
    // Consider things like undef & 0. The result is known zero, not undef.
    if (!UndefElts2[0] || !UndefElts3[0])
      UndefElts.clearBit(0);
    break;

  // TODO: Add fmaddsub support?
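  // For ADDSUB the even result lanes are subtractions and the odd lanes are
  // additions, e.g. (sketch) if only even lanes are demanded the whole op
  // folds to a single fsub.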
  case Intrinsic::x86_sse3_addsub_pd:
  case Intrinsic::x86_sse3_addsub_ps:
  case Intrinsic::x86_avx_addsub_pd_256:
  case Intrinsic::x86_avx_addsub_ps_256: {
    // If none of the even or none of the odd lanes are required, turn this
    // into a generic FP math instruction.
    APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
    APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
    bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
    bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
    if (IsSubOnly || IsAddOnly) {
      assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
      IRBuilderBase::InsertPointGuard Guard(IC.Builder);
      IC.Builder.SetInsertPoint(&II);
      Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
      return IC.Builder.CreateBinOp(
          IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
    }

    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

  // General per-element vector operations.
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512: {
    auto *Ty0 = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
    assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");

    unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
    unsigned VWidthPerLane = VWidth / NumLanes;
    unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;

    // Per lane, pack the elements of the first input and then the second.
    // e.g.
    // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
    // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
    for (int OpNum = 0; OpNum != 2; ++OpNum) {
      APInt OpDemandedElts(InnerVWidth, 0);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        unsigned LaneIdx = Lane * VWidthPerLane;
        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
          unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
          if (DemandedElts[Idx])
            OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
        }
      }

      // Demand elements from the operand.
      APInt OpUndefElts(InnerVWidth, 0);
      simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);

      // Pack the operand's UNDEF elements, one lane at a time.
      OpUndefElts = OpUndefElts.zext(VWidth);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
        LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
        LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
        UndefElts |= LaneElts;
      }
    }
    break;
  }
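
  // Each result element of these shuffles reads exactly one control element
  // at the same index, so the demanded result elements map directly onto
  // operand 1 (the shuffle/permute control vector).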
  // PSHUFB
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
  // PERMILVAR
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
  // PERMV
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps: {
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  }

  // SSE4A instructions leave the upper 64-bits of the 128-bit result
  // in an undefined state.
  case Intrinsic::x86_sse4a_extrq:
  case Intrinsic::x86_sse4a_extrqi:
  case Intrinsic::x86_sse4a_insertq:
  case Intrinsic::x86_sse4a_insertqi:
    UndefElts.setHighBits(VWidth / 2);
    break;
  }
  return std::nullopt;
}