llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp

   1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 /// \file
   9 /// This file implements a TargetTransformInfo analysis pass specific to the
  10 /// X86 target machine. It uses the target's detailed information to provide
  11 /// more precise answers to certain TTI queries, while letting the target
  12 /// independent and default TTI implementations handle the rest.
  13 ///
  14 //===----------------------------------------------------------------------===//
  15
  16 #include "X86TargetTransformInfo.h"
  17 #include "llvm/IR/IntrinsicInst.h"
  18 #include "llvm/IR/IntrinsicsX86.h"
  19 #include "llvm/Support/KnownBits.h"
  20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
  21 #include <optional>
  22
  23 using namespace llvm;
  24 using namespace llvm::PatternMatch;
  25
  26 #define DEBUG_TYPE "x86tti"
  27
  28 /// Return a constant boolean vector that has true elements in all positions
  29 /// where the input constant data vector has an element with the sign bit set.
  30 static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
  31   VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  32   V = ConstantExpr::getBitCast(V, IntTy);
  33   V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,
  34                                       Constant::getNullValue(IntTy), V, DL);
  35   assert(V && "Vector must be foldable");
  36   return V;
  37 }
  38
  39 /// Convert the x86 XMM integer vector mask to a vector of bools based on
  40 /// each element's most significant bit (the sign bit).
  41 static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
  42   // Fold Constant Mask.
  43   if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
  44     return getNegativeIsTrueBoolVec(ConstantMask, DL);
  45
  46   // Mask was extended from a boolean vector.
  47   Value *ExtMask;
  48   if (match(Mask, m_SExt(m_Value(ExtMask))) &&
  49       ExtMask->getType()->isIntOrIntVectorTy(1))
  50     return ExtMask;
  51
  52   return nullptr;
  53 }
  54
  55 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
  56 // XMM register mask efficiently, we could transform all x86 masked intrinsics
  57 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
  58 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  59   Value *Ptr = II.getOperand(0);
  60   Value *Mask = II.getOperand(1);
  61   Constant *ZeroVec = Constant::getNullValue(II.getType());
  62
  63   // Zero Mask - masked load instruction creates a zero vector.
  64   if (isa<ConstantAggregateZero>(Mask))
  65     return IC.replaceInstUsesWith(II, ZeroVec);
  66
  67   // The mask is constant or extended from a bool vector. Convert this x86
  68   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  69   if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
  70     // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
  71     // the LLVM intrinsic definition for the pointer argument.
  72     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  73     PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
  74     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
  75
  76     // The pass-through vector for an x86 masked load is a zero vector.
  77     CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
  78         II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
  79     return IC.replaceInstUsesWith(II, NewMaskedLoad);
  80   }
  81
  82   return nullptr;
  83 }
  84
  85 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
  86 // XMM register mask efficiently, we could transform all x86 masked intrinsics
  87 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
  88 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  89   Value *Ptr = II.getOperand(0);
  90   Value *Mask = II.getOperand(1);
  91   Value *Vec = II.getOperand(2);
  92
  93   // Zero Mask - this masked store instruction does nothing.
  94   if (isa<ConstantAggregateZero>(Mask)) {
  95     IC.eraseInstFromFunction(II);
  96     return true;
  97   }
  98
  99   // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
 100   // anything else at this level.
 101   if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
 102     return false;
 103
 104   // The mask is constant or extended from a bool vector. Convert this x86
 105   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
 106   if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
 107     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
 108     PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
 109     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
 110
 111     IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
 112
 113     // 'Replace uses' doesn't work for stores. Erase the original masked store.
 114     IC.eraseInstFromFunction(II);
 115     return true;
 116   }
 117
 118   return false;
 119 }
 120
 121 static Value *simplifyX86immShift(const IntrinsicInst &II,
 122                                   InstCombiner::BuilderTy &Builder) {
 123   bool LogicalShift = false;
 124   bool ShiftLeft = false;
 125   bool IsImm = false;
 126
 127   switch (II.getIntrinsicID()) {
 128   default:
 129     llvm_unreachable("Unexpected intrinsic!");
 130   case Intrinsic::x86_sse2_psrai_d:
 131   case Intrinsic::x86_sse2_psrai_w:
 132   case Intrinsic::x86_avx2_psrai_d:
 133   case Intrinsic::x86_avx2_psrai_w:
 134   case Intrinsic::x86_avx512_psrai_q_128:
 135   case Intrinsic::x86_avx512_psrai_q_256:
 136   case Intrinsic::x86_avx512_psrai_d_512:
 137   case Intrinsic::x86_avx512_psrai_q_512:
 138   case Intrinsic::x86_avx512_psrai_w_512:
 139     IsImm = true;
 140     [[fallthrough]];
 141   case Intrinsic::x86_sse2_psra_d:
 142   case Intrinsic::x86_sse2_psra_w:
 143   case Intrinsic::x86_avx2_psra_d:
 144   case Intrinsic::x86_avx2_psra_w:
 145   case Intrinsic::x86_avx512_psra_q_128:
 146   case Intrinsic::x86_avx512_psra_q_256:
 147   case Intrinsic::x86_avx512_psra_d_512:
 148   case Intrinsic::x86_avx512_psra_q_512:
 149   case Intrinsic::x86_avx512_psra_w_512:
 150     LogicalShift = false;
 151     ShiftLeft = false;
 152     break;
 153   case Intrinsic::x86_sse2_psrli_d:
 154   case Intrinsic::x86_sse2_psrli_q:
 155   case Intrinsic::x86_sse2_psrli_w:
 156   case Intrinsic::x86_avx2_psrli_d:
 157   case Intrinsic::x86_avx2_psrli_q:
 158   case Intrinsic::x86_avx2_psrli_w:
 159   case Intrinsic::x86_avx512_psrli_d_512:
 160   case Intrinsic::x86_avx512_psrli_q_512:
 161   case Intrinsic::x86_avx512_psrli_w_512:
 162     IsImm = true;
 163     [[fallthrough]];
 164   case Intrinsic::x86_sse2_psrl_d:
 165   case Intrinsic::x86_sse2_psrl_q:
 166   case Intrinsic::x86_sse2_psrl_w:
 167   case Intrinsic::x86_avx2_psrl_d:
 168   case Intrinsic::x86_avx2_psrl_q:
 169   case Intrinsic::x86_avx2_psrl_w:
 170   case Intrinsic::x86_avx512_psrl_d_512:
 171   case Intrinsic::x86_avx512_psrl_q_512:
 172   case Intrinsic::x86_avx512_psrl_w_512:
 173     LogicalShift = true;
 174     ShiftLeft = false;
 175     break;
 176   case Intrinsic::x86_sse2_pslli_d:
 177   case Intrinsic::x86_sse2_pslli_q:
 178   case Intrinsic::x86_sse2_pslli_w:
 179   case Intrinsic::x86_avx2_pslli_d:
 180   case Intrinsic::x86_avx2_pslli_q:
 181   case Intrinsic::x86_avx2_pslli_w:
 182   case Intrinsic::x86_avx512_pslli_d_512:
 183   case Intrinsic::x86_avx512_pslli_q_512:
 184   case Intrinsic::x86_avx512_pslli_w_512:
 185     IsImm = true;
 186     [[fallthrough]];
 187   case Intrinsic::x86_sse2_psll_d:
 188   case Intrinsic::x86_sse2_psll_q:
 189   case Intrinsic::x86_sse2_psll_w:
 190   case Intrinsic::x86_avx2_psll_d:
 191   case Intrinsic::x86_avx2_psll_q:
 192   case Intrinsic::x86_avx2_psll_w:
 193   case Intrinsic::x86_avx512_psll_d_512:
 194   case Intrinsic::x86_avx512_psll_q_512:
 195   case Intrinsic::x86_avx512_psll_w_512:
 196     LogicalShift = true;
 197     ShiftLeft = true;
 198     break;
 199   }
 200   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
 201
 202   Value *Vec = II.getArgOperand(0);
 203   Value *Amt = II.getArgOperand(1);
 204   auto *VT = cast<FixedVectorType>(Vec->getType());
 205   Type *SVT = VT->getElementType();
 206   Type *AmtVT = Amt->getType();
 207   unsigned VWidth = VT->getNumElements();
 208   unsigned BitWidth = SVT->getPrimitiveSizeInBits();
 209
 210   // If the shift amount is guaranteed to be in-range we can replace it with a
 211   // generic shift. If its guaranteed to be out of range, logical shifts combine
 212   // to zero and arithmetic shifts are clamped to (BitWidth - 1).
 213   if (IsImm) {
 214     assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
 215     KnownBits KnownAmtBits =
 216         llvm::computeKnownBits(Amt, II.getDataLayout());
 217     if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
 218       Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
 219       Amt = Builder.CreateVectorSplat(VWidth, Amt);
 220       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
 221                                         : Builder.CreateLShr(Vec, Amt))
 222                            : Builder.CreateAShr(Vec, Amt));
 223     }
 224     if (KnownAmtBits.getMinValue().uge(BitWidth)) {
 225       if (LogicalShift)
 226         return ConstantAggregateZero::get(VT);
 227       Amt = ConstantInt::get(SVT, BitWidth - 1);
 228       return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
 229     }
 230   } else {
 231     // Ensure the first element has an in-range value and the rest of the
 232     // elements in the bottom 64 bits are zero.
 233     assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
 234            cast<VectorType>(AmtVT)->getElementType() == SVT &&
 235            "Unexpected shift-by-scalar type");
 236     unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
 237     APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
 238     APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
 239     KnownBits KnownLowerBits = llvm::computeKnownBits(
 240         Amt, DemandedLower, II.getDataLayout());
 241     KnownBits KnownUpperBits = llvm::computeKnownBits(
 242         Amt, DemandedUpper, II.getDataLayout());
 243     if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
 244         (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
 245       SmallVector<int, 16> ZeroSplat(VWidth, 0);
 246       Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
 247       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
 248                                         : Builder.CreateLShr(Vec, Amt))
 249                            : Builder.CreateAShr(Vec, Amt));
 250     }
 251   }
 252
 253   // Simplify if count is constant vector.
 254   auto *CDV = dyn_cast<ConstantDataVector>(Amt);
 255   if (!CDV)
 256     return nullptr;
 257
 258   // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
 259   // operand to compute the shift amount.
 260   assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
 261          cast<VectorType>(AmtVT)->getElementType() == SVT &&
 262          "Unexpected shift-by-scalar type");
 263
 264   // Concatenate the sub-elements to create the 64-bit value.
 265   APInt Count(64, 0);
 266   for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
 267     unsigned SubEltIdx = (NumSubElts - 1) - i;
 268     auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
 269     Count <<= BitWidth;
 270     Count |= SubElt->getValue().zextOrTrunc(64);
 271   }
 272
 273   // If shift-by-zero then just return the original value.
 274   if (Count.isZero())
 275     return Vec;
 276
 277   // Handle cases when Shift >= BitWidth.
 278   if (Count.uge(BitWidth)) {
 279     // If LogicalShift - just return zero.
 280     if (LogicalShift)
 281       return ConstantAggregateZero::get(VT);
 282
 283     // If ArithmeticShift - clamp Shift to (BitWidth - 1).
 284     Count = APInt(64, BitWidth - 1);
 285   }
 286
 287   // Get a constant vector of the same type as the first operand.
 288   auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
 289   auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
 290
 291   if (ShiftLeft)
 292     return Builder.CreateShl(Vec, ShiftVec);
 293
 294   if (LogicalShift)
 295     return Builder.CreateLShr(Vec, ShiftVec);
 296
 297   return Builder.CreateAShr(Vec, ShiftVec);
 298 }
 299
 300 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
 301 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
 302 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
 303 static Value *simplifyX86varShift(const IntrinsicInst &II,
 304                                   InstCombiner::BuilderTy &Builder) {
 305   bool LogicalShift = false;
 306   bool ShiftLeft = false;
 307
 308   switch (II.getIntrinsicID()) {
 309   default:
 310     llvm_unreachable("Unexpected intrinsic!");
 311   case Intrinsic::x86_avx2_psrav_d:
 312   case Intrinsic::x86_avx2_psrav_d_256:
 313   case Intrinsic::x86_avx512_psrav_q_128:
 314   case Intrinsic::x86_avx512_psrav_q_256:
 315   case Intrinsic::x86_avx512_psrav_d_512:
 316   case Intrinsic::x86_avx512_psrav_q_512:
 317   case Intrinsic::x86_avx512_psrav_w_128:
 318   case Intrinsic::x86_avx512_psrav_w_256:
 319   case Intrinsic::x86_avx512_psrav_w_512:
 320     LogicalShift = false;
 321     ShiftLeft = false;
 322     break;
 323   case Intrinsic::x86_avx2_psrlv_d:
 324   case Intrinsic::x86_avx2_psrlv_d_256:
 325   case Intrinsic::x86_avx2_psrlv_q:
 326   case Intrinsic::x86_avx2_psrlv_q_256:
 327   case Intrinsic::x86_avx512_psrlv_d_512:
 328   case Intrinsic::x86_avx512_psrlv_q_512:
 329   case Intrinsic::x86_avx512_psrlv_w_128:
 330   case Intrinsic::x86_avx512_psrlv_w_256:
 331   case Intrinsic::x86_avx512_psrlv_w_512:
 332     LogicalShift = true;
 333     ShiftLeft = false;
 334     break;
 335   case Intrinsic::x86_avx2_psllv_d:
 336   case Intrinsic::x86_avx2_psllv_d_256:
 337   case Intrinsic::x86_avx2_psllv_q:
 338   case Intrinsic::x86_avx2_psllv_q_256:
 339   case Intrinsic::x86_avx512_psllv_d_512:
 340   case Intrinsic::x86_avx512_psllv_q_512:
 341   case Intrinsic::x86_avx512_psllv_w_128:
 342   case Intrinsic::x86_avx512_psllv_w_256:
 343   case Intrinsic::x86_avx512_psllv_w_512:
 344     LogicalShift = true;
 345     ShiftLeft = true;
 346     break;
 347   }
 348   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
 349
 350   Value *Vec = II.getArgOperand(0);
 351   Value *Amt = II.getArgOperand(1);
 352   auto *VT = cast<FixedVectorType>(II.getType());
 353   Type *SVT = VT->getElementType();
 354   int NumElts = VT->getNumElements();
 355   int BitWidth = SVT->getIntegerBitWidth();
 356
 357   // If the shift amount is guaranteed to be in-range we can replace it with a
 358   // generic shift.
 359   KnownBits KnownAmt =
 360       llvm::computeKnownBits(Amt, II.getDataLayout());
 361   if (KnownAmt.getMaxValue().ult(BitWidth)) {
 362     return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
 363                                       : Builder.CreateLShr(Vec, Amt))
 364                          : Builder.CreateAShr(Vec, Amt));
 365   }
 366
 367   // Simplify if all shift amounts are constant/undef.
 368   auto *CShift = dyn_cast<Constant>(Amt);
 369   if (!CShift)
 370     return nullptr;
 371
 372   // Collect each element's shift amount.
 373   // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
 374   bool AnyOutOfRange = false;
 375   SmallVector<int, 8> ShiftAmts;
 376   for (int I = 0; I < NumElts; ++I) {
 377     auto *CElt = CShift->getAggregateElement(I);
 378     if (isa_and_nonnull<UndefValue>(CElt)) {
 379       ShiftAmts.push_back(-1);
 380       continue;
 381     }
 382
 383     auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
 384     if (!COp)
 385       return nullptr;
 386
 387     // Handle out of range shifts.
 388     // If LogicalShift - set to BitWidth (special case).
 389     // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
 390     APInt ShiftVal = COp->getValue();
 391     if (ShiftVal.uge(BitWidth)) {
 392       AnyOutOfRange = LogicalShift;
 393       ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
 394       continue;
 395     }
 396
 397     ShiftAmts.push_back((int)ShiftVal.getZExtValue());
 398   }
 399
 400   // If all elements out of range or UNDEF, return vector of zeros/undefs.
 401   // ArithmeticShift should only hit this if they are all UNDEF.
 402   auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
 403   if (llvm::all_of(ShiftAmts, OutOfRange)) {
 404     SmallVector<Constant *, 8> ConstantVec;
 405     for (int Idx : ShiftAmts) {
 406       if (Idx < 0) {
 407         ConstantVec.push_back(UndefValue::get(SVT));
 408       } else {
 409         assert(LogicalShift && "Logical shift expected");
 410         ConstantVec.push_back(ConstantInt::getNullValue(SVT));
 411       }
 412     }
 413     return ConstantVector::get(ConstantVec);
 414   }
 415
 416   // We can't handle only some out of range values with generic logical shifts.
 417   if (AnyOutOfRange)
 418     return nullptr;
 419
 420   // Build the shift amount constant vector.
 421   SmallVector<Constant *, 8> ShiftVecAmts;
 422   for (int Idx : ShiftAmts) {
 423     if (Idx < 0)
 424       ShiftVecAmts.push_back(UndefValue::get(SVT));
 425     else
 426       ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
 427   }
 428   auto ShiftVec = ConstantVector::get(ShiftVecAmts);
 429
 430   if (ShiftLeft)
 431     return Builder.CreateShl(Vec, ShiftVec);
 432
 433   if (LogicalShift)
 434     return Builder.CreateLShr(Vec, ShiftVec);
 435
 436   return Builder.CreateAShr(Vec, ShiftVec);
 437 }
 438
 439 static Value *simplifyX86pack(IntrinsicInst &II,
 440                               InstCombiner::BuilderTy &Builder, bool IsSigned) {
 441   Value *Arg0 = II.getArgOperand(0);
 442   Value *Arg1 = II.getArgOperand(1);
 443   Type *ResTy = II.getType();
 444
 445   // Fast all undef handling.
 446   if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
 447     return UndefValue::get(ResTy);
 448
 449   auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
 450   unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
 451   unsigned NumSrcElts = ArgTy->getNumElements();
 452   assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
 453          "Unexpected packing types");
 454
 455   unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
 456   unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
 457   unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
 458   assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
 459          "Unexpected packing types");
 460
 461   // Constant folding.
 462   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
 463     return nullptr;
 464
 465   // Clamp Values - signed/unsigned both use signed clamp values, but they
 466   // differ on the min/max values.
 467   APInt MinValue, MaxValue;
 468   if (IsSigned) {
 469     // PACKSS: Truncate signed value with signed saturation.
 470     // Source values less than dst minint are saturated to minint.
 471     // Source values greater than dst maxint are saturated to maxint.
 472     MinValue =
 473         APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
 474     MaxValue =
 475         APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
 476   } else {
 477     // PACKUS: Truncate signed value with unsigned saturation.
 478     // Source values less than zero are saturated to zero.
 479     // Source values greater than dst maxuint are saturated to maxuint.
 480     MinValue = APInt::getZero(SrcScalarSizeInBits);
 481     MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
 482   }
 483
 484   auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
 485   auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
 486   Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
 487   Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
 488   Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
 489   Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
 490
 491   // Shuffle clamped args together at the lane level.
 492   SmallVector<int, 32> PackMask;
 493   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
 494     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
 495       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
 496     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
 497       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
 498   }
 499   auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
 500
 501   // Truncate to dst size.
 502   return Builder.CreateTrunc(Shuffle, ResTy);
 503 }
 504
 505 static Value *simplifyX86pmulh(IntrinsicInst &II,
 506                                InstCombiner::BuilderTy &Builder, bool IsSigned,
 507                                bool IsRounding) {
 508   Value *Arg0 = II.getArgOperand(0);
 509   Value *Arg1 = II.getArgOperand(1);
 510   auto *ResTy = cast<FixedVectorType>(II.getType());
 511   auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
 512   assert(ArgTy == ResTy && ResTy->getScalarSizeInBits() == 16 &&
 513          "Unexpected PMULH types");
 514   assert((!IsRounding || IsSigned) && "PMULHRS instruction must be signed");
 515
 516   // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
 517   if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
 518     return ConstantAggregateZero::get(ResTy);
 519
 520   // Multiply by zero.
 521   if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
 522     return ConstantAggregateZero::get(ResTy);
 523
 524   // Multiply by one.
 525   if (!IsRounding) {
 526     if (match(Arg0, m_One()))
 527       return IsSigned ? Builder.CreateAShr(Arg1, 15)
 528                       : ConstantAggregateZero::get(ResTy);
 529     if (match(Arg1, m_One()))
 530       return IsSigned ? Builder.CreateAShr(Arg0, 15)
 531                       : ConstantAggregateZero::get(ResTy);
 532   }
 533
 534   // Constant folding.
 535   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
 536     return nullptr;
 537
 538   // Extend to twice the width and multiply.
 539   auto Cast =
 540       IsSigned ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
 541   auto *ExtTy = FixedVectorType::getExtendedElementVectorType(ArgTy);
 542   Value *LHS = Builder.CreateCast(Cast, Arg0, ExtTy);
 543   Value *RHS = Builder.CreateCast(Cast, Arg1, ExtTy);
 544   Value *Mul = Builder.CreateMul(LHS, RHS);
 545
 546   if (IsRounding) {
 547     // PMULHRSW: truncate to vXi18 of the most significant bits, add one and
 548     // extract bits[16:1].
 549     auto *RndEltTy = IntegerType::get(ExtTy->getContext(), 18);
 550     auto *RndTy = FixedVectorType::get(RndEltTy, ExtTy);
 551     Mul = Builder.CreateLShr(Mul, 14);
 552     Mul = Builder.CreateTrunc(Mul, RndTy);
 553     Mul = Builder.CreateAdd(Mul, ConstantInt::get(RndTy, 1));
 554     Mul = Builder.CreateLShr(Mul, 1);
 555   } else {
 556     // PMULH/PMULHU: extract the vXi16 most significant bits.
 557     Mul = Builder.CreateLShr(Mul, 16);
 558   }
 559
 560   return Builder.CreateTrunc(Mul, ResTy);
 561 }
 562
 563 static Value *simplifyX86pmadd(IntrinsicInst &II,
 564                                InstCombiner::BuilderTy &Builder,
 565                                bool IsPMADDWD) {
 566   Value *Arg0 = II.getArgOperand(0);
 567   Value *Arg1 = II.getArgOperand(1);
 568   auto *ResTy = cast<FixedVectorType>(II.getType());
 569   [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
 570
 571   unsigned NumDstElts = ResTy->getNumElements();
 572   assert(ArgTy->getNumElements() == (2 * NumDstElts) &&
 573          ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) &&
 574          "Unexpected PMADD types");
 575
 576   // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
 577   if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
 578     return ConstantAggregateZero::get(ResTy);
 579
 580   // Multiply by zero.
 581   if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
 582     return ConstantAggregateZero::get(ResTy);
 583
 584   // Constant folding.
 585   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
 586     return nullptr;
 587
 588   // Split Lo/Hi elements pairs, extend and add together.
 589   // PMADDWD(X,Y) =
 590   // add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1])))
 591   // PMADDUBSW(X,Y) =
 592   // sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1])))
 593   SmallVector<int> LoMask, HiMask;
 594   for (unsigned I = 0; I != NumDstElts; ++I) {
 595     LoMask.push_back(2 * I + 0);
 596     HiMask.push_back(2 * I + 1);
 597   }
 598
 599   auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask);
 600   auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask);
 601   auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask);
 602   auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask);
 603
 604   auto LHSCast =
 605       IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
 606   LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy);
 607   LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy);
 608   RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy);
 609   RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy);
 610   Value *Lo = Builder.CreateMul(LHSLo, RHSLo);
 611   Value *Hi = Builder.CreateMul(LHSHi, RHSHi);
 612   return IsPMADDWD
 613              ? Builder.CreateAdd(Lo, Hi)
 614              : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});
 615 }
 616
 617 static Value *simplifyX86movmsk(const IntrinsicInst &II,
 618                                 InstCombiner::BuilderTy &Builder) {
 619   Value *Arg = II.getArgOperand(0);
 620   Type *ResTy = II.getType();
 621
 622   // movmsk(undef) -> zero as we must ensure the upper bits are zero.
 623   if (isa<UndefValue>(Arg))
 624     return Constant::getNullValue(ResTy);
 625
 626   // Preserve previous behavior and give up.
 627   // TODO: treat as <8 x i8>.
 628   if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb)
 629     return nullptr;
 630
 631   auto *ArgTy = cast<FixedVectorType>(Arg->getType());
 632
 633   // Expand MOVMSK to compare/bitcast/zext:
 634   // e.g. PMOVMSKB(v16i8 x):
 635   // %cmp = icmp slt <16 x i8> %x, zeroinitializer
 636   // %int = bitcast <16 x i1> %cmp to i16
 637   // %res = zext i16 %int to i32
 638   unsigned NumElts = ArgTy->getNumElements();
 639   Type *IntegerTy = Builder.getIntNTy(NumElts);
 640
 641   Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
 642   Res = Builder.CreateIsNeg(Res);
 643   Res = Builder.CreateBitCast(Res, IntegerTy);
 644   Res = Builder.CreateZExtOrTrunc(Res, ResTy);
 645   return Res;
 646 }
 647
 648 static Value *simplifyX86addcarry(const IntrinsicInst &II,
 649                                   InstCombiner::BuilderTy &Builder) {
 650   Value *CarryIn = II.getArgOperand(0);
 651   Value *Op1 = II.getArgOperand(1);
 652   Value *Op2 = II.getArgOperand(2);
 653   Type *RetTy = II.getType();
 654   Type *OpTy = Op1->getType();
 655   assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
 656          RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
 657          "Unexpected types for x86 addcarry");
 658
 659   // If carry-in is zero, this is just an unsigned add with overflow.
 660   if (match(CarryIn, m_ZeroInt())) {
 661     Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
 662                                           {Op1, Op2});
 663     // The types have to be adjusted to match the x86 call types.
 664     Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
 665     Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
 666                                        Builder.getInt8Ty());
 667     Value *Res = PoisonValue::get(RetTy);
 668     Res = Builder.CreateInsertValue(Res, UAddOV, 0);
 669     return Builder.CreateInsertValue(Res, UAddResult, 1);
 670   }
 671
 672   return nullptr;
 673 }
 674
 675 static Value *simplifyTernarylogic(const IntrinsicInst &II,
 676                                    InstCombiner::BuilderTy &Builder) {
 677
 678   auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
 679   if (!ArgImm || ArgImm->getValue().uge(256))
 680     return nullptr;
 681
 682   Value *ArgA = II.getArgOperand(0);
 683   Value *ArgB = II.getArgOperand(1);
 684   Value *ArgC = II.getArgOperand(2);
 685
 686   Type *Ty = II.getType();
 687
 688   auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
 689     return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
 690   };
 691   auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
 692     return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
 693   };
 694   auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
 695     return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
 696   };
 697   auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
 698     return {Builder.CreateNot(V.first), ~V.second};
 699   };
 700   auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
 701   auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
 702   auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };
 703
 704   bool AIsConst = match(ArgA, m_ImmConstant());
 705   bool BIsConst = match(ArgB, m_ImmConstant());
 706   bool CIsConst = match(ArgC, m_ImmConstant());
 707
 708   bool ABIsConst = AIsConst && BIsConst;
 709   bool ACIsConst = AIsConst && CIsConst;
 710   bool BCIsConst = BIsConst && CIsConst;
 711   bool ABCIsConst = AIsConst && BIsConst && CIsConst;
 712
 713   // Use for verification. Its a big table. Its difficult to go from Imm ->
 714   // logic ops, but easy to verify that a set of logic ops is correct. We track
 715   // the logic ops through the second value in the pair. At the end it should
 716   // equal Imm.
 717   std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
 718   std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
 719   std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
 720   std::pair<Value *, uint8_t> Res = {nullptr, 0};
 721
 722   // Currently we only handle cases that convert directly to another instruction
 723   // or cases where all the ops are constant.  This is because we don't properly
 724   // handle creating ternary ops in the backend, so splitting them here may
 725   // cause regressions. As the backend improves, uncomment more cases.
 726
 727   uint8_t Imm = ArgImm->getValue().getZExtValue();
 728   switch (Imm) {
 729   case 0x0:
 730     Res = {Constant::getNullValue(Ty), 0};
 731     break;
 732   case 0x1:
 733     if (ABCIsConst)
 734       Res = Nor(Or(A, B), C);
 735     break;
 736   case 0x2:
 737     if (ABCIsConst)
 738       Res = And(Nor(A, B), C);
 739     break;
 740   case 0x3:
 741     if (ABIsConst)
 742       Res = Nor(A, B);
 743     break;
 744   case 0x4:
 745     if (ABCIsConst)
 746       Res = And(Nor(A, C), B);
 747     break;
 748   case 0x5:
 749     if (ACIsConst)
 750       Res = Nor(A, C);
 751     break;
 752   case 0x6:
 753     if (ABCIsConst)
 754       Res = Nor(A, Xnor(B, C));
 755     break;
 756   case 0x7:
 757     if (ABCIsConst)
 758       Res = Nor(A, And(B, C));
 759     break;
 760   case 0x8:
 761     if (ABCIsConst)
 762       Res = Nor(A, Nand(B, C));
 763     break;
 764   case 0x9:
 765     if (ABCIsConst)
 766       Res = Nor(A, Xor(B, C));
 767     break;
 768   case 0xa:
 769     if (ACIsConst)
 770       Res = Nor(A, Not(C));
 771     break;
 772   case 0xb:
 773     if (ABCIsConst)
 774       Res = Nor(A, Nor(C, Not(B)));
 775     break;
 776   case 0xc:
 777     if (ABIsConst)
 778       Res = Nor(A, Not(B));
 779     break;
 780   case 0xd:
 781     if (ABCIsConst)
 782       Res = Nor(A, Nor(B, Not(C)));
 783     break;
 784   case 0xe:
 785     if (ABCIsConst)
 786       Res = Nor(A, Nor(B, C));
 787     break;
 788   case 0xf:
 789     Res = Not(A);
 790     break;
 791   case 0x10:
 792     if (ABCIsConst)
 793       Res = And(A, Nor(B, C));
 794     break;
 795   case 0x11:
 796     if (BCIsConst)
 797       Res = Nor(B, C);
 798     break;
 799   case 0x12:
 800     if (ABCIsConst)
 801       Res = Nor(Xnor(A, C), B);
 802     break;
 803   case 0x13:
 804     if (ABCIsConst)
 805       Res = Nor(And(A, C), B);
 806     break;
 807   case 0x14:
 808     if (ABCIsConst)
 809       Res = Nor(Xnor(A, B), C);
 810     break;
 811   case 0x15:
 812     if (ABCIsConst)
 813       Res = Nor(And(A, B), C);
 814     break;
 815   case 0x16:
 816     if (ABCIsConst)
 817       Res = Xor(Xor(A, B), And(Nand(A, B), C));
 818     break;
 819   case 0x17:
 820     if (ABCIsConst)
 821       Res = Xor(Or(A, B), Or(Xnor(A, B), C));
 822     break;
 823   case 0x18:
 824     if (ABCIsConst)
 825       Res = Nor(Xnor(A, B), Xnor(A, C));
 826     break;
 827   case 0x19:
 828     if (ABCIsConst)
 829       Res = And(Nand(A, B), Xnor(B, C));
 830     break;
 831   case 0x1a:
 832     if (ABCIsConst)
 833       Res = Xor(A, Or(And(A, B), C));
 834     break;
 835   case 0x1b:
 836     if (ABCIsConst)
 837       Res = Xor(A, Or(Xnor(A, B), C));
 838     break;
 839   case 0x1c:
 840     if (ABCIsConst)
 841       Res = Xor(A, Or(And(A, C), B));
 842     break;
 843   case 0x1d:
 844     if (ABCIsConst)
 845       Res = Xor(A, Or(Xnor(A, C), B));
 846     break;
 847   case 0x1e:
 848     if (ABCIsConst)
 849       Res = Xor(A, Or(B, C));
 850     break;
 851   case 0x1f:
 852     if (ABCIsConst)
 853       Res = Nand(A, Or(B, C));
 854     break;
 855   case 0x20:
 856     if (ABCIsConst)
 857       Res = Nor(Nand(A, C), B);
 858     break;
 859   case 0x21:
 860     if (ABCIsConst)
 861       Res = Nor(Xor(A, C), B);
 862     break;
 863   case 0x22:
 864     if (BCIsConst)
 865       Res = Nor(B, Not(C));
 866     break;
 867   case 0x23:
 868     if (ABCIsConst)
 869       Res = Nor(B, Nor(C, Not(A)));
 870     break;
 871   case 0x24:
 872     if (ABCIsConst)
 873       Res = Nor(Xnor(A, B), Xor(A, C));
 874     break;
 875   case 0x25:
 876     if (ABCIsConst)
 877       Res = Xor(A, Nand(Nand(A, B), C));
 878     break;
 879   case 0x26:
 880     if (ABCIsConst)
 881       Res = And(Nand(A, B), Xor(B, C));
 882     break;
 883   case 0x27:
 884     if (ABCIsConst)
 885       Res = Xor(Or(Xnor(A, B), C), B);
 886     break;
 887   case 0x28:
 888     if (ABCIsConst)
 889       Res = And(Xor(A, B), C);
 890     break;
 891   case 0x29:
 892     if (ABCIsConst)
 893       Res = Xor(Xor(A, B), Nor(And(A, B), C));
 894     break;
 895   case 0x2a:
 896     if (ABCIsConst)
 897       Res = And(Nand(A, B), C);
 898     break;
 899   case 0x2b:
 900     if (ABCIsConst)
 901       Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
 902     break;
 903   case 0x2c:
 904     if (ABCIsConst)
 905       Res = Nor(Xnor(A, B), Nor(B, C));
 906     break;
 907   case 0x2d:
 908     if (ABCIsConst)
 909       Res = Xor(A, Or(B, Not(C)));
 910     break;
 911   case 0x2e:
 912     if (ABCIsConst)
 913       Res = Xor(A, Or(Xor(A, C), B));
 914     break;
 915   case 0x2f:
 916     if (ABCIsConst)
 917       Res = Nand(A, Or(B, Not(C)));
 918     break;
 919   case 0x30:
 920     if (ABIsConst)
 921       Res = Nor(B, Not(A));
 922     break;
 923   case 0x31:
 924     if (ABCIsConst)
 925       Res = Nor(Nor(A, Not(C)), B);
 926     break;
 927   case 0x32:
 928     if (ABCIsConst)
 929       Res = Nor(Nor(A, C), B);
 930     break;
 931   case 0x33:
 932     Res = Not(B);
 933     break;
 934   case 0x34:
 935     if (ABCIsConst)
 936       Res = And(Xor(A, B), Nand(B, C));
 937     break;
 938   case 0x35:
 939     if (ABCIsConst)
 940       Res = Xor(B, Or(A, Xnor(B, C)));
 941     break;
 942   case 0x36:
 943     if (ABCIsConst)
 944       Res = Xor(Or(A, C), B);
 945     break;
 946   case 0x37:
 947     if (ABCIsConst)
 948       Res = Nand(Or(A, C), B);
 949     break;
 950   case 0x38:
 951     if (ABCIsConst)
 952       Res = Nor(Xnor(A, B), Nor(A, C));
 953     break;
 954   case 0x39:
 955     if (ABCIsConst)
 956       Res = Xor(Or(A, Not(C)), B);
 957     break;
 958   case 0x3a:
 959     if (ABCIsConst)
 960       Res = Xor(B, Or(A, Xor(B, C)));
 961     break;
 962   case 0x3b:
 963     if (ABCIsConst)
 964       Res = Nand(Or(A, Not(C)), B);
 965     break;
 966   case 0x3c:
 967     Res = Xor(A, B);
 968     break;
 969   case 0x3d:
 970     if (ABCIsConst)
 971       Res = Xor(A, Or(Nor(A, C), B));
 972     break;
 973   case 0x3e:
 974     if (ABCIsConst)
 975       Res = Xor(A, Or(Nor(A, Not(C)), B));
 976     break;
 977   case 0x3f:
 978     if (ABIsConst)
 979       Res = Nand(A, B);
 980     break;
 981   case 0x40:
 982     if (ABCIsConst)
 983       Res = Nor(Nand(A, B), C);
 984     break;
 985   case 0x41:
 986     if (ABCIsConst)
 987       Res = Nor(Xor(A, B), C);
 988     break;
 989   case 0x42:
 990     if (ABCIsConst)
 991       Res = Nor(Xor(A, B), Xnor(A, C));
 992     break;
 993   case 0x43:
 994     if (ABCIsConst)
 995       Res = Xor(A, Nand(Nand(A, C), B));
 996     break;
 997   case 0x44:
 998     if (BCIsConst)
 999       Res = Nor(C, Not(B));
1000     break;
1001   case 0x45:
1002     if (ABCIsConst)
1003       Res = Nor(Nor(B, Not(A)), C);
1004     break;
1005   case 0x46:
1006     if (ABCIsConst)
1007       Res = Xor(Or(And(A, C), B), C);
1008     break;
1009   case 0x47:
1010     if (ABCIsConst)
1011       Res = Xor(Or(Xnor(A, C), B), C);
1012     break;
1013   case 0x48:
1014     if (ABCIsConst)
1015       Res = And(Xor(A, C), B);
1016     break;
1017   case 0x49:
1018     if (ABCIsConst)
1019       Res = Xor(Or(Xnor(A, B), And(A, C)), C);
1020     break;
1021   case 0x4a:
1022     if (ABCIsConst)
1023       Res = Nor(Xnor(A, C), Nor(B, C));
1024     break;
1025   case 0x4b:
1026     if (ABCIsConst)
1027       Res = Xor(A, Or(C, Not(B)));
1028     break;
1029   case 0x4c:
1030     if (ABCIsConst)
1031       Res = And(Nand(A, C), B);
1032     break;
1033   case 0x4d:
1034     if (ABCIsConst)
1035       Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
1036     break;
1037   case 0x4e:
1038     if (ABCIsConst)
1039       Res = Xor(A, Or(Xor(A, B), C));
1040     break;
1041   case 0x4f:
1042     if (ABCIsConst)
1043       Res = Nand(A, Nand(B, Not(C)));
1044     break;
1045   case 0x50:
1046     if (ACIsConst)
1047       Res = Nor(C, Not(A));
1048     break;
1049   case 0x51:
1050     if (ABCIsConst)
1051       Res = Nor(Nor(A, Not(B)), C);
1052     break;
1053   case 0x52:
1054     if (ABCIsConst)
1055       Res = And(Xor(A, C), Nand(B, C));
1056     break;
1057   case 0x53:
1058     if (ABCIsConst)
1059       Res = Xor(Or(Xnor(B, C), A), C);
1060     break;
1061   case 0x54:
1062     if (ABCIsConst)
1063       Res = Nor(Nor(A, B), C);
1064     break;
1065   case 0x55:
1066     Res = Not(C);
1067     break;
1068   case 0x56:
1069     if (ABCIsConst)
1070       Res = Xor(Or(A, B), C);
1071     break;
1072   case 0x57:
1073     if (ABCIsConst)
1074       Res = Nand(Or(A, B), C);
1075     break;
1076   case 0x58:
1077     if (ABCIsConst)
1078       Res = Nor(Nor(A, B), Xnor(A, C));
1079     break;
1080   case 0x59:
1081     if (ABCIsConst)
1082       Res = Xor(Or(A, Not(B)), C);
1083     break;
1084   case 0x5a:
1085     Res = Xor(A, C);
1086     break;
1087   case 0x5b:
1088     if (ABCIsConst)
1089       Res = Xor(A, Or(Nor(A, B), C));
1090     break;
1091   case 0x5c:
1092     if (ABCIsConst)
1093       Res = Xor(Or(Xor(B, C), A), C);
1094     break;
1095   case 0x5d:
1096     if (ABCIsConst)
1097       Res = Nand(Or(A, Not(B)), C);
1098     break;
1099   case 0x5e:
1100     if (ABCIsConst)
1101       Res = Xor(A, Or(Nor(A, Not(B)), C));
1102     break;
1103   case 0x5f:
1104     if (ACIsConst)
1105       Res = Nand(A, C);
1106     break;
1107   case 0x60:
1108     if (ABCIsConst)
1109       Res = And(A, Xor(B, C));
1110     break;
1111   case 0x61:
1112     if (ABCIsConst)
1113       Res = Xor(Or(Xnor(A, B), And(B, C)), C);
1114     break;
1115   case 0x62:
1116     if (ABCIsConst)
1117       Res = Nor(Nor(A, C), Xnor(B, C));
1118     break;
1119   case 0x63:
1120     if (ABCIsConst)
1121       Res = Xor(B, Or(C, Not(A)));
1122     break;
1123   case 0x64:
1124     if (ABCIsConst)
1125       Res = Nor(Nor(A, B), Xnor(B, C));
1126     break;
1127   case 0x65:
1128     if (ABCIsConst)
1129       Res = Xor(Or(B, Not(A)), C);
1130     break;
1131   case 0x66:
1132     Res = Xor(B, C);
1133     break;
1134   case 0x67:
1135     if (ABCIsConst)
1136       Res = Or(Nor(A, B), Xor(B, C));
1137     break;
1138   case 0x68:
1139     if (ABCIsConst)
1140       Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
1141     break;
1142   case 0x69:
1143     if (ABCIsConst)
1144       Res = Xor(Xnor(A, B), C);
1145     break;
1146   case 0x6a:
1147     if (ABCIsConst)
1148       Res = Xor(And(A, B), C);
1149     break;
1150   case 0x6b:
1151     if (ABCIsConst)
1152       Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
1153     break;
1154   case 0x6c:
1155     if (ABCIsConst)
1156       Res = Xor(And(A, C), B);
1157     break;
1158   case 0x6d:
1159     if (ABCIsConst)
1160       Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
1161     break;
1162   case 0x6e:
1163     if (ABCIsConst)
1164       Res = Or(Nor(A, Not(B)), Xor(B, C));
1165     break;
1166   case 0x6f:
1167     if (ABCIsConst)
1168       Res = Nand(A, Xnor(B, C));
1169     break;
1170   case 0x70:
1171     if (ABCIsConst)
1172       Res = And(A, Nand(B, C));
1173     break;
1174   case 0x71:
1175     if (ABCIsConst)
1176       Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
1177     break;
1178   case 0x72:
1179     if (ABCIsConst)
1180       Res = Xor(Or(Xor(A, B), C), B);
1181     break;
1182   case 0x73:
1183     if (ABCIsConst)
1184       Res = Nand(Nand(A, Not(C)), B);
1185     break;
1186   case 0x74:
1187     if (ABCIsConst)
1188       Res = Xor(Or(Xor(A, C), B), C);
1189     break;
1190   case 0x75:
1191     if (ABCIsConst)
1192       Res = Nand(Nand(A, Not(B)), C);
1193     break;
1194   case 0x76:
1195     if (ABCIsConst)
1196       Res = Xor(B, Or(Nor(B, Not(A)), C));
1197     break;
1198   case 0x77:
1199     if (BCIsConst)
1200       Res = Nand(B, C);
1201     break;
1202   case 0x78:
1203     if (ABCIsConst)
1204       Res = Xor(A, And(B, C));
1205     break;
1206   case 0x79:
1207     if (ABCIsConst)
1208       Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
1209     break;
1210   case 0x7a:
1211     if (ABCIsConst)
1212       Res = Or(Xor(A, C), Nor(B, Not(A)));
1213     break;
1214   case 0x7b:
1215     if (ABCIsConst)
1216       Res = Nand(Xnor(A, C), B);
1217     break;
1218   case 0x7c:
1219     if (ABCIsConst)
1220       Res = Or(Xor(A, B), Nor(C, Not(A)));
1221     break;
1222   case 0x7d:
1223     if (ABCIsConst)
1224       Res = Nand(Xnor(A, B), C);
1225     break;
1226   case 0x7e:
1227     if (ABCIsConst)
1228       Res = Or(Xor(A, B), Xor(A, C));
1229     break;
1230   case 0x7f:
1231     if (ABCIsConst)
1232       Res = Nand(And(A, B), C);
1233     break;
1234   case 0x80:
1235     if (ABCIsConst)
1236       Res = And(And(A, B), C);
1237     break;
1238   case 0x81:
1239     if (ABCIsConst)
1240       Res = Nor(Xor(A, B), Xor(A, C));
1241     break;
1242   case 0x82:
1243     if (ABCIsConst)
1244       Res = And(Xnor(A, B), C);
1245     break;
1246   case 0x83:
1247     if (ABCIsConst)
1248       Res = Nor(Xor(A, B), Nor(C, Not(A)));
1249     break;
1250   case 0x84:
1251     if (ABCIsConst)
1252       Res = And(Xnor(A, C), B);
1253     break;
1254   case 0x85:
1255     if (ABCIsConst)
1256       Res = Nor(Xor(A, C), Nor(B, Not(A)));
1257     break;
1258   case 0x86:
1259     if (ABCIsConst)
1260       Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
1261     break;
1262   case 0x87:
1263     if (ABCIsConst)
1264       Res = Xor(A, Nand(B, C));
1265     break;
1266   case 0x88:
1267     Res = And(B, C);
1268     break;
1269   case 0x89:
1270     if (ABCIsConst)
1271       Res = Xor(B, Nor(Nor(B, Not(A)), C));
1272     break;
1273   case 0x8a:
1274     if (ABCIsConst)
1275       Res = And(Nand(A, Not(B)), C);
1276     break;
1277   case 0x8b:
1278     if (ABCIsConst)
1279       Res = Xor(Nor(Xor(A, C), B), C);
1280     break;
1281   case 0x8c:
1282     if (ABCIsConst)
1283       Res = And(Nand(A, Not(C)), B);
1284     break;
1285   case 0x8d:
1286     if (ABCIsConst)
1287       Res = Xor(Nor(Xor(A, B), C), B);
1288     break;
1289   case 0x8e:
1290     if (ABCIsConst)
1291       Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
1292     break;
1293   case 0x8f:
1294     if (ABCIsConst)
1295       Res = Nand(A, Nand(B, C));
1296     break;
1297   case 0x90:
1298     if (ABCIsConst)
1299       Res = And(A, Xnor(B, C));
1300     break;
1301   case 0x91:
1302     if (ABCIsConst)
1303       Res = Nor(Nor(A, Not(B)), Xor(B, C));
1304     break;
1305   case 0x92:
1306     if (ABCIsConst)
1307       Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
1308     break;
1309   case 0x93:
1310     if (ABCIsConst)
1311       Res = Xor(Nand(A, C), B);
1312     break;
1313   case 0x94:
1314     if (ABCIsConst)
1315       Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
1316     break;
1317   case 0x95:
1318     if (ABCIsConst)
1319       Res = Xor(Nand(A, B), C);
1320     break;
1321   case 0x96:
1322     if (ABCIsConst)
1323       Res = Xor(Xor(A, B), C);
1324     break;
1325   case 0x97:
1326     if (ABCIsConst)
1327       Res = Xor(Xor(A, B), Or(Nor(A, B), C));
1328     break;
1329   case 0x98:
1330     if (ABCIsConst)
1331       Res = Nor(Nor(A, B), Xor(B, C));
1332     break;
1333   case 0x99:
1334     if (BCIsConst)
1335       Res = Xnor(B, C);
1336     break;
1337   case 0x9a:
1338     if (ABCIsConst)
1339       Res = Xor(Nor(B, Not(A)), C);
1340     break;
1341   case 0x9b:
1342     if (ABCIsConst)
1343       Res = Or(Nor(A, B), Xnor(B, C));
1344     break;
1345   case 0x9c:
1346     if (ABCIsConst)
1347       Res = Xor(B, Nor(C, Not(A)));
1348     break;
1349   case 0x9d:
1350     if (ABCIsConst)
1351       Res = Or(Nor(A, C), Xnor(B, C));
1352     break;
1353   case 0x9e:
1354     if (ABCIsConst)
1355       Res = Xor(And(Xor(A, B), Nand(B, C)), C);
1356     break;
1357   case 0x9f:
1358     if (ABCIsConst)
1359       Res = Nand(A, Xor(B, C));
1360     break;
1361   case 0xa0:
1362     Res = And(A, C);
1363     break;
1364   case 0xa1:
1365     if (ABCIsConst)
1366       Res = Xor(A, Nor(Nor(A, Not(B)), C));
1367     break;
1368   case 0xa2:
1369     if (ABCIsConst)
1370       Res = And(Or(A, Not(B)), C);
1371     break;
1372   case 0xa3:
1373     if (ABCIsConst)
1374       Res = Xor(Nor(Xor(B, C), A), C);
1375     break;
1376   case 0xa4:
1377     if (ABCIsConst)
1378       Res = Xor(A, Nor(Nor(A, B), C));
1379     break;
1380   case 0xa5:
1381     if (ACIsConst)
1382       Res = Xnor(A, C);
1383     break;
1384   case 0xa6:
1385     if (ABCIsConst)
1386       Res = Xor(Nor(A, Not(B)), C);
1387     break;
1388   case 0xa7:
1389     if (ABCIsConst)
1390       Res = Or(Nor(A, B), Xnor(A, C));
1391     break;
1392   case 0xa8:
1393     if (ABCIsConst)
1394       Res = And(Or(A, B), C);
1395     break;
1396   case 0xa9:
1397     if (ABCIsConst)
1398       Res = Xor(Nor(A, B), C);
1399     break;
1400   case 0xaa:
1401     Res = C;
1402     break;
1403   case 0xab:
1404     if (ABCIsConst)
1405       Res = Or(Nor(A, B), C);
1406     break;
1407   case 0xac:
1408     if (ABCIsConst)
1409       Res = Xor(Nor(Xnor(B, C), A), C);
1410     break;
1411   case 0xad:
1412     if (ABCIsConst)
1413       Res = Or(Xnor(A, C), And(B, C));
1414     break;
1415   case 0xae:
1416     if (ABCIsConst)
1417       Res = Or(Nor(A, Not(B)), C);
1418     break;
1419   case 0xaf:
1420     if (ACIsConst)
1421       Res = Or(C, Not(A));
1422     break;
1423   case 0xb0:
1424     if (ABCIsConst)
1425       Res = And(A, Nand(B, Not(C)));
1426     break;
1427   case 0xb1:
1428     if (ABCIsConst)
1429       Res = Xor(A, Nor(Xor(A, B), C));
1430     break;
1431   case 0xb2:
1432     if (ABCIsConst)
1433       Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
1434     break;
1435   case 0xb3:
1436     if (ABCIsConst)
1437       Res = Nand(Nand(A, C), B);
1438     break;
1439   case 0xb4:
1440     if (ABCIsConst)
1441       Res = Xor(A, Nor(C, Not(B)));
1442     break;
1443   case 0xb5:
1444     if (ABCIsConst)
1445       Res = Or(Xnor(A, C), Nor(B, C));
1446     break;
1447   case 0xb6:
1448     if (ABCIsConst)
1449       Res = Xor(And(Xor(A, B), Nand(A, C)), C);
1450     break;
1451   case 0xb7:
1452     if (ABCIsConst)
1453       Res = Nand(Xor(A, C), B);
1454     break;
1455   case 0xb8:
1456     if (ABCIsConst)
1457       Res = Xor(Nor(Xnor(A, C), B), C);
1458     break;
1459   case 0xb9:
1460     if (ABCIsConst)
1461       Res = Xor(Nor(And(A, C), B), C);
1462     break;
1463   case 0xba:
1464     if (ABCIsConst)
1465       Res = Or(Nor(B, Not(A)), C);
1466     break;
1467   case 0xbb:
1468     if (BCIsConst)
1469       Res = Or(C, Not(B));
1470     break;
1471   case 0xbc:
1472     if (ABCIsConst)
1473       Res = Xor(A, And(Nand(A, C), B));
1474     break;
1475   case 0xbd:
1476     if (ABCIsConst)
1477       Res = Or(Xor(A, B), Xnor(A, C));
1478     break;
1479   case 0xbe:
1480     if (ABCIsConst)
1481       Res = Or(Xor(A, B), C);
1482     break;
1483   case 0xbf:
1484     if (ABCIsConst)
1485       Res = Or(Nand(A, B), C);
1486     break;
1487   case 0xc0:
1488     Res = And(A, B);
1489     break;
1490   case 0xc1:
1491     if (ABCIsConst)
1492       Res = Xor(A, Nor(Nor(A, Not(C)), B));
1493     break;
1494   case 0xc2:
1495     if (ABCIsConst)
1496       Res = Xor(A, Nor(Nor(A, C), B));
1497     break;
1498   case 0xc3:
1499     if (ABIsConst)
1500       Res = Xnor(A, B);
1501     break;
1502   case 0xc4:
1503     if (ABCIsConst)
1504       Res = And(Or(A, Not(C)), B);
1505     break;
1506   case 0xc5:
1507     if (ABCIsConst)
1508       Res = Xor(B, Nor(A, Xor(B, C)));
1509     break;
1510   case 0xc6:
1511     if (ABCIsConst)
1512       Res = Xor(Nor(A, Not(C)), B);
1513     break;
1514   case 0xc7:
1515     if (ABCIsConst)
1516       Res = Or(Xnor(A, B), Nor(A, C));
1517     break;
1518   case 0xc8:
1519     if (ABCIsConst)
1520       Res = And(Or(A, C), B);
1521     break;
1522   case 0xc9:
1523     if (ABCIsConst)
1524       Res = Xor(Nor(A, C), B);
1525     break;
1526   case 0xca:
1527     if (ABCIsConst)
1528       Res = Xor(B, Nor(A, Xnor(B, C)));
1529     break;
1530   case 0xcb:
1531     if (ABCIsConst)
1532       Res = Or(Xnor(A, B), And(B, C));
1533     break;
1534   case 0xcc:
1535     Res = B;
1536     break;
1537   case 0xcd:
1538     if (ABCIsConst)
1539       Res = Or(Nor(A, C), B);
1540     break;
1541   case 0xce:
1542     if (ABCIsConst)
1543       Res = Or(Nor(A, Not(C)), B);
1544     break;
1545   case 0xcf:
1546     if (ABIsConst)
1547       Res = Or(B, Not(A));
1548     break;
1549   case 0xd0:
1550     if (ABCIsConst)
1551       Res = And(A, Or(B, Not(C)));
1552     break;
1553   case 0xd1:
1554     if (ABCIsConst)
1555       Res = Xor(A, Nor(Xor(A, C), B));
1556     break;
1557   case 0xd2:
1558     if (ABCIsConst)
1559       Res = Xor(A, Nor(B, Not(C)));
1560     break;
1561   case 0xd3:
1562     if (ABCIsConst)
1563       Res = Or(Xnor(A, B), Nor(B, C));
1564     break;
1565   case 0xd4:
1566     if (ABCIsConst)
1567       Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
1568     break;
1569   case 0xd5:
1570     if (ABCIsConst)
1571       Res = Nand(Nand(A, B), C);
1572     break;
1573   case 0xd6:
1574     if (ABCIsConst)
1575       Res = Xor(Xor(A, B), Or(And(A, B), C));
1576     break;
1577   case 0xd7:
1578     if (ABCIsConst)
1579       Res = Nand(Xor(A, B), C);
1580     break;
1581   case 0xd8:
1582     if (ABCIsConst)
1583       Res = Xor(Nor(Xnor(A, B), C), B);
1584     break;
1585   case 0xd9:
1586     if (ABCIsConst)
1587       Res = Or(And(A, B), Xnor(B, C));
1588     break;
1589   case 0xda:
1590     if (ABCIsConst)
1591       Res = Xor(A, And(Nand(A, B), C));
1592     break;
1593   case 0xdb:
1594     if (ABCIsConst)
1595       Res = Or(Xnor(A, B), Xor(A, C));
1596     break;
1597   case 0xdc:
1598     if (ABCIsConst)
1599       Res = Or(B, Nor(C, Not(A)));
1600     break;
1601   case 0xdd:
1602     if (BCIsConst)
1603       Res = Or(B, Not(C));
1604     break;
1605   case 0xde:
1606     if (ABCIsConst)
1607       Res = Or(Xor(A, C), B);
1608     break;
1609   case 0xdf:
1610     if (ABCIsConst)
1611       Res = Or(Nand(A, C), B);
1612     break;
1613   case 0xe0:
1614     if (ABCIsConst)
1615       Res = And(A, Or(B, C));
1616     break;
1617   case 0xe1:
1618     if (ABCIsConst)
1619       Res = Xor(A, Nor(B, C));
1620     break;
1621   case 0xe2:
1622     if (ABCIsConst)
1623       Res = Xor(A, Nor(Xnor(A, C), B));
1624     break;
1625   case 0xe3:
1626     if (ABCIsConst)
1627       Res = Xor(A, Nor(And(A, C), B));
1628     break;
1629   case 0xe4:
1630     if (ABCIsConst)
1631       Res = Xor(A, Nor(Xnor(A, B), C));
1632     break;
1633   case 0xe5:
1634     if (ABCIsConst)
1635       Res = Xor(A, Nor(And(A, B), C));
1636     break;
1637   case 0xe6:
1638     if (ABCIsConst)
1639       Res = Or(And(A, B), Xor(B, C));
1640     break;
1641   case 0xe7:
1642     if (ABCIsConst)
1643       Res = Or(Xnor(A, B), Xnor(A, C));
1644     break;
1645   case 0xe8:
1646     if (ABCIsConst)
1647       Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
1648     break;
1649   case 0xe9:
1650     if (ABCIsConst)
1651       Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
1652     break;
1653   case 0xea:
1654     if (ABCIsConst)
1655       Res = Or(And(A, B), C);
1656     break;
1657   case 0xeb:
1658     if (ABCIsConst)
1659       Res = Or(Xnor(A, B), C);
1660     break;
1661   case 0xec:
1662     if (ABCIsConst)
1663       Res = Or(And(A, C), B);
1664     break;
1665   case 0xed:
1666     if (ABCIsConst)
1667       Res = Or(Xnor(A, C), B);
1668     break;
1669   case 0xee:
1670     Res = Or(B, C);
1671     break;
1672   case 0xef:
1673     if (ABCIsConst)
1674       Res = Nand(A, Nor(B, C));
1675     break;
1676   case 0xf0:
1677     Res = A;
1678     break;
1679   case 0xf1:
1680     if (ABCIsConst)
1681       Res = Or(A, Nor(B, C));
1682     break;
1683   case 0xf2:
1684     if (ABCIsConst)
1685       Res = Or(A, Nor(B, Not(C)));
1686     break;
1687   case 0xf3:
1688     if (ABIsConst)
1689       Res = Or(A, Not(B));
1690     break;
1691   case 0xf4:
1692     if (ABCIsConst)
1693       Res = Or(A, Nor(C, Not(B)));
1694     break;
1695   case 0xf5:
1696     if (ACIsConst)
1697       Res = Or(A, Not(C));
1698     break;
1699   case 0xf6:
1700     if (ABCIsConst)
1701       Res = Or(A, Xor(B, C));
1702     break;
1703   case 0xf7:
1704     if (ABCIsConst)
1705       Res = Or(A, Nand(B, C));
1706     break;
1707   case 0xf8:
1708     if (ABCIsConst)
1709       Res = Or(A, And(B, C));
1710     break;
1711   case 0xf9:
1712     if (ABCIsConst)
1713       Res = Or(A, Xnor(B, C));
1714     break;
1715   case 0xfa:
1716     Res = Or(A, C);
1717     break;
1718   case 0xfb:
1719     if (ABCIsConst)
1720       Res = Nand(Nor(A, C), B);
1721     break;
1722   case 0xfc:
1723     Res = Or(A, B);
1724     break;
1725   case 0xfd:
1726     if (ABCIsConst)
1727       Res = Nand(Nor(A, B), C);
1728     break;
1729   case 0xfe:
1730     if (ABCIsConst)
1731       Res = Or(Or(A, B), C);
1732     break;
1733   case 0xff:
1734     Res = {Constant::getAllOnesValue(Ty), 0xff};
1735     break;
1736   }
1737
1738   assert((Res.first == nullptr || Res.second == Imm) &&
1739          "Simplification of ternary logic does not verify!");
1740   return Res.first;
1741 }
1742
1743 static Value *simplifyX86insertps(const IntrinsicInst &II,
1744                                   InstCombiner::BuilderTy &Builder) {
1745   auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
1746   if (!CInt)
1747     return nullptr;
1748
1749   auto *VecTy = cast<FixedVectorType>(II.getType());
1750   assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
1751
1752   // The immediate permute control byte looks like this:
1753   //    [3:0] - zero mask for each 32-bit lane
1754   //    [5:4] - select one 32-bit destination lane
1755   //    [7:6] - select one 32-bit source lane
1756
1757   uint8_t Imm = CInt->getZExtValue();
1758   uint8_t ZMask = Imm & 0xf;
1759   uint8_t DestLane = (Imm >> 4) & 0x3;
1760   uint8_t SourceLane = (Imm >> 6) & 0x3;
1761
1762   ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
1763
1764   // If all zero mask bits are set, this was just a weird way to
1765   // generate a zero vector.
1766   if (ZMask == 0xf)
1767     return ZeroVector;
1768
1769   // Initialize by passing all of the first source bits through.
1770   int ShuffleMask[4] = {0, 1, 2, 3};
1771
1772   // We may replace the second operand with the zero vector.
1773   Value *V1 = II.getArgOperand(1);
1774
1775   if (ZMask) {
1776     // If the zero mask is being used with a single input or the zero mask
1777     // overrides the destination lane, this is a shuffle with the zero vector.
1778     if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
1779         (ZMask & (1 << DestLane))) {
1780       V1 = ZeroVector;
1781       // We may still move 32-bits of the first source vector from one lane
1782       // to another.
1783       ShuffleMask[DestLane] = SourceLane;
1784       // The zero mask may override the previous insert operation.
1785       for (unsigned i = 0; i < 4; ++i)
1786         if ((ZMask >> i) & 0x1)
1787           ShuffleMask[i] = i + 4;
1788     } else {
1789       // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
1790       return nullptr;
1791     }
1792   } else {
1793     // Replace the selected destination lane with the selected source lane.
1794     ShuffleMask[DestLane] = SourceLane + 4;
1795   }
1796
1797   return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
1798 }
1799
1800 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
1801 /// or conversion to a shuffle vector.
1802 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
1803                                ConstantInt *CILength, ConstantInt *CIIndex,
1804                                InstCombiner::BuilderTy &Builder) {
1805   auto LowConstantHighUndef = [&](uint64_t Val) {
1806     Type *IntTy64 = Type::getInt64Ty(II.getContext());
1807     Constant *Args[] = {ConstantInt::get(IntTy64, Val),
1808                         UndefValue::get(IntTy64)};
1809     return ConstantVector::get(Args);
1810   };
1811
1812   // See if we're dealing with constant values.
1813   auto *C0 = dyn_cast<Constant>(Op0);
1814   auto *CI0 =
1815       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1816          : nullptr;
1817
1818   // Attempt to constant fold.
1819   if (CILength && CIIndex) {
1820     // From AMD documentation: "The bit index and field length are each six
1821     // bits in length other bits of the field are ignored."
1822     APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
1823     APInt APLength = CILength->getValue().zextOrTrunc(6);
1824
1825     unsigned Index = APIndex.getZExtValue();
1826
1827     // From AMD documentation: "a value of zero in the field length is
1828     // defined as length of 64".
1829     unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1830
1831     // From AMD documentation: "If the sum of the bit index + length field
1832     // is greater than 64, the results are undefined".
1833     unsigned End = Index + Length;
1834
1835     // Note that both field index and field length are 8-bit quantities.
1836     // Since variables 'Index' and 'Length' are unsigned values
1837     // obtained from zero-extending field index and field length
1838     // respectively, their sum should never wrap around.
1839     if (End > 64)
1840       return UndefValue::get(II.getType());
1841
1842     // If we are inserting whole bytes, we can convert this to a shuffle.
1843     // Lowering can recognize EXTRQI shuffle masks.
1844     if ((Length % 8) == 0 && (Index % 8) == 0) {
1845       // Convert bit indices to byte indices.
1846       Length /= 8;
1847       Index /= 8;
1848
1849       Type *IntTy8 = Type::getInt8Ty(II.getContext());
1850       auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1851
1852       SmallVector<int, 16> ShuffleMask;
1853       for (int i = 0; i != (int)Length; ++i)
1854         ShuffleMask.push_back(i + Index);
1855       for (int i = Length; i != 8; ++i)
1856         ShuffleMask.push_back(i + 16);
1857       for (int i = 8; i != 16; ++i)
1858         ShuffleMask.push_back(-1);
1859
1860       Value *SV = Builder.CreateShuffleVector(
1861           Builder.CreateBitCast(Op0, ShufTy),
1862           ConstantAggregateZero::get(ShufTy), ShuffleMask);
1863       return Builder.CreateBitCast(SV, II.getType());
1864     }
1865
1866     // Constant Fold - shift Index'th bit to lowest position and mask off
1867     // Length bits.
1868     if (CI0) {
1869       APInt Elt = CI0->getValue();
1870       Elt.lshrInPlace(Index);
1871       Elt = Elt.zextOrTrunc(Length);
1872       return LowConstantHighUndef(Elt.getZExtValue());
1873     }
1874
1875     // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
1876     if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
1877       Value *Args[] = {Op0, CILength, CIIndex};
1878       return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_extrqi, {}, Args);
1879     }
1880   }
1881
1882   // Constant Fold - extraction from zero is always {zero, undef}.
1883   if (CI0 && CI0->isZero())
1884     return LowConstantHighUndef(0);
1885
1886   return nullptr;
1887 }
1888
1889 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
1890 /// folding or conversion to a shuffle vector.
1891 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
1892                                  APInt APLength, APInt APIndex,
1893                                  InstCombiner::BuilderTy &Builder) {
1894   // From AMD documentation: "The bit index and field length are each six bits
1895   // in length other bits of the field are ignored."
1896   APIndex = APIndex.zextOrTrunc(6);
1897   APLength = APLength.zextOrTrunc(6);
1898
1899   // Attempt to constant fold.
1900   unsigned Index = APIndex.getZExtValue();
1901
1902   // From AMD documentation: "a value of zero in the field length is
1903   // defined as length of 64".
1904   unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1905
1906   // From AMD documentation: "If the sum of the bit index + length field
1907   // is greater than 64, the results are undefined".
1908   unsigned End = Index + Length;
1909
1910   // Note that both field index and field length are 8-bit quantities.
1911   // Since variables 'Index' and 'Length' are unsigned values
1912   // obtained from zero-extending field index and field length
1913   // respectively, their sum should never wrap around.
1914   if (End > 64)
1915     return UndefValue::get(II.getType());
1916
1917   // If we are inserting whole bytes, we can convert this to a shuffle.
1918   // Lowering can recognize INSERTQI shuffle masks.
1919   if ((Length % 8) == 0 && (Index % 8) == 0) {
1920     // Convert bit indices to byte indices.
1921     Length /= 8;
1922     Index /= 8;
1923
1924     Type *IntTy8 = Type::getInt8Ty(II.getContext());
1925     auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1926
1927     SmallVector<int, 16> ShuffleMask;
1928     for (int i = 0; i != (int)Index; ++i)
1929       ShuffleMask.push_back(i);
1930     for (int i = 0; i != (int)Length; ++i)
1931       ShuffleMask.push_back(i + 16);
1932     for (int i = Index + Length; i != 8; ++i)
1933       ShuffleMask.push_back(i);
1934     for (int i = 8; i != 16; ++i)
1935       ShuffleMask.push_back(-1);
1936
1937     Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
1938                                             Builder.CreateBitCast(Op1, ShufTy),
1939                                             ShuffleMask);
1940     return Builder.CreateBitCast(SV, II.getType());
1941   }
1942
1943   // See if we're dealing with constant values.
1944   auto *C0 = dyn_cast<Constant>(Op0);
1945   auto *C1 = dyn_cast<Constant>(Op1);
1946   auto *CI00 =
1947       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1948          : nullptr;
1949   auto *CI10 =
1950       C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1951          : nullptr;
1952
1953   // Constant Fold - insert bottom Length bits starting at the Index'th bit.
1954   if (CI00 && CI10) {
1955     APInt V00 = CI00->getValue();
1956     APInt V10 = CI10->getValue();
1957     APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
1958     V00 = V00 & ~Mask;
1959     V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
1960     APInt Val = V00 | V10;
1961     Type *IntTy64 = Type::getInt64Ty(II.getContext());
1962     Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
1963                         UndefValue::get(IntTy64)};
1964     return ConstantVector::get(Args);
1965   }
1966
1967   // If we were an INSERTQ call, we'll save demanded elements if we convert to
1968   // INSERTQI.
1969   if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
1970     Type *IntTy8 = Type::getInt8Ty(II.getContext());
1971     Constant *CILength = ConstantInt::get(IntTy8, Length, false);
1972     Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
1973
1974     Value *Args[] = {Op0, Op1, CILength, CIIndex};
1975     return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_insertqi, {}, Args);
1976   }
1977
1978   return nullptr;
1979 }
1980
1981 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
1982 static Value *simplifyX86pshufb(const IntrinsicInst &II,
1983                                 InstCombiner::BuilderTy &Builder) {
1984   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1985   if (!V)
1986     return nullptr;
1987
1988   auto *VecTy = cast<FixedVectorType>(II.getType());
1989   unsigned NumElts = VecTy->getNumElements();
1990   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
1991          "Unexpected number of elements in shuffle mask!");
1992
1993   // Construct a shuffle mask from constant integers or UNDEFs.
1994   int Indexes[64];
1995
1996   // Each byte in the shuffle control mask forms an index to permute the
1997   // corresponding byte in the destination operand.
1998   for (unsigned I = 0; I < NumElts; ++I) {
1999     Constant *COp = V->getAggregateElement(I);
2000     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2001       return nullptr;
2002
2003     if (isa<UndefValue>(COp)) {
2004       Indexes[I] = -1;
2005       continue;
2006     }
2007
2008     int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
2009
2010     // If the most significant bit (bit[7]) of each byte of the shuffle
2011     // control mask is set, then zero is written in the result byte.
2012     // The zero vector is in the right-hand side of the resulting
2013     // shufflevector.
2014
2015     // The value of each index for the high 128-bit lane is the least
2016     // significant 4 bits of the respective shuffle control byte.
2017     Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
2018     Indexes[I] = Index;
2019   }
2020
2021   auto V1 = II.getArgOperand(0);
2022   auto V2 = Constant::getNullValue(VecTy);
2023   return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
2024 }
2025
2026 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
2027 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
2028                                     InstCombiner::BuilderTy &Builder) {
2029   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2030   if (!V)
2031     return nullptr;
2032
2033   auto *VecTy = cast<FixedVectorType>(II.getType());
2034   unsigned NumElts = VecTy->getNumElements();
2035   bool IsPD = VecTy->getScalarType()->isDoubleTy();
2036   unsigned NumLaneElts = IsPD ? 2 : 4;
2037   assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
2038
2039   // Construct a shuffle mask from constant integers or UNDEFs.
2040   int Indexes[16];
2041
2042   // The intrinsics only read one or two bits, clear the rest.
2043   for (unsigned I = 0; I < NumElts; ++I) {
2044     Constant *COp = V->getAggregateElement(I);
2045     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2046       return nullptr;
2047
2048     if (isa<UndefValue>(COp)) {
2049       Indexes[I] = -1;
2050       continue;
2051     }
2052
2053     APInt Index = cast<ConstantInt>(COp)->getValue();
2054     Index = Index.zextOrTrunc(32).getLoBits(2);
2055
2056     // The PD variants uses bit 1 to select per-lane element index, so
2057     // shift down to convert to generic shuffle mask index.
2058     if (IsPD)
2059       Index.lshrInPlace(1);
2060
2061     // The _256 variants are a bit trickier since the mask bits always index
2062     // into the corresponding 128 half. In order to convert to a generic
2063     // shuffle, we have to make that explicit.
2064     Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
2065
2066     Indexes[I] = Index.getZExtValue();
2067   }
2068
2069   auto V1 = II.getArgOperand(0);
2070   return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
2071 }
2072
2073 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
2074 static Value *simplifyX86vpermv(const IntrinsicInst &II,
2075                                 InstCombiner::BuilderTy &Builder) {
2076   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2077   if (!V)
2078     return nullptr;
2079
2080   auto *VecTy = cast<FixedVectorType>(II.getType());
2081   unsigned Size = VecTy->getNumElements();
2082   assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
2083          "Unexpected shuffle mask size");
2084
2085   // Construct a shuffle mask from constant integers or UNDEFs.
2086   int Indexes[64];
2087
2088   for (unsigned I = 0; I < Size; ++I) {
2089     Constant *COp = V->getAggregateElement(I);
2090     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2091       return nullptr;
2092
2093     if (isa<UndefValue>(COp)) {
2094       Indexes[I] = -1;
2095       continue;
2096     }
2097
2098     uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2099     Index &= Size - 1;
2100     Indexes[I] = Index;
2101   }
2102
2103   auto V1 = II.getArgOperand(0);
2104   return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
2105 }
2106
2107 /// Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant.
2108 static Value *simplifyX86vpermv3(const IntrinsicInst &II,
2109                                  InstCombiner::BuilderTy &Builder) {
2110   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2111   if (!V)
2112     return nullptr;
2113
2114   auto *VecTy = cast<FixedVectorType>(II.getType());
2115   unsigned Size = VecTy->getNumElements();
2116   assert((Size == 2 || Size == 4 || Size == 8 || Size == 16 || Size == 32 ||
2117           Size == 64) &&
2118          "Unexpected shuffle mask size");
2119
2120   // Construct a shuffle mask from constant integers or UNDEFs.
2121   int Indexes[64];
2122
2123   for (unsigned I = 0; I < Size; ++I) {
2124     Constant *COp = V->getAggregateElement(I);
2125     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2126       return nullptr;
2127
2128     if (isa<UndefValue>(COp)) {
2129       Indexes[I] = -1;
2130       continue;
2131     }
2132
2133     uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2134     Index &= (2 * Size) - 1;
2135     Indexes[I] = Index;
2136   }
2137
2138   auto V1 = II.getArgOperand(0);
2139   auto V2 = II.getArgOperand(2);
2140   return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size));
2141 }
2142
2143 // Simplify VPERMV/VPERMV3 mask - only demand the active index bits.
2144 static bool simplifyX86VPERMMask(Instruction *II, bool IsBinary,
2145                                  InstCombiner &IC) {
2146   auto *VecTy = cast<FixedVectorType>(II->getType());
2147   unsigned EltSizeInBits = VecTy->getScalarSizeInBits();
2148   unsigned NumElts = VecTy->getNumElements();
2149   assert(isPowerOf2_32(NumElts) && isPowerOf2_32(EltSizeInBits) &&
2150          "Unexpected shuffle mask size");
2151
2152   unsigned IdxSizeInBits = Log2_32(IsBinary ? (2 * NumElts) : NumElts);
2153   APInt DemandedMask = APInt::getLowBitsSet(EltSizeInBits, IdxSizeInBits);
2154
2155   KnownBits KnownMask(EltSizeInBits);
2156   return IC.SimplifyDemandedBits(II, /*OpNo=*/1, DemandedMask, KnownMask);
2157 }
2158
2159 std::optional<Instruction *>
2160 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
2161   auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
2162                                              unsigned DemandedWidth) {
2163     APInt UndefElts(Width, 0);
2164     APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
2165     return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
2166   };
2167
2168   Intrinsic::ID IID = II.getIntrinsicID();
2169   switch (IID) {
2170   case Intrinsic::x86_bmi_bextr_32:
2171   case Intrinsic::x86_bmi_bextr_64:
2172   case Intrinsic::x86_tbm_bextri_u32:
2173   case Intrinsic::x86_tbm_bextri_u64:
2174     // If the RHS is a constant we can try some simplifications.
2175     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2176       uint64_t Shift = C->getZExtValue();
2177       uint64_t Length = (Shift >> 8) & 0xff;
2178       Shift &= 0xff;
2179       unsigned BitWidth = II.getType()->getIntegerBitWidth();
2180       // If the length is 0 or the shift is out of range, replace with zero.
2181       if (Length == 0 || Shift >= BitWidth) {
2182         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2183       }
2184       // If the LHS is also a constant, we can completely constant fold this.
2185       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2186         uint64_t Result = InC->getZExtValue() >> Shift;
2187         if (Length > BitWidth)
2188           Length = BitWidth;
2189         Result &= maskTrailingOnes<uint64_t>(Length);
2190         return IC.replaceInstUsesWith(II,
2191                                       ConstantInt::get(II.getType(), Result));
2192       }
2193       // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2194       // are only masking bits that a shift already cleared?
2195     }
2196     break;
2197
2198   case Intrinsic::x86_bmi_bzhi_32:
2199   case Intrinsic::x86_bmi_bzhi_64:
2200     // If the RHS is a constant we can try some simplifications.
2201     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2202       uint64_t Index = C->getZExtValue() & 0xff;
2203       unsigned BitWidth = II.getType()->getIntegerBitWidth();
2204       if (Index >= BitWidth) {
2205         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2206       }
2207       if (Index == 0) {
2208         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2209       }
2210       // If the LHS is also a constant, we can completely constant fold this.
2211       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2212         uint64_t Result = InC->getZExtValue();
2213         Result &= maskTrailingOnes<uint64_t>(Index);
2214         return IC.replaceInstUsesWith(II,
2215                                       ConstantInt::get(II.getType(), Result));
2216       }
2217       // TODO should we convert this to an AND if the RHS is constant?
2218     }
2219     break;
2220   case Intrinsic::x86_bmi_pext_32:
2221   case Intrinsic::x86_bmi_pext_64:
2222     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2223       if (MaskC->isNullValue()) {
2224         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2225       }
2226       if (MaskC->isAllOnesValue()) {
2227         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2228       }
2229
2230       unsigned MaskIdx, MaskLen;
2231       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2232         // any single contingous sequence of 1s anywhere in the mask simply
2233         // describes a subset of the input bits shifted to the appropriate
2234         // position.  Replace with the straight forward IR.
2235         Value *Input = II.getArgOperand(0);
2236         Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2237         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2238         Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2239         return IC.replaceInstUsesWith(II, Shifted);
2240       }
2241
2242       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2243         uint64_t Src = SrcC->getZExtValue();
2244         uint64_t Mask = MaskC->getZExtValue();
2245         uint64_t Result = 0;
2246         uint64_t BitToSet = 1;
2247
2248         while (Mask) {
2249           // Isolate lowest set bit.
2250           uint64_t BitToTest = Mask & -Mask;
2251           if (BitToTest & Src)
2252             Result |= BitToSet;
2253
2254           BitToSet <<= 1;
2255           // Clear lowest set bit.
2256           Mask &= Mask - 1;
2257         }
2258
2259         return IC.replaceInstUsesWith(II,
2260                                       ConstantInt::get(II.getType(), Result));
2261       }
2262     }
2263     break;
2264   case Intrinsic::x86_bmi_pdep_32:
2265   case Intrinsic::x86_bmi_pdep_64:
2266     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2267       if (MaskC->isNullValue()) {
2268         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2269       }
2270       if (MaskC->isAllOnesValue()) {
2271         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2272       }
2273
2274       unsigned MaskIdx, MaskLen;
2275       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2276         // any single contingous sequence of 1s anywhere in the mask simply
2277         // describes a subset of the input bits shifted to the appropriate
2278         // position.  Replace with the straight forward IR.
2279         Value *Input = II.getArgOperand(0);
2280         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2281         Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2282         Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2283         return IC.replaceInstUsesWith(II, Masked);
2284       }
2285
2286       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2287         uint64_t Src = SrcC->getZExtValue();
2288         uint64_t Mask = MaskC->getZExtValue();
2289         uint64_t Result = 0;
2290         uint64_t BitToTest = 1;
2291
2292         while (Mask) {
2293           // Isolate lowest set bit.
2294           uint64_t BitToSet = Mask & -Mask;
2295           if (BitToTest & Src)
2296             Result |= BitToSet;
2297
2298           BitToTest <<= 1;
2299           // Clear lowest set bit;
2300           Mask &= Mask - 1;
2301         }
2302
2303         return IC.replaceInstUsesWith(II,
2304                                       ConstantInt::get(II.getType(), Result));
2305       }
2306     }
2307     break;
2308
2309   case Intrinsic::x86_sse_cvtss2si:
2310   case Intrinsic::x86_sse_cvtss2si64:
2311   case Intrinsic::x86_sse_cvttss2si:
2312   case Intrinsic::x86_sse_cvttss2si64:
2313   case Intrinsic::x86_sse2_cvtsd2si:
2314   case Intrinsic::x86_sse2_cvtsd2si64:
2315   case Intrinsic::x86_sse2_cvttsd2si:
2316   case Intrinsic::x86_sse2_cvttsd2si64:
2317   case Intrinsic::x86_avx512_vcvtss2si32:
2318   case Intrinsic::x86_avx512_vcvtss2si64:
2319   case Intrinsic::x86_avx512_vcvtss2usi32:
2320   case Intrinsic::x86_avx512_vcvtss2usi64:
2321   case Intrinsic::x86_avx512_vcvtsd2si32:
2322   case Intrinsic::x86_avx512_vcvtsd2si64:
2323   case Intrinsic::x86_avx512_vcvtsd2usi32:
2324   case Intrinsic::x86_avx512_vcvtsd2usi64:
2325   case Intrinsic::x86_avx512_cvttss2si:
2326   case Intrinsic::x86_avx512_cvttss2si64:
2327   case Intrinsic::x86_avx512_cvttss2usi:
2328   case Intrinsic::x86_avx512_cvttss2usi64:
2329   case Intrinsic::x86_avx512_cvttsd2si:
2330   case Intrinsic::x86_avx512_cvttsd2si64:
2331   case Intrinsic::x86_avx512_cvttsd2usi:
2332   case Intrinsic::x86_avx512_cvttsd2usi64: {
2333     // These intrinsics only demand the 0th element of their input vectors. If
2334     // we can simplify the input based on that, do so now.
2335     Value *Arg = II.getArgOperand(0);
2336     unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
2337     if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
2338       return IC.replaceOperand(II, 0, V);
2339     }
2340     break;
2341   }
2342
2343   case Intrinsic::x86_mmx_pmovmskb:
2344   case Intrinsic::x86_sse_movmsk_ps:
2345   case Intrinsic::x86_sse2_movmsk_pd:
2346   case Intrinsic::x86_sse2_pmovmskb_128:
2347   case Intrinsic::x86_avx_movmsk_pd_256:
2348   case Intrinsic::x86_avx_movmsk_ps_256:
2349   case Intrinsic::x86_avx2_pmovmskb:
2350     if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
2351       return IC.replaceInstUsesWith(II, V);
2352     }
2353     break;
2354
2355   case Intrinsic::x86_sse_comieq_ss:
2356   case Intrinsic::x86_sse_comige_ss:
2357   case Intrinsic::x86_sse_comigt_ss:
2358   case Intrinsic::x86_sse_comile_ss:
2359   case Intrinsic::x86_sse_comilt_ss:
2360   case Intrinsic::x86_sse_comineq_ss:
2361   case Intrinsic::x86_sse_ucomieq_ss:
2362   case Intrinsic::x86_sse_ucomige_ss:
2363   case Intrinsic::x86_sse_ucomigt_ss:
2364   case Intrinsic::x86_sse_ucomile_ss:
2365   case Intrinsic::x86_sse_ucomilt_ss:
2366   case Intrinsic::x86_sse_ucomineq_ss:
2367   case Intrinsic::x86_sse2_comieq_sd:
2368   case Intrinsic::x86_sse2_comige_sd:
2369   case Intrinsic::x86_sse2_comigt_sd:
2370   case Intrinsic::x86_sse2_comile_sd:
2371   case Intrinsic::x86_sse2_comilt_sd:
2372   case Intrinsic::x86_sse2_comineq_sd:
2373   case Intrinsic::x86_sse2_ucomieq_sd:
2374   case Intrinsic::x86_sse2_ucomige_sd:
2375   case Intrinsic::x86_sse2_ucomigt_sd:
2376   case Intrinsic::x86_sse2_ucomile_sd:
2377   case Intrinsic::x86_sse2_ucomilt_sd:
2378   case Intrinsic::x86_sse2_ucomineq_sd:
2379   case Intrinsic::x86_avx512_vcomi_ss:
2380   case Intrinsic::x86_avx512_vcomi_sd:
2381   case Intrinsic::x86_avx512_mask_cmp_ss:
2382   case Intrinsic::x86_avx512_mask_cmp_sd: {
2383     // These intrinsics only demand the 0th element of their input vectors. If
2384     // we can simplify the input based on that, do so now.
2385     bool MadeChange = false;
2386     Value *Arg0 = II.getArgOperand(0);
2387     Value *Arg1 = II.getArgOperand(1);
2388     unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
2389     if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2390       IC.replaceOperand(II, 0, V);
2391       MadeChange = true;
2392     }
2393     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2394       IC.replaceOperand(II, 1, V);
2395       MadeChange = true;
2396     }
2397     if (MadeChange) {
2398       return &II;
2399     }
2400     break;
2401   }
2402
2403   case Intrinsic::x86_avx512_add_ps_512:
2404   case Intrinsic::x86_avx512_div_ps_512:
2405   case Intrinsic::x86_avx512_mul_ps_512:
2406   case Intrinsic::x86_avx512_sub_ps_512:
2407   case Intrinsic::x86_avx512_add_pd_512:
2408   case Intrinsic::x86_avx512_div_pd_512:
2409   case Intrinsic::x86_avx512_mul_pd_512:
2410   case Intrinsic::x86_avx512_sub_pd_512:
2411     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2412     // IR operations.
2413     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2414       if (R->getValue() == 4) {
2415         Value *Arg0 = II.getArgOperand(0);
2416         Value *Arg1 = II.getArgOperand(1);
2417
2418         Value *V;
2419         switch (IID) {
2420         default:
2421           llvm_unreachable("Case stmts out of sync!");
2422         case Intrinsic::x86_avx512_add_ps_512:
2423         case Intrinsic::x86_avx512_add_pd_512:
2424           V = IC.Builder.CreateFAdd(Arg0, Arg1);
2425           break;
2426         case Intrinsic::x86_avx512_sub_ps_512:
2427         case Intrinsic::x86_avx512_sub_pd_512:
2428           V = IC.Builder.CreateFSub(Arg0, Arg1);
2429           break;
2430         case Intrinsic::x86_avx512_mul_ps_512:
2431         case Intrinsic::x86_avx512_mul_pd_512:
2432           V = IC.Builder.CreateFMul(Arg0, Arg1);
2433           break;
2434         case Intrinsic::x86_avx512_div_ps_512:
2435         case Intrinsic::x86_avx512_div_pd_512:
2436           V = IC.Builder.CreateFDiv(Arg0, Arg1);
2437           break;
2438         }
2439
2440         return IC.replaceInstUsesWith(II, V);
2441       }
2442     }
2443     break;
2444
2445   case Intrinsic::x86_avx512_mask_add_ss_round:
2446   case Intrinsic::x86_avx512_mask_div_ss_round:
2447   case Intrinsic::x86_avx512_mask_mul_ss_round:
2448   case Intrinsic::x86_avx512_mask_sub_ss_round:
2449   case Intrinsic::x86_avx512_mask_add_sd_round:
2450   case Intrinsic::x86_avx512_mask_div_sd_round:
2451   case Intrinsic::x86_avx512_mask_mul_sd_round:
2452   case Intrinsic::x86_avx512_mask_sub_sd_round:
2453     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2454     // IR operations.
2455     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
2456       if (R->getValue() == 4) {
2457         // Extract the element as scalars.
2458         Value *Arg0 = II.getArgOperand(0);
2459         Value *Arg1 = II.getArgOperand(1);
2460         Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
2461         Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
2462
2463         Value *V;
2464         switch (IID) {
2465         default:
2466           llvm_unreachable("Case stmts out of sync!");
2467         case Intrinsic::x86_avx512_mask_add_ss_round:
2468         case Intrinsic::x86_avx512_mask_add_sd_round:
2469           V = IC.Builder.CreateFAdd(LHS, RHS);
2470           break;
2471         case Intrinsic::x86_avx512_mask_sub_ss_round:
2472         case Intrinsic::x86_avx512_mask_sub_sd_round:
2473           V = IC.Builder.CreateFSub(LHS, RHS);
2474           break;
2475         case Intrinsic::x86_avx512_mask_mul_ss_round:
2476         case Intrinsic::x86_avx512_mask_mul_sd_round:
2477           V = IC.Builder.CreateFMul(LHS, RHS);
2478           break;
2479         case Intrinsic::x86_avx512_mask_div_ss_round:
2480         case Intrinsic::x86_avx512_mask_div_sd_round:
2481           V = IC.Builder.CreateFDiv(LHS, RHS);
2482           break;
2483         }
2484
2485         // Handle the masking aspect of the intrinsic.
2486         Value *Mask = II.getArgOperand(3);
2487         auto *C = dyn_cast<ConstantInt>(Mask);
2488         // We don't need a select if we know the mask bit is a 1.
2489         if (!C || !C->getValue()[0]) {
2490           // Cast the mask to an i1 vector and then extract the lowest element.
2491           auto *MaskTy = FixedVectorType::get(
2492               IC.Builder.getInt1Ty(),
2493               cast<IntegerType>(Mask->getType())->getBitWidth());
2494           Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
2495           Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
2496           // Extract the lowest element from the passthru operand.
2497           Value *Passthru =
2498               IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
2499           V = IC.Builder.CreateSelect(Mask, V, Passthru);
2500         }
2501
2502         // Insert the result back into the original argument 0.
2503         V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
2504
2505         return IC.replaceInstUsesWith(II, V);
2506       }
2507     }
2508     break;
2509
2510   // Constant fold ashr( <A x Bi>, Ci ).
2511   // Constant fold lshr( <A x Bi>, Ci ).
2512   // Constant fold shl( <A x Bi>, Ci ).
2513   case Intrinsic::x86_sse2_psrai_d:
2514   case Intrinsic::x86_sse2_psrai_w:
2515   case Intrinsic::x86_avx2_psrai_d:
2516   case Intrinsic::x86_avx2_psrai_w:
2517   case Intrinsic::x86_avx512_psrai_q_128:
2518   case Intrinsic::x86_avx512_psrai_q_256:
2519   case Intrinsic::x86_avx512_psrai_d_512:
2520   case Intrinsic::x86_avx512_psrai_q_512:
2521   case Intrinsic::x86_avx512_psrai_w_512:
2522   case Intrinsic::x86_sse2_psrli_d:
2523   case Intrinsic::x86_sse2_psrli_q:
2524   case Intrinsic::x86_sse2_psrli_w:
2525   case Intrinsic::x86_avx2_psrli_d:
2526   case Intrinsic::x86_avx2_psrli_q:
2527   case Intrinsic::x86_avx2_psrli_w:
2528   case Intrinsic::x86_avx512_psrli_d_512:
2529   case Intrinsic::x86_avx512_psrli_q_512:
2530   case Intrinsic::x86_avx512_psrli_w_512:
2531   case Intrinsic::x86_sse2_pslli_d:
2532   case Intrinsic::x86_sse2_pslli_q:
2533   case Intrinsic::x86_sse2_pslli_w:
2534   case Intrinsic::x86_avx2_pslli_d:
2535   case Intrinsic::x86_avx2_pslli_q:
2536   case Intrinsic::x86_avx2_pslli_w:
2537   case Intrinsic::x86_avx512_pslli_d_512:
2538   case Intrinsic::x86_avx512_pslli_q_512:
2539   case Intrinsic::x86_avx512_pslli_w_512:
2540     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2541       return IC.replaceInstUsesWith(II, V);
2542     }
2543     break;
2544
2545   case Intrinsic::x86_sse2_psra_d:
2546   case Intrinsic::x86_sse2_psra_w:
2547   case Intrinsic::x86_avx2_psra_d:
2548   case Intrinsic::x86_avx2_psra_w:
2549   case Intrinsic::x86_avx512_psra_q_128:
2550   case Intrinsic::x86_avx512_psra_q_256:
2551   case Intrinsic::x86_avx512_psra_d_512:
2552   case Intrinsic::x86_avx512_psra_q_512:
2553   case Intrinsic::x86_avx512_psra_w_512:
2554   case Intrinsic::x86_sse2_psrl_d:
2555   case Intrinsic::x86_sse2_psrl_q:
2556   case Intrinsic::x86_sse2_psrl_w:
2557   case Intrinsic::x86_avx2_psrl_d:
2558   case Intrinsic::x86_avx2_psrl_q:
2559   case Intrinsic::x86_avx2_psrl_w:
2560   case Intrinsic::x86_avx512_psrl_d_512:
2561   case Intrinsic::x86_avx512_psrl_q_512:
2562   case Intrinsic::x86_avx512_psrl_w_512:
2563   case Intrinsic::x86_sse2_psll_d:
2564   case Intrinsic::x86_sse2_psll_q:
2565   case Intrinsic::x86_sse2_psll_w:
2566   case Intrinsic::x86_avx2_psll_d:
2567   case Intrinsic::x86_avx2_psll_q:
2568   case Intrinsic::x86_avx2_psll_w:
2569   case Intrinsic::x86_avx512_psll_d_512:
2570   case Intrinsic::x86_avx512_psll_q_512:
2571   case Intrinsic::x86_avx512_psll_w_512: {
2572     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2573       return IC.replaceInstUsesWith(II, V);
2574     }
2575
2576     // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
2577     // operand to compute the shift amount.
2578     Value *Arg1 = II.getArgOperand(1);
2579     assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
2580            "Unexpected packed shift size");
2581     unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
2582
2583     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
2584       return IC.replaceOperand(II, 1, V);
2585     }
2586     break;
2587   }
2588
2589   case Intrinsic::x86_avx2_psllv_d:
2590   case Intrinsic::x86_avx2_psllv_d_256:
2591   case Intrinsic::x86_avx2_psllv_q:
2592   case Intrinsic::x86_avx2_psllv_q_256:
2593   case Intrinsic::x86_avx512_psllv_d_512:
2594   case Intrinsic::x86_avx512_psllv_q_512:
2595   case Intrinsic::x86_avx512_psllv_w_128:
2596   case Intrinsic::x86_avx512_psllv_w_256:
2597   case Intrinsic::x86_avx512_psllv_w_512:
2598   case Intrinsic::x86_avx2_psrav_d:
2599   case Intrinsic::x86_avx2_psrav_d_256:
2600   case Intrinsic::x86_avx512_psrav_q_128:
2601   case Intrinsic::x86_avx512_psrav_q_256:
2602   case Intrinsic::x86_avx512_psrav_d_512:
2603   case Intrinsic::x86_avx512_psrav_q_512:
2604   case Intrinsic::x86_avx512_psrav_w_128:
2605   case Intrinsic::x86_avx512_psrav_w_256:
2606   case Intrinsic::x86_avx512_psrav_w_512:
2607   case Intrinsic::x86_avx2_psrlv_d:
2608   case Intrinsic::x86_avx2_psrlv_d_256:
2609   case Intrinsic::x86_avx2_psrlv_q:
2610   case Intrinsic::x86_avx2_psrlv_q_256:
2611   case Intrinsic::x86_avx512_psrlv_d_512:
2612   case Intrinsic::x86_avx512_psrlv_q_512:
2613   case Intrinsic::x86_avx512_psrlv_w_128:
2614   case Intrinsic::x86_avx512_psrlv_w_256:
2615   case Intrinsic::x86_avx512_psrlv_w_512:
2616     if (Value *V = simplifyX86varShift(II, IC.Builder)) {
2617       return IC.replaceInstUsesWith(II, V);
2618     }
2619     break;
2620
2621   case Intrinsic::x86_sse2_packssdw_128:
2622   case Intrinsic::x86_sse2_packsswb_128:
2623   case Intrinsic::x86_avx2_packssdw:
2624   case Intrinsic::x86_avx2_packsswb:
2625   case Intrinsic::x86_avx512_packssdw_512:
2626   case Intrinsic::x86_avx512_packsswb_512:
2627     if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
2628       return IC.replaceInstUsesWith(II, V);
2629     }
2630     break;
2631
2632   case Intrinsic::x86_sse2_packuswb_128:
2633   case Intrinsic::x86_sse41_packusdw:
2634   case Intrinsic::x86_avx2_packusdw:
2635   case Intrinsic::x86_avx2_packuswb:
2636   case Intrinsic::x86_avx512_packusdw_512:
2637   case Intrinsic::x86_avx512_packuswb_512:
2638     if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
2639       return IC.replaceInstUsesWith(II, V);
2640     }
2641     break;
2642
2643   case Intrinsic::x86_sse2_pmulh_w:
2644   case Intrinsic::x86_avx2_pmulh_w:
2645   case Intrinsic::x86_avx512_pmulh_w_512:
2646     if (Value *V = simplifyX86pmulh(II, IC.Builder, true, false)) {
2647       return IC.replaceInstUsesWith(II, V);
2648     }
2649     break;
2650
2651   case Intrinsic::x86_sse2_pmulhu_w:
2652   case Intrinsic::x86_avx2_pmulhu_w:
2653   case Intrinsic::x86_avx512_pmulhu_w_512:
2654     if (Value *V = simplifyX86pmulh(II, IC.Builder, false, false)) {
2655       return IC.replaceInstUsesWith(II, V);
2656     }
2657     break;
2658
2659   case Intrinsic::x86_ssse3_pmul_hr_sw_128:
2660   case Intrinsic::x86_avx2_pmul_hr_sw:
2661   case Intrinsic::x86_avx512_pmul_hr_sw_512:
2662     if (Value *V = simplifyX86pmulh(II, IC.Builder, true, true)) {
2663       return IC.replaceInstUsesWith(II, V);
2664     }
2665     break;
2666
2667   case Intrinsic::x86_sse2_pmadd_wd:
2668   case Intrinsic::x86_avx2_pmadd_wd:
2669   case Intrinsic::x86_avx512_pmaddw_d_512:
2670     if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) {
2671       return IC.replaceInstUsesWith(II, V);
2672     }
2673     break;
2674
2675   case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
2676   case Intrinsic::x86_avx2_pmadd_ub_sw:
2677   case Intrinsic::x86_avx512_pmaddubs_w_512:
2678     if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) {
2679       return IC.replaceInstUsesWith(II, V);
2680     }
2681     break;
2682
2683   case Intrinsic::x86_pclmulqdq:
2684   case Intrinsic::x86_pclmulqdq_256:
2685   case Intrinsic::x86_pclmulqdq_512: {
2686     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2687       unsigned Imm = C->getZExtValue();
2688
2689       bool MadeChange = false;
2690       Value *Arg0 = II.getArgOperand(0);
2691       Value *Arg1 = II.getArgOperand(1);
2692       unsigned VWidth =
2693           cast<FixedVectorType>(Arg0->getType())->getNumElements();
2694
2695       APInt UndefElts1(VWidth, 0);
2696       APInt DemandedElts1 =
2697           APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
2698       if (Value *V =
2699               IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
2700         IC.replaceOperand(II, 0, V);
2701         MadeChange = true;
2702       }
2703
2704       APInt UndefElts2(VWidth, 0);
2705       APInt DemandedElts2 =
2706           APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
2707       if (Value *V =
2708               IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
2709         IC.replaceOperand(II, 1, V);
2710         MadeChange = true;
2711       }
2712
2713       // If either input elements are undef, the result is zero.
2714       if (DemandedElts1.isSubsetOf(UndefElts1) ||
2715           DemandedElts2.isSubsetOf(UndefElts2)) {
2716         return IC.replaceInstUsesWith(II,
2717                                       ConstantAggregateZero::get(II.getType()));
2718       }
2719
2720       if (MadeChange) {
2721         return &II;
2722       }
2723     }
2724     break;
2725   }
2726
2727   case Intrinsic::x86_sse41_insertps:
2728     if (Value *V = simplifyX86insertps(II, IC.Builder)) {
2729       return IC.replaceInstUsesWith(II, V);
2730     }
2731     break;
2732
2733   case Intrinsic::x86_sse4a_extrq: {
2734     Value *Op0 = II.getArgOperand(0);
2735     Value *Op1 = II.getArgOperand(1);
2736     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2737     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2738     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2739            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2740            VWidth1 == 16 && "Unexpected operand sizes");
2741
2742     // See if we're dealing with constant values.
2743     auto *C1 = dyn_cast<Constant>(Op1);
2744     auto *CILength =
2745         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
2746            : nullptr;
2747     auto *CIIndex =
2748         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2749            : nullptr;
2750
2751     // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
2752     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2753       return IC.replaceInstUsesWith(II, V);
2754     }
2755
2756     // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
2757     // operands and the lowest 16-bits of the second.
2758     bool MadeChange = false;
2759     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2760       IC.replaceOperand(II, 0, V);
2761       MadeChange = true;
2762     }
2763     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
2764       IC.replaceOperand(II, 1, V);
2765       MadeChange = true;
2766     }
2767     if (MadeChange) {
2768       return &II;
2769     }
2770     break;
2771   }
2772
2773   case Intrinsic::x86_sse4a_extrqi: {
2774     // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
2775     // bits of the lower 64-bits. The upper 64-bits are undefined.
2776     Value *Op0 = II.getArgOperand(0);
2777     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2778     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2779            "Unexpected operand size");
2780
2781     // See if we're dealing with constant values.
2782     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
2783     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
2784
2785     // Attempt to simplify to a constant or shuffle vector.
2786     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2787       return IC.replaceInstUsesWith(II, V);
2788     }
2789
2790     // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
2791     // operand.
2792     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2793       return IC.replaceOperand(II, 0, V);
2794     }
2795     break;
2796   }
2797
2798   case Intrinsic::x86_sse4a_insertq: {
2799     Value *Op0 = II.getArgOperand(0);
2800     Value *Op1 = II.getArgOperand(1);
2801     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2802     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2803            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2804            cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
2805            "Unexpected operand size");
2806
2807     // See if we're dealing with constant values.
2808     auto *C1 = dyn_cast<Constant>(Op1);
2809     auto *CI11 =
2810         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2811            : nullptr;
2812
2813     // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
2814     if (CI11) {
2815       const APInt &V11 = CI11->getValue();
2816       APInt Len = V11.zextOrTrunc(6);
2817       APInt Idx = V11.lshr(8).zextOrTrunc(6);
2818       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2819         return IC.replaceInstUsesWith(II, V);
2820       }
2821     }
2822
2823     // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
2824     // operand.
2825     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2826       return IC.replaceOperand(II, 0, V);
2827     }
2828     break;
2829   }
2830
2831   case Intrinsic::x86_sse4a_insertqi: {
2832     // INSERTQI: Extract lowest Length bits from lower half of second source and
2833     // insert over first source starting at Index bit. The upper 64-bits are
2834     // undefined.
2835     Value *Op0 = II.getArgOperand(0);
2836     Value *Op1 = II.getArgOperand(1);
2837     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2838     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2839     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2840            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2841            VWidth1 == 2 && "Unexpected operand sizes");
2842
2843     // See if we're dealing with constant values.
2844     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
2845     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
2846
2847     // Attempt to simplify to a constant or shuffle vector.
2848     if (CILength && CIIndex) {
2849       APInt Len = CILength->getValue().zextOrTrunc(6);
2850       APInt Idx = CIIndex->getValue().zextOrTrunc(6);
2851       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2852         return IC.replaceInstUsesWith(II, V);
2853       }
2854     }
2855
2856     // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
2857     // operands.
2858     bool MadeChange = false;
2859     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2860       IC.replaceOperand(II, 0, V);
2861       MadeChange = true;
2862     }
2863     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
2864       IC.replaceOperand(II, 1, V);
2865       MadeChange = true;
2866     }
2867     if (MadeChange) {
2868       return &II;
2869     }
2870     break;
2871   }
2872
2873   case Intrinsic::x86_sse41_pblendvb:
2874   case Intrinsic::x86_sse41_blendvps:
2875   case Intrinsic::x86_sse41_blendvpd:
2876   case Intrinsic::x86_avx_blendv_ps_256:
2877   case Intrinsic::x86_avx_blendv_pd_256:
2878   case Intrinsic::x86_avx2_pblendvb: {
2879     // fold (blend A, A, Mask) -> A
2880     Value *Op0 = II.getArgOperand(0);
2881     Value *Op1 = II.getArgOperand(1);
2882     Value *Mask = II.getArgOperand(2);
2883     if (Op0 == Op1) {
2884       return IC.replaceInstUsesWith(II, Op0);
2885     }
2886
2887     // Zero Mask - select 1st argument.
2888     if (isa<ConstantAggregateZero>(Mask)) {
2889       return IC.replaceInstUsesWith(II, Op0);
2890     }
2891
2892     // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
2893     if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2894       Constant *NewSelector =
2895           getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
2896       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2897     }
2898
2899     Mask = InstCombiner::peekThroughBitcast(Mask);
2900
2901     // Peek through a one-use shuffle - VectorCombine should have simplified
2902     // this for cases where we're splitting wider vectors to use blendv
2903     // intrinsics.
2904     Value *MaskSrc = nullptr;
2905     ArrayRef<int> ShuffleMask;
2906     if (match(Mask, m_OneUse(m_Shuffle(m_Value(MaskSrc), m_Undef(),
2907                                        m_Mask(ShuffleMask))))) {
2908       // Bail if the shuffle was irregular or contains undefs.
2909       int NumElts = cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2910       if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) ||
2911           any_of(ShuffleMask,
2912                  [NumElts](int M) { return M < 0 || M >= NumElts; }))
2913         break;
2914       Mask = InstCombiner::peekThroughBitcast(MaskSrc);
2915     }
2916
2917     // Convert to a vector select if we can bypass casts and find a boolean
2918     // vector condition value.
2919     Value *BoolVec;
2920     if (match(Mask, m_SExt(m_Value(BoolVec))) &&
2921         BoolVec->getType()->isVectorTy() &&
2922         BoolVec->getType()->getScalarSizeInBits() == 1) {
2923       auto *MaskTy = cast<FixedVectorType>(Mask->getType());
2924       auto *OpTy = cast<FixedVectorType>(II.getType());
2925       unsigned NumMaskElts = MaskTy->getNumElements();
2926       unsigned NumOperandElts = OpTy->getNumElements();
2927
2928       // If we peeked through a shuffle, reapply the shuffle to the bool vector.
2929       if (MaskSrc) {
2930         unsigned NumMaskSrcElts =
2931             cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2932         NumMaskElts = (ShuffleMask.size() * NumMaskElts) / NumMaskSrcElts;
2933         // Multiple mask bits maps to the same operand element - bail out.
2934         if (NumMaskElts > NumOperandElts)
2935           break;
2936         SmallVector<int> ScaledMask;
2937         if (!llvm::scaleShuffleMaskElts(NumMaskElts, ShuffleMask, ScaledMask))
2938           break;
2939         BoolVec = IC.Builder.CreateShuffleVector(BoolVec, ScaledMask);
2940         MaskTy = FixedVectorType::get(MaskTy->getElementType(), NumMaskElts);
2941       }
2942       assert(MaskTy->getPrimitiveSizeInBits() ==
2943                  OpTy->getPrimitiveSizeInBits() &&
2944              "Not expecting mask and operands with different sizes");
2945
2946       if (NumMaskElts == NumOperandElts) {
2947         return SelectInst::Create(BoolVec, Op1, Op0);
2948       }
2949
2950       // If the mask has less elements than the operands, each mask bit maps to
2951       // multiple elements of the operands. Bitcast back and forth.
2952       if (NumMaskElts < NumOperandElts) {
2953         Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
2954         Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
2955         Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
2956         return new BitCastInst(Sel, II.getType());
2957       }
2958     }
2959
2960     break;
2961   }
2962
2963   case Intrinsic::x86_ssse3_pshuf_b_128:
2964   case Intrinsic::x86_avx2_pshuf_b:
2965   case Intrinsic::x86_avx512_pshuf_b_512: {
2966     if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
2967       return IC.replaceInstUsesWith(II, V);
2968     }
2969
2970     KnownBits KnownMask(8);
2971     if (IC.SimplifyDemandedBits(&II, 1, APInt(8, 0b10001111), KnownMask))
2972       return &II;
2973     break;
2974   }
2975
2976   case Intrinsic::x86_avx_vpermilvar_ps:
2977   case Intrinsic::x86_avx_vpermilvar_ps_256:
2978   case Intrinsic::x86_avx512_vpermilvar_ps_512: {
2979     if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2980       return IC.replaceInstUsesWith(II, V);
2981     }
2982
2983     KnownBits KnownMask(32);
2984     if (IC.SimplifyDemandedBits(&II, 1, APInt(32, 0b00011), KnownMask))
2985       return &II;
2986     break;
2987   }
2988
2989   case Intrinsic::x86_avx_vpermilvar_pd:
2990   case Intrinsic::x86_avx_vpermilvar_pd_256:
2991   case Intrinsic::x86_avx512_vpermilvar_pd_512: {
2992     if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2993       return IC.replaceInstUsesWith(II, V);
2994     }
2995
2996     KnownBits KnownMask(64);
2997     if (IC.SimplifyDemandedBits(&II, 1, APInt(64, 0b00010), KnownMask))
2998       return &II;
2999     break;
3000   }
3001
3002   case Intrinsic::x86_avx2_permd:
3003   case Intrinsic::x86_avx2_permps:
3004   case Intrinsic::x86_avx512_permvar_df_256:
3005   case Intrinsic::x86_avx512_permvar_df_512:
3006   case Intrinsic::x86_avx512_permvar_di_256:
3007   case Intrinsic::x86_avx512_permvar_di_512:
3008   case Intrinsic::x86_avx512_permvar_hi_128:
3009   case Intrinsic::x86_avx512_permvar_hi_256:
3010   case Intrinsic::x86_avx512_permvar_hi_512:
3011   case Intrinsic::x86_avx512_permvar_qi_128:
3012   case Intrinsic::x86_avx512_permvar_qi_256:
3013   case Intrinsic::x86_avx512_permvar_qi_512:
3014   case Intrinsic::x86_avx512_permvar_sf_512:
3015   case Intrinsic::x86_avx512_permvar_si_512:
3016     if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
3017       return IC.replaceInstUsesWith(II, V);
3018     }
3019     if (simplifyX86VPERMMask(&II, /*IsBinary=*/false, IC))
3020       return &II;
3021     break;
3022
3023   case Intrinsic::x86_avx512_vpermi2var_d_128:
3024   case Intrinsic::x86_avx512_vpermi2var_d_256:
3025   case Intrinsic::x86_avx512_vpermi2var_d_512:
3026   case Intrinsic::x86_avx512_vpermi2var_hi_128:
3027   case Intrinsic::x86_avx512_vpermi2var_hi_256:
3028   case Intrinsic::x86_avx512_vpermi2var_hi_512:
3029   case Intrinsic::x86_avx512_vpermi2var_pd_128:
3030   case Intrinsic::x86_avx512_vpermi2var_pd_256:
3031   case Intrinsic::x86_avx512_vpermi2var_pd_512:
3032   case Intrinsic::x86_avx512_vpermi2var_ps_128:
3033   case Intrinsic::x86_avx512_vpermi2var_ps_256:
3034   case Intrinsic::x86_avx512_vpermi2var_ps_512:
3035   case Intrinsic::x86_avx512_vpermi2var_q_128:
3036   case Intrinsic::x86_avx512_vpermi2var_q_256:
3037   case Intrinsic::x86_avx512_vpermi2var_q_512:
3038   case Intrinsic::x86_avx512_vpermi2var_qi_128:
3039   case Intrinsic::x86_avx512_vpermi2var_qi_256:
3040   case Intrinsic::x86_avx512_vpermi2var_qi_512:
3041     if (Value *V = simplifyX86vpermv3(II, IC.Builder)) {
3042       return IC.replaceInstUsesWith(II, V);
3043     }
3044     if (simplifyX86VPERMMask(&II, /*IsBinary=*/true, IC))
3045       return &II;
3046     break;
3047
3048   case Intrinsic::x86_avx_maskload_ps:
3049   case Intrinsic::x86_avx_maskload_pd:
3050   case Intrinsic::x86_avx_maskload_ps_256:
3051   case Intrinsic::x86_avx_maskload_pd_256:
3052   case Intrinsic::x86_avx2_maskload_d:
3053   case Intrinsic::x86_avx2_maskload_q:
3054   case Intrinsic::x86_avx2_maskload_d_256:
3055   case Intrinsic::x86_avx2_maskload_q_256:
3056     if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
3057       return I;
3058     }
3059     break;
3060
3061   case Intrinsic::x86_sse2_maskmov_dqu:
3062   case Intrinsic::x86_avx_maskstore_ps:
3063   case Intrinsic::x86_avx_maskstore_pd:
3064   case Intrinsic::x86_avx_maskstore_ps_256:
3065   case Intrinsic::x86_avx_maskstore_pd_256:
3066   case Intrinsic::x86_avx2_maskstore_d:
3067   case Intrinsic::x86_avx2_maskstore_q:
3068   case Intrinsic::x86_avx2_maskstore_d_256:
3069   case Intrinsic::x86_avx2_maskstore_q_256:
3070     if (simplifyX86MaskedStore(II, IC)) {
3071       return nullptr;
3072     }
3073     break;
3074
3075   case Intrinsic::x86_addcarry_32:
3076   case Intrinsic::x86_addcarry_64:
3077     if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
3078       return IC.replaceInstUsesWith(II, V);
3079     }
3080     break;
3081
3082   case Intrinsic::x86_avx512_pternlog_d_128:
3083   case Intrinsic::x86_avx512_pternlog_d_256:
3084   case Intrinsic::x86_avx512_pternlog_d_512:
3085   case Intrinsic::x86_avx512_pternlog_q_128:
3086   case Intrinsic::x86_avx512_pternlog_q_256:
3087   case Intrinsic::x86_avx512_pternlog_q_512:
3088     if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
3089       return IC.replaceInstUsesWith(II, V);
3090     }
3091     break;
3092   default:
3093     break;
3094   }
3095   return std::nullopt;
3096 }
3097
3098 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
3099     InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
3100     bool &KnownBitsComputed) const {
3101   switch (II.getIntrinsicID()) {
3102   default:
3103     break;
3104   case Intrinsic::x86_mmx_pmovmskb:
3105   case Intrinsic::x86_sse_movmsk_ps:
3106   case Intrinsic::x86_sse2_movmsk_pd:
3107   case Intrinsic::x86_sse2_pmovmskb_128:
3108   case Intrinsic::x86_avx_movmsk_ps_256:
3109   case Intrinsic::x86_avx_movmsk_pd_256:
3110   case Intrinsic::x86_avx2_pmovmskb: {
3111     // MOVMSK copies the vector elements' sign bits to the low bits
3112     // and zeros the high bits.
3113     unsigned ArgWidth;
3114     if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
3115       ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
3116     } else {
3117       auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
3118       ArgWidth = ArgType->getNumElements();
3119     }
3120
3121     // If we don't need any of low bits then return zero,
3122     // we know that DemandedMask is non-zero already.
3123     APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
3124     Type *VTy = II.getType();
3125     if (DemandedElts.isZero()) {
3126       return ConstantInt::getNullValue(VTy);
3127     }
3128
3129     // We know that the upper bits are set to zero.
3130     Known.Zero.setBitsFrom(ArgWidth);
3131     KnownBitsComputed = true;
3132     break;
3133   }
3134   }
3135   return std::nullopt;
3136 }
3137
3138 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
3139     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
3140     APInt &UndefElts2, APInt &UndefElts3,
3141     std::function<void(Instruction *, unsigned, APInt, APInt &)>
3142         simplifyAndSetOp) const {
3143   unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
3144   switch (II.getIntrinsicID()) {
3145   default:
3146     break;
3147   case Intrinsic::x86_xop_vfrcz_ss:
3148   case Intrinsic::x86_xop_vfrcz_sd:
3149     // The instructions for these intrinsics are speced to zero upper bits not
3150     // pass them through like other scalar intrinsics. So we shouldn't just
3151     // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
3152     // Instead we should return a zero vector.
3153     if (!DemandedElts[0]) {
3154       IC.addToWorklist(&II);
3155       return ConstantAggregateZero::get(II.getType());
3156     }
3157
3158     // Only the lower element is used.
3159     DemandedElts = 1;
3160     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3161
3162     // Only the lower element is undefined. The high elements are zero.
3163     UndefElts = UndefElts[0];
3164     break;
3165
3166   // Unary scalar-as-vector operations that work column-wise.
3167   case Intrinsic::x86_sse_rcp_ss:
3168   case Intrinsic::x86_sse_rsqrt_ss:
3169     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3170
3171     // If lowest element of a scalar op isn't used then use Arg0.
3172     if (!DemandedElts[0]) {
3173       IC.addToWorklist(&II);
3174       return II.getArgOperand(0);
3175     }
3176     // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
3177     // checks).
3178     break;
3179
3180   // Binary scalar-as-vector operations that work column-wise. The high
3181   // elements come from operand 0. The low element is a function of both
3182   // operands.
3183   case Intrinsic::x86_sse_min_ss:
3184   case Intrinsic::x86_sse_max_ss:
3185   case Intrinsic::x86_sse_cmp_ss:
3186   case Intrinsic::x86_sse2_min_sd:
3187   case Intrinsic::x86_sse2_max_sd:
3188   case Intrinsic::x86_sse2_cmp_sd: {
3189     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3190
3191     // If lowest element of a scalar op isn't used then use Arg0.
3192     if (!DemandedElts[0]) {
3193       IC.addToWorklist(&II);
3194       return II.getArgOperand(0);
3195     }
3196
3197     // Only lower element is used for operand 1.
3198     DemandedElts = 1;
3199     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3200
3201     // Lower element is undefined if both lower elements are undefined.
3202     // Consider things like undef&0.  The result is known zero, not undef.
3203     if (!UndefElts2[0])
3204       UndefElts.clearBit(0);
3205
3206     break;
3207   }
3208
3209   // Binary scalar-as-vector operations that work column-wise. The high
3210   // elements come from operand 0 and the low element comes from operand 1.
3211   case Intrinsic::x86_sse41_round_ss:
3212   case Intrinsic::x86_sse41_round_sd: {
3213     // Don't use the low element of operand 0.
3214     APInt DemandedElts2 = DemandedElts;
3215     DemandedElts2.clearBit(0);
3216     simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
3217
3218     // If lowest element of a scalar op isn't used then use Arg0.
3219     if (!DemandedElts[0]) {
3220       IC.addToWorklist(&II);
3221       return II.getArgOperand(0);
3222     }
3223
3224     // Only lower element is used for operand 1.
3225     DemandedElts = 1;
3226     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3227
3228     // Take the high undef elements from operand 0 and take the lower element
3229     // from operand 1.
3230     UndefElts.clearBit(0);
3231     UndefElts |= UndefElts2[0];
3232     break;
3233   }
3234
3235   // Three input scalar-as-vector operations that work column-wise. The high
3236   // elements come from operand 0 and the low element is a function of all
3237   // three inputs.
3238   case Intrinsic::x86_avx512_mask_add_ss_round:
3239   case Intrinsic::x86_avx512_mask_div_ss_round:
3240   case Intrinsic::x86_avx512_mask_mul_ss_round:
3241   case Intrinsic::x86_avx512_mask_sub_ss_round:
3242   case Intrinsic::x86_avx512_mask_max_ss_round:
3243   case Intrinsic::x86_avx512_mask_min_ss_round:
3244   case Intrinsic::x86_avx512_mask_add_sd_round:
3245   case Intrinsic::x86_avx512_mask_div_sd_round:
3246   case Intrinsic::x86_avx512_mask_mul_sd_round:
3247   case Intrinsic::x86_avx512_mask_sub_sd_round:
3248   case Intrinsic::x86_avx512_mask_max_sd_round:
3249   case Intrinsic::x86_avx512_mask_min_sd_round:
3250     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3251
3252     // If lowest element of a scalar op isn't used then use Arg0.
3253     if (!DemandedElts[0]) {
3254       IC.addToWorklist(&II);
3255       return II.getArgOperand(0);
3256     }
3257
3258     // Only lower element is used for operand 1 and 2.
3259     DemandedElts = 1;
3260     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3261     simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
3262
3263     // Lower element is undefined if all three lower elements are undefined.
3264     // Consider things like undef&0.  The result is known zero, not undef.
3265     if (!UndefElts2[0] || !UndefElts3[0])
3266       UndefElts.clearBit(0);
3267     break;
3268
3269   // TODO: Add fmaddsub support?
3270   case Intrinsic::x86_sse3_addsub_pd:
3271   case Intrinsic::x86_sse3_addsub_ps:
3272   case Intrinsic::x86_avx_addsub_pd_256:
3273   case Intrinsic::x86_avx_addsub_ps_256: {
3274     // If none of the even or none of the odd lanes are required, turn this
3275     // into a generic FP math instruction.
3276     APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
3277     APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
3278     bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
3279     bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
3280     if (IsSubOnly || IsAddOnly) {
3281       assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
3282       IRBuilderBase::InsertPointGuard Guard(IC.Builder);
3283       IC.Builder.SetInsertPoint(&II);
3284       Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
3285       return IC.Builder.CreateBinOp(
3286           IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
3287     }
3288
3289     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3290     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3291     UndefElts &= UndefElts2;
3292     break;
3293   }
3294
3295   // General per-element vector operations.
3296   case Intrinsic::x86_avx2_psllv_d:
3297   case Intrinsic::x86_avx2_psllv_d_256:
3298   case Intrinsic::x86_avx2_psllv_q:
3299   case Intrinsic::x86_avx2_psllv_q_256:
3300   case Intrinsic::x86_avx2_psrlv_d:
3301   case Intrinsic::x86_avx2_psrlv_d_256:
3302   case Intrinsic::x86_avx2_psrlv_q:
3303   case Intrinsic::x86_avx2_psrlv_q_256:
3304   case Intrinsic::x86_avx2_psrav_d:
3305   case Intrinsic::x86_avx2_psrav_d_256: {
3306     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3307     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3308     UndefElts &= UndefElts2;
3309     break;
3310   }
3311
3312   case Intrinsic::x86_sse2_pmulh_w:
3313   case Intrinsic::x86_avx2_pmulh_w:
3314   case Intrinsic::x86_avx512_pmulh_w_512:
3315   case Intrinsic::x86_sse2_pmulhu_w:
3316   case Intrinsic::x86_avx2_pmulhu_w:
3317   case Intrinsic::x86_avx512_pmulhu_w_512:
3318   case Intrinsic::x86_ssse3_pmul_hr_sw_128:
3319   case Intrinsic::x86_avx2_pmul_hr_sw:
3320   case Intrinsic::x86_avx512_pmul_hr_sw_512: {
3321     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3322     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3323     // NOTE: mulh(undef,undef) != undef.
3324     break;
3325   }
3326
3327   case Intrinsic::x86_sse2_packssdw_128:
3328   case Intrinsic::x86_sse2_packsswb_128:
3329   case Intrinsic::x86_sse2_packuswb_128:
3330   case Intrinsic::x86_sse41_packusdw:
3331   case Intrinsic::x86_avx2_packssdw:
3332   case Intrinsic::x86_avx2_packsswb:
3333   case Intrinsic::x86_avx2_packusdw:
3334   case Intrinsic::x86_avx2_packuswb:
3335   case Intrinsic::x86_avx512_packssdw_512:
3336   case Intrinsic::x86_avx512_packsswb_512:
3337   case Intrinsic::x86_avx512_packusdw_512:
3338   case Intrinsic::x86_avx512_packuswb_512: {
3339     auto *Ty0 = II.getArgOperand(0)->getType();
3340     unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
3341     assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
3342
3343     unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
3344     unsigned VWidthPerLane = VWidth / NumLanes;
3345     unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
3346
3347     // Per lane, pack the elements of the first input and then the second.
3348     // e.g.
3349     // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
3350     // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
3351     for (int OpNum = 0; OpNum != 2; ++OpNum) {
3352       APInt OpDemandedElts(InnerVWidth, 0);
3353       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3354         unsigned LaneIdx = Lane * VWidthPerLane;
3355         for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
3356           unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
3357           if (DemandedElts[Idx])
3358             OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
3359         }
3360       }
3361
3362       // Demand elements from the operand.
3363       APInt OpUndefElts(InnerVWidth, 0);
3364       simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
3365
3366       // Pack the operand's UNDEF elements, one lane at a time.
3367       OpUndefElts = OpUndefElts.zext(VWidth);
3368       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3369         APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
3370         LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
3371         LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
3372         UndefElts |= LaneElts;
3373       }
3374     }
3375     break;
3376   }
3377
3378   case Intrinsic::x86_sse2_pmadd_wd:
3379   case Intrinsic::x86_avx2_pmadd_wd:
3380   case Intrinsic::x86_avx512_pmaddw_d_512:
3381   case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
3382   case Intrinsic::x86_avx2_pmadd_ub_sw:
3383   case Intrinsic::x86_avx512_pmaddubs_w_512: {
3384     // PMADD - demand both src elements that map to each dst element.
3385     auto *ArgTy = II.getArgOperand(0)->getType();
3386     unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements();
3387     assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
3388     APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth);
3389     APInt Op0UndefElts(InnerVWidth, 0);
3390     APInt Op1UndefElts(InnerVWidth, 0);
3391     simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts);
3392     simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts);
3393     // NOTE: madd(undef,undef) != undef.
3394     break;
3395   }
3396
3397   // PSHUFB
3398   case Intrinsic::x86_ssse3_pshuf_b_128:
3399   case Intrinsic::x86_avx2_pshuf_b:
3400   case Intrinsic::x86_avx512_pshuf_b_512:
3401   // PERMILVAR
3402   case Intrinsic::x86_avx_vpermilvar_ps:
3403   case Intrinsic::x86_avx_vpermilvar_ps_256:
3404   case Intrinsic::x86_avx512_vpermilvar_ps_512:
3405   case Intrinsic::x86_avx_vpermilvar_pd:
3406   case Intrinsic::x86_avx_vpermilvar_pd_256:
3407   case Intrinsic::x86_avx512_vpermilvar_pd_512:
3408   // PERMV
3409   case Intrinsic::x86_avx2_permd:
3410   case Intrinsic::x86_avx2_permps: {
3411     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
3412     break;
3413   }
3414
3415   // SSE4A instructions leave the upper 64-bits of the 128-bit result
3416   // in an undefined state.
3417   case Intrinsic::x86_sse4a_extrq:
3418   case Intrinsic::x86_sse4a_extrqi:
3419   case Intrinsic::x86_sse4a_insertq:
3420   case Intrinsic::x86_sse4a_insertqi:
3421     UndefElts.setHighBits(VWidth / 2);
3422     break;
3423   }
3424   return std::nullopt;
3425 }