1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
16 #include "X86TargetTransformInfo.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsX86.h"
19 #include "llvm/Support/KnownBits.h"
20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
21 #include <optional>
23 using namespace llvm;
24 using namespace llvm::PatternMatch;
26 #define DEBUG_TYPE "x86tti"
28 /// Return a constant boolean vector that has true elements in all positions
29 /// where the input constant data vector has an element with the sign bit set.
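/// e.g. <4 x i32> <i32 -1, i32 3, i32 -7, i32 0> --> <4 x i1> <i1 true, i1 false, i1 true, i1 false>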
30 static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
31 VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
32 V = ConstantExpr::getBitCast(V, IntTy);
33 V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,
34 Constant::getNullValue(IntTy), V, DL);
35 assert(V && "Vector must be foldable");
36 return V;
37 }
39 /// Convert the x86 XMM integer vector mask to a vector of bools based on
40 /// each element's most significant bit (the sign bit).
41 static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
42 // Fold Constant Mask.
43 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
44 return getNegativeIsTrueBoolVec(ConstantMask, DL);
46 // Mask was extended from a boolean vector.
47 Value *ExtMask;
48 if (match(Mask, m_SExt(m_Value(ExtMask))) &&
49 ExtMask->getType()->isIntOrIntVectorTy(1))
50 return ExtMask;
52 return nullptr;
53 }
55 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
56 // XMM register mask efficiently, we could transform all x86 masked intrinsics
57 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
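// For illustration (a rough sketch, not taken from the tests): with a mask that is
// sext(<4 x i1> %b), a call such as
//   %v = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %p, <4 x i32> %m)
// becomes
//   %v = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 1, <4 x i1> %b,
//                                                    <4 x float> zeroinitializer)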
58 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
59 Value *Ptr = II.getOperand(0);
60 Value *Mask = II.getOperand(1);
61 Constant *ZeroVec = Constant::getNullValue(II.getType());
63 // Zero Mask - masked load instruction creates a zero vector.
64 if (isa<ConstantAggregateZero>(Mask))
65 return IC.replaceInstUsesWith(II, ZeroVec);
67 // The mask is constant or extended from a bool vector. Convert this x86
68 // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
69 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
70 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
71 // the LLVM intrinsic definition for the pointer argument.
72 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
73 PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
74 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
76 // The pass-through vector for an x86 masked load is a zero vector.
77 CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
78 II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
79 return IC.replaceInstUsesWith(II, NewMaskedLoad);
80 }
82 return nullptr;
83 }
85 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
86 // XMM register mask efficiently, we could transform all x86 masked intrinsics
87 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
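// Sketch of the store case: @llvm.x86.avx.maskstore.ps(ptr %p, <4 x i32> sext(<4 x i1> %b),
// <4 x float> %v) becomes, roughly,
// @llvm.masked.store.v4f32.p0(<4 x float> %v, ptr %p, i32 1, <4 x i1> %b).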
88 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
89 Value *Ptr = II.getOperand(0);
90 Value *Mask = II.getOperand(1);
91 Value *Vec = II.getOperand(2);
93 // Zero Mask - this masked store instruction does nothing.
94 if (isa<ConstantAggregateZero>(Mask)) {
95 IC.eraseInstFromFunction(II);
96 return true;
97 }
99 // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
100 // anything else at this level.
101 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
102 return false;
104 // The mask is constant or extended from a bool vector. Convert this x86
105 // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
106 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
107 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
108 PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
109 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
111 IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
113 // 'Replace uses' doesn't work for stores. Erase the original masked store.
114 IC.eraseInstFromFunction(II);
115 return true;
116 }
118 return false;
119 }
121 static Value *simplifyX86immShift(const IntrinsicInst &II,
122 InstCombiner::BuilderTy &Builder) {
123 bool LogicalShift = false;
124 bool ShiftLeft = false;
125 bool IsImm = false;
127 switch (II.getIntrinsicID()) {
128 default:
129 llvm_unreachable("Unexpected intrinsic!");
130 case Intrinsic::x86_sse2_psrai_d:
131 case Intrinsic::x86_sse2_psrai_w:
132 case Intrinsic::x86_avx2_psrai_d:
133 case Intrinsic::x86_avx2_psrai_w:
134 case Intrinsic::x86_avx512_psrai_q_128:
135 case Intrinsic::x86_avx512_psrai_q_256:
136 case Intrinsic::x86_avx512_psrai_d_512:
137 case Intrinsic::x86_avx512_psrai_q_512:
138 case Intrinsic::x86_avx512_psrai_w_512:
139 IsImm = true;
140 [[fallthrough]];
141 case Intrinsic::x86_sse2_psra_d:
142 case Intrinsic::x86_sse2_psra_w:
143 case Intrinsic::x86_avx2_psra_d:
144 case Intrinsic::x86_avx2_psra_w:
145 case Intrinsic::x86_avx512_psra_q_128:
146 case Intrinsic::x86_avx512_psra_q_256:
147 case Intrinsic::x86_avx512_psra_d_512:
148 case Intrinsic::x86_avx512_psra_q_512:
149 case Intrinsic::x86_avx512_psra_w_512:
150 LogicalShift = false;
151 ShiftLeft = false;
152 break;
153 case Intrinsic::x86_sse2_psrli_d:
154 case Intrinsic::x86_sse2_psrli_q:
155 case Intrinsic::x86_sse2_psrli_w:
156 case Intrinsic::x86_avx2_psrli_d:
157 case Intrinsic::x86_avx2_psrli_q:
158 case Intrinsic::x86_avx2_psrli_w:
159 case Intrinsic::x86_avx512_psrli_d_512:
160 case Intrinsic::x86_avx512_psrli_q_512:
161 case Intrinsic::x86_avx512_psrli_w_512:
162 IsImm = true;
163 [[fallthrough]];
164 case Intrinsic::x86_sse2_psrl_d:
165 case Intrinsic::x86_sse2_psrl_q:
166 case Intrinsic::x86_sse2_psrl_w:
167 case Intrinsic::x86_avx2_psrl_d:
168 case Intrinsic::x86_avx2_psrl_q:
169 case Intrinsic::x86_avx2_psrl_w:
170 case Intrinsic::x86_avx512_psrl_d_512:
171 case Intrinsic::x86_avx512_psrl_q_512:
172 case Intrinsic::x86_avx512_psrl_w_512:
173 LogicalShift = true;
174 ShiftLeft = false;
175 break;
176 case Intrinsic::x86_sse2_pslli_d:
177 case Intrinsic::x86_sse2_pslli_q:
178 case Intrinsic::x86_sse2_pslli_w:
179 case Intrinsic::x86_avx2_pslli_d:
180 case Intrinsic::x86_avx2_pslli_q:
181 case Intrinsic::x86_avx2_pslli_w:
182 case Intrinsic::x86_avx512_pslli_d_512:
183 case Intrinsic::x86_avx512_pslli_q_512:
184 case Intrinsic::x86_avx512_pslli_w_512:
185 IsImm = true;
186 [[fallthrough]];
187 case Intrinsic::x86_sse2_psll_d:
188 case Intrinsic::x86_sse2_psll_q:
189 case Intrinsic::x86_sse2_psll_w:
190 case Intrinsic::x86_avx2_psll_d:
191 case Intrinsic::x86_avx2_psll_q:
192 case Intrinsic::x86_avx2_psll_w:
193 case Intrinsic::x86_avx512_psll_d_512:
194 case Intrinsic::x86_avx512_psll_q_512:
195 case Intrinsic::x86_avx512_psll_w_512:
196 LogicalShift = true;
197 ShiftLeft = true;
198 break;
199 }
200 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
202 Value *Vec = II.getArgOperand(0);
203 Value *Amt = II.getArgOperand(1);
204 auto *VT = cast<FixedVectorType>(Vec->getType());
205 Type *SVT = VT->getElementType();
206 Type *AmtVT = Amt->getType();
207 unsigned VWidth = VT->getNumElements();
208 unsigned BitWidth = SVT->getPrimitiveSizeInBits();
210 // If the shift amount is guaranteed to be in-range we can replace it with a
211 // generic shift. If it's guaranteed to be out of range, logical shifts combine
212 // to zero and arithmetic shifts are clamped to (BitWidth - 1).
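// For example (sketch): a psrai.d shift by a known amount of 3 becomes
//   ashr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>
// while a known amount of 32 or more folds to zero for logical shifts and to an
// ashr by 31 for arithmetic shifts.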
213 if (IsImm) {
214 assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
215 KnownBits KnownAmtBits =
216 llvm::computeKnownBits(Amt, II.getDataLayout());
217 if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
218 Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
219 Amt = Builder.CreateVectorSplat(VWidth, Amt);
220 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
221 : Builder.CreateLShr(Vec, Amt))
222 : Builder.CreateAShr(Vec, Amt));
223 }
224 if (KnownAmtBits.getMinValue().uge(BitWidth)) {
225 if (LogicalShift)
226 return ConstantAggregateZero::get(VT);
227 Amt = ConstantInt::get(SVT, BitWidth - 1);
228 return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
229 }
230 } else {
231 // Ensure the first element has an in-range value and the rest of the
232 // elements in the bottom 64 bits are zero.
233 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
234 cast<VectorType>(AmtVT)->getElementType() == SVT &&
235 "Unexpected shift-by-scalar type");
236 unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
237 APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
238 APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
239 KnownBits KnownLowerBits = llvm::computeKnownBits(
240 Amt, DemandedLower, II.getDataLayout());
241 KnownBits KnownUpperBits = llvm::computeKnownBits(
242 Amt, DemandedUpper, II.getDataLayout());
243 if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
244 (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
245 SmallVector<int, 16> ZeroSplat(VWidth, 0);
246 Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
247 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
248 : Builder.CreateLShr(Vec, Amt))
249 : Builder.CreateAShr(Vec, Amt));
250 }
251 }
253 // Simplify if count is constant vector.
254 auto *CDV = dyn_cast<ConstantDataVector>(Amt);
255 if (!CDV)
256 return nullptr;
258 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
259 // operand to compute the shift amount.
260 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
261 cast<VectorType>(AmtVT)->getElementType() == SVT &&
262 "Unexpected shift-by-scalar type");
264 // Concatenate the sub-elements to create the 64-bit value.
265 APInt Count(64, 0);
266 for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
267 unsigned SubEltIdx = (NumSubElts - 1) - i;
268 auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
269 Count <<= BitWidth;
270 Count |= SubElt->getValue().zextOrTrunc(64);
271 }
273 // If shift-by-zero then just return the original value.
274 if (Count.isZero())
275 return Vec;
277 // Handle cases when Shift >= BitWidth.
278 if (Count.uge(BitWidth)) {
279 // If LogicalShift - just return zero.
280 if (LogicalShift)
281 return ConstantAggregateZero::get(VT);
283 // If ArithmeticShift - clamp Shift to (BitWidth - 1).
284 Count = APInt(64, BitWidth - 1);
285 }
287 // Get a constant vector of the same type as the first operand.
288 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
289 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
291 if (ShiftLeft)
292 return Builder.CreateShl(Vec, ShiftVec);
294 if (LogicalShift)
295 return Builder.CreateLShr(Vec, ShiftVec);
297 return Builder.CreateAShr(Vec, ShiftVec);
298 }
300 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
301 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
302 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
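// For example (sketch):
//   %r = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %x, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
// can become
//   %r = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
// because every per-element amount is known to be below the element bit width.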
303 static Value *simplifyX86varShift(const IntrinsicInst &II,
304 InstCombiner::BuilderTy &Builder) {
305 bool LogicalShift = false;
306 bool ShiftLeft = false;
308 switch (II.getIntrinsicID()) {
309 default:
310 llvm_unreachable("Unexpected intrinsic!");
311 case Intrinsic::x86_avx2_psrav_d:
312 case Intrinsic::x86_avx2_psrav_d_256:
313 case Intrinsic::x86_avx512_psrav_q_128:
314 case Intrinsic::x86_avx512_psrav_q_256:
315 case Intrinsic::x86_avx512_psrav_d_512:
316 case Intrinsic::x86_avx512_psrav_q_512:
317 case Intrinsic::x86_avx512_psrav_w_128:
318 case Intrinsic::x86_avx512_psrav_w_256:
319 case Intrinsic::x86_avx512_psrav_w_512:
320 LogicalShift = false;
321 ShiftLeft = false;
322 break;
323 case Intrinsic::x86_avx2_psrlv_d:
324 case Intrinsic::x86_avx2_psrlv_d_256:
325 case Intrinsic::x86_avx2_psrlv_q:
326 case Intrinsic::x86_avx2_psrlv_q_256:
327 case Intrinsic::x86_avx512_psrlv_d_512:
328 case Intrinsic::x86_avx512_psrlv_q_512:
329 case Intrinsic::x86_avx512_psrlv_w_128:
330 case Intrinsic::x86_avx512_psrlv_w_256:
331 case Intrinsic::x86_avx512_psrlv_w_512:
332 LogicalShift = true;
333 ShiftLeft = false;
334 break;
335 case Intrinsic::x86_avx2_psllv_d:
336 case Intrinsic::x86_avx2_psllv_d_256:
337 case Intrinsic::x86_avx2_psllv_q:
338 case Intrinsic::x86_avx2_psllv_q_256:
339 case Intrinsic::x86_avx512_psllv_d_512:
340 case Intrinsic::x86_avx512_psllv_q_512:
341 case Intrinsic::x86_avx512_psllv_w_128:
342 case Intrinsic::x86_avx512_psllv_w_256:
343 case Intrinsic::x86_avx512_psllv_w_512:
344 LogicalShift = true;
345 ShiftLeft = true;
346 break;
347 }
348 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
350 Value *Vec = II.getArgOperand(0);
351 Value *Amt = II.getArgOperand(1);
352 auto *VT = cast<FixedVectorType>(II.getType());
353 Type *SVT = VT->getElementType();
354 int NumElts = VT->getNumElements();
355 int BitWidth = SVT->getIntegerBitWidth();
357 // If the shift amount is guaranteed to be in-range we can replace it with a
358 // generic shift.
359 KnownBits KnownAmt =
360 llvm::computeKnownBits(Amt, II.getDataLayout());
361 if (KnownAmt.getMaxValue().ult(BitWidth)) {
362 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
363 : Builder.CreateLShr(Vec, Amt))
364 : Builder.CreateAShr(Vec, Amt));
365 }
367 // Simplify if all shift amounts are constant/undef.
368 auto *CShift = dyn_cast<Constant>(Amt);
369 if (!CShift)
370 return nullptr;
372 // Collect each element's shift amount.
373 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
374 bool AnyOutOfRange = false;
375 SmallVector<int, 8> ShiftAmts;
376 for (int I = 0; I < NumElts; ++I) {
377 auto *CElt = CShift->getAggregateElement(I);
378 if (isa_and_nonnull<UndefValue>(CElt)) {
379 ShiftAmts.push_back(-1);
380 continue;
381 }
383 auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
384 if (!COp)
385 return nullptr;
387 // Handle out of range shifts.
388 // If LogicalShift - set to BitWidth (special case).
389 // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
390 APInt ShiftVal = COp->getValue();
391 if (ShiftVal.uge(BitWidth)) {
392 AnyOutOfRange = LogicalShift;
393 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
394 continue;
395 }
397 ShiftAmts.push_back((int)ShiftVal.getZExtValue());
398 }
400 // If all elements out of range or UNDEF, return vector of zeros/undefs.
401 // ArithmeticShift should only hit this if they are all UNDEF.
402 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
403 if (llvm::all_of(ShiftAmts, OutOfRange)) {
404 SmallVector<Constant *, 8> ConstantVec;
405 for (int Idx : ShiftAmts) {
406 if (Idx < 0) {
407 ConstantVec.push_back(UndefValue::get(SVT));
408 } else {
409 assert(LogicalShift && "Logical shift expected");
410 ConstantVec.push_back(ConstantInt::getNullValue(SVT));
411 }
412 }
413 return ConstantVector::get(ConstantVec);
414 }
416 // We can't handle only some out of range values with generic logical shifts.
417 if (AnyOutOfRange)
418 return nullptr;
420 // Build the shift amount constant vector.
421 SmallVector<Constant *, 8> ShiftVecAmts;
422 for (int Idx : ShiftAmts) {
423 if (Idx < 0)
424 ShiftVecAmts.push_back(UndefValue::get(SVT));
425 else
426 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
427 }
428 auto ShiftVec = ConstantVector::get(ShiftVecAmts);
430 if (ShiftLeft)
431 return Builder.CreateShl(Vec, ShiftVec);
433 if (LogicalShift)
434 return Builder.CreateLShr(Vec, ShiftVec);
436 return Builder.CreateAShr(Vec, ShiftVec);
437 }
439 static Value *simplifyX86pack(IntrinsicInst &II,
440 InstCombiner::BuilderTy &Builder, bool IsSigned) {
441 Value *Arg0 = II.getArgOperand(0);
442 Value *Arg1 = II.getArgOperand(1);
443 Type *ResTy = II.getType();
445 // Fast all undef handling.
446 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
447 return UndefValue::get(ResTy);
449 auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
450 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
451 unsigned NumSrcElts = ArgTy->getNumElements();
452 assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
453 "Unexpected packing types");
455 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
456 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
457 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
458 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
459 "Unexpected packing types");
461 // Constant folding.
462 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
463 return nullptr;
465 // Clamp Values - signed/unsigned both use signed clamp values, but they
466 // differ on the min/max values.
467 APInt MinValue, MaxValue;
468 if (IsSigned) {
469 // PACKSS: Truncate signed value with signed saturation.
470 // Source values less than dst minint are saturated to minint.
471 // Source values greater than dst maxint are saturated to maxint.
472 MinValue =
473 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
474 MaxValue =
475 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
476 } else {
477 // PACKUS: Truncate signed value with unsigned saturation.
478 // Source values less than zero are saturated to zero.
479 // Source values greater than dst maxuint are saturated to maxuint.
480 MinValue = APInt::getZero(SrcScalarSizeInBits);
481 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
482 }
484 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
485 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
486 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
487 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
488 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
489 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
491 // Shuffle clamped args together at the lane level.
492 SmallVector<int, 32> PackMask;
493 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
494 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
495 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
496 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
497 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
498 }
499 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
501 // Truncate to dst size.
502 return Builder.CreateTrunc(Shuffle, ResTy);
503 }
505 static Value *simplifyX86pmulh(IntrinsicInst &II,
506 InstCombiner::BuilderTy &Builder, bool IsSigned,
507 bool IsRounding) {
508 Value *Arg0 = II.getArgOperand(0);
509 Value *Arg1 = II.getArgOperand(1);
510 auto *ResTy = cast<FixedVectorType>(II.getType());
511 auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
512 assert(ArgTy == ResTy && ResTy->getScalarSizeInBits() == 16 &&
513 "Unexpected PMULH types");
514 assert((!IsRounding || IsSigned) && "PMULHRS instruction must be signed");
516 // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
517 if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
518 return ConstantAggregateZero::get(ResTy);
520 // Multiply by zero.
521 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
522 return ConstantAggregateZero::get(ResTy);
524 // Multiply by one.
525 if (!IsRounding) {
526 if (match(Arg0, m_One()))
527 return IsSigned ? Builder.CreateAShr(Arg1, 15)
528 : ConstantAggregateZero::get(ResTy);
529 if (match(Arg1, m_One()))
530 return IsSigned ? Builder.CreateAShr(Arg0, 15)
531 : ConstantAggregateZero::get(ResTy);
532 }
534 // Constant folding.
535 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
536 return nullptr;
538 // Extend to twice the width and multiply.
539 auto Cast =
540 IsSigned ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
541 auto *ExtTy = FixedVectorType::getExtendedElementVectorType(ArgTy);
542 Value *LHS = Builder.CreateCast(Cast, Arg0, ExtTy);
543 Value *RHS = Builder.CreateCast(Cast, Arg1, ExtTy);
544 Value *Mul = Builder.CreateMul(LHS, RHS);
546 if (IsRounding) {
547 // PMULHRSW: truncate to vXi18 of the most significant bits, add one and
548 // extract bits[16:1].
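// i.e. per lane (sketch): Res = trunc i16 ((((sext(a) * sext(b)) >> 14) + 1) >> 1).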
549 auto *RndEltTy = IntegerType::get(ExtTy->getContext(), 18);
550 auto *RndTy = FixedVectorType::get(RndEltTy, ExtTy);
551 Mul = Builder.CreateLShr(Mul, 14);
552 Mul = Builder.CreateTrunc(Mul, RndTy);
553 Mul = Builder.CreateAdd(Mul, ConstantInt::get(RndTy, 1));
554 Mul = Builder.CreateLShr(Mul, 1);
555 } else {
556 // PMULH/PMULHU: extract the vXi16 most significant bits.
557 Mul = Builder.CreateLShr(Mul, 16);
558 }
560 return Builder.CreateTrunc(Mul, ResTy);
561 }
563 static Value *simplifyX86pmadd(IntrinsicInst &II,
564 InstCombiner::BuilderTy &Builder,
565 bool IsPMADDWD) {
566 Value *Arg0 = II.getArgOperand(0);
567 Value *Arg1 = II.getArgOperand(1);
568 auto *ResTy = cast<FixedVectorType>(II.getType());
569 [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
571 unsigned NumDstElts = ResTy->getNumElements();
572 assert(ArgTy->getNumElements() == (2 * NumDstElts) &&
573 ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) &&
574 "Unexpected PMADD types");
576 // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
577 if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
578 return ConstantAggregateZero::get(ResTy);
580 // Multiply by zero.
581 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
582 return ConstantAggregateZero::get(ResTy);
584 // Constant folding.
585 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
586 return nullptr;
588 // Split Lo/Hi elements pairs, extend and add together.
589 // PMADDWD(X,Y) =
590 // add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1])))
591 // PMADDUBSW(X,Y) =
592 // sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1])))
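// e.g. (sketch) for pmadd.wd on <8 x i16> inputs, result element 0 is
// sext(a0)*sext(b0) + sext(a1)*sext(b1) computed in i32, and likewise for each
// adjacent pair of source elements.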
593 SmallVector<int> LoMask, HiMask;
594 for (unsigned I = 0; I != NumDstElts; ++I) {
595 LoMask.push_back(2 * I + 0);
596 HiMask.push_back(2 * I + 1);
597 }
599 auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask);
600 auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask);
601 auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask);
602 auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask);
604 auto LHSCast =
605 IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
606 LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy);
607 LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy);
608 RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy);
609 RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy);
610 Value *Lo = Builder.CreateMul(LHSLo, RHSLo);
611 Value *Hi = Builder.CreateMul(LHSHi, RHSHi);
612 return IsPMADDWD
613 ? Builder.CreateAdd(Lo, Hi)
614 : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});
615 }
617 static Value *simplifyX86movmsk(const IntrinsicInst &II,
618 InstCombiner::BuilderTy &Builder) {
619 Value *Arg = II.getArgOperand(0);
620 Type *ResTy = II.getType();
622 // movmsk(undef) -> zero as we must ensure the upper bits are zero.
623 if (isa<UndefValue>(Arg))
624 return Constant::getNullValue(ResTy);
626 // Preserve previous behavior and give up.
627 // TODO: treat as <8 x i8>.
628 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb)
629 return nullptr;
631 auto *ArgTy = cast<FixedVectorType>(Arg->getType());
633 // Expand MOVMSK to compare/bitcast/zext:
634 // e.g. PMOVMSKB(v16i8 x):
635 // %cmp = icmp slt <16 x i8> %x, zeroinitializer
636 // %int = bitcast <16 x i1> %cmp to i16
637 // %res = zext i16 %int to i32
638 unsigned NumElts = ArgTy->getNumElements();
639 Type *IntegerTy = Builder.getIntNTy(NumElts);
641 Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
642 Res = Builder.CreateIsNeg(Res);
643 Res = Builder.CreateBitCast(Res, IntegerTy);
644 Res = Builder.CreateZExtOrTrunc(Res, ResTy);
645 return Res;
646 }
648 static Value *simplifyX86addcarry(const IntrinsicInst &II,
649 InstCombiner::BuilderTy &Builder) {
650 Value *CarryIn = II.getArgOperand(0);
651 Value *Op1 = II.getArgOperand(1);
652 Value *Op2 = II.getArgOperand(2);
653 Type *RetTy = II.getType();
654 Type *OpTy = Op1->getType();
655 assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
656 RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
657 "Unexpected types for x86 addcarry");
659 // If carry-in is zero, this is just an unsigned add with overflow.
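// e.g. (sketch) @llvm.x86.addcarry.32(i8 0, i32 %a, i32 %b) becomes a call to
// @llvm.uadd.with.overflow.i32(i32 %a, i32 %b), with the carry bit zero-extended
// to i8 and the pair re-packed as { i8, i32 }.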
660 if (match(CarryIn, m_ZeroInt())) {
661 Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
662 {Op1, Op2});
663 // The types have to be adjusted to match the x86 call types.
664 Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
665 Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
666 Builder.getInt8Ty());
667 Value *Res = PoisonValue::get(RetTy);
668 Res = Builder.CreateInsertValue(Res, UAddOV, 0);
669 return Builder.CreateInsertValue(Res, UAddResult, 1);
670 }
672 return nullptr;
673 }
675 static Value *simplifyTernarylogic(const IntrinsicInst &II,
676 InstCombiner::BuilderTy &Builder) {
678 auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
679 if (!ArgImm || ArgImm->getValue().uge(256))
680 return nullptr;
682 Value *ArgA = II.getArgOperand(0);
683 Value *ArgB = II.getArgOperand(1);
684 Value *ArgC = II.getArgOperand(2);
686 Type *Ty = II.getType();
688 auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
689 return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
690 };
691 auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
692 return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
693 };
694 auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
695 return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
696 };
697 auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
698 return {Builder.CreateNot(V.first), ~V.second};
699 };
700 auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
701 auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
702 auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };
704 bool AIsConst = match(ArgA, m_ImmConstant());
705 bool BIsConst = match(ArgB, m_ImmConstant());
706 bool CIsConst = match(ArgC, m_ImmConstant());
708 bool ABIsConst = AIsConst && BIsConst;
709 bool ACIsConst = AIsConst && CIsConst;
710 bool BCIsConst = BIsConst && CIsConst;
711 bool ABCIsConst = AIsConst && BIsConst && CIsConst;
713 // Use for verification. It's a big table. It's difficult to go from Imm ->
714 // logic ops, but easy to verify that a set of logic ops is correct. We track
715 // the logic ops through the second value in the pair. At the end it should
716 // equal Imm.
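// e.g. Imm == 0x80 is A & B & C, since 0xf0 & 0xcc & 0xaa == 0x80.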
717 std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
718 std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
719 std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
720 std::pair<Value *, uint8_t> Res = {nullptr, 0};
722 // Currently we only handle cases that convert directly to another instruction
723 // or cases where all the ops are constant. This is because we don't properly
724 // handle creating ternary ops in the backend, so splitting them here may
725 // cause regressions. As the backend improves, uncomment more cases.
727 uint8_t Imm = ArgImm->getValue().getZExtValue();
728 switch (Imm) {
729 case 0x0:
730 Res = {Constant::getNullValue(Ty), 0};
731 break;
732 case 0x1:
733 if (ABCIsConst)
734 Res = Nor(Or(A, B), C);
735 break;
736 case 0x2:
737 if (ABCIsConst)
738 Res = And(Nor(A, B), C);
739 break;
740 case 0x3:
741 if (ABIsConst)
742 Res = Nor(A, B);
743 break;
744 case 0x4:
745 if (ABCIsConst)
746 Res = And(Nor(A, C), B);
747 break;
748 case 0x5:
749 if (ACIsConst)
750 Res = Nor(A, C);
751 break;
752 case 0x6:
753 if (ABCIsConst)
754 Res = Nor(A, Xnor(B, C));
755 break;
756 case 0x7:
757 if (ABCIsConst)
758 Res = Nor(A, And(B, C));
759 break;
760 case 0x8:
761 if (ABCIsConst)
762 Res = Nor(A, Nand(B, C));
763 break;
764 case 0x9:
765 if (ABCIsConst)
766 Res = Nor(A, Xor(B, C));
767 break;
768 case 0xa:
769 if (ACIsConst)
770 Res = Nor(A, Not(C));
771 break;
772 case 0xb:
773 if (ABCIsConst)
774 Res = Nor(A, Nor(C, Not(B)));
775 break;
776 case 0xc:
777 if (ABIsConst)
778 Res = Nor(A, Not(B));
779 break;
780 case 0xd:
781 if (ABCIsConst)
782 Res = Nor(A, Nor(B, Not(C)));
783 break;
784 case 0xe:
785 if (ABCIsConst)
786 Res = Nor(A, Nor(B, C));
787 break;
788 case 0xf:
789 Res = Not(A);
790 break;
791 case 0x10:
792 if (ABCIsConst)
793 Res = And(A, Nor(B, C));
794 break;
795 case 0x11:
796 if (BCIsConst)
797 Res = Nor(B, C);
798 break;
799 case 0x12:
800 if (ABCIsConst)
801 Res = Nor(Xnor(A, C), B);
802 break;
803 case 0x13:
804 if (ABCIsConst)
805 Res = Nor(And(A, C), B);
806 break;
807 case 0x14:
808 if (ABCIsConst)
809 Res = Nor(Xnor(A, B), C);
810 break;
811 case 0x15:
812 if (ABCIsConst)
813 Res = Nor(And(A, B), C);
814 break;
815 case 0x16:
816 if (ABCIsConst)
817 Res = Xor(Xor(A, B), And(Nand(A, B), C));
818 break;
819 case 0x17:
820 if (ABCIsConst)
821 Res = Xor(Or(A, B), Or(Xnor(A, B), C));
822 break;
823 case 0x18:
824 if (ABCIsConst)
825 Res = Nor(Xnor(A, B), Xnor(A, C));
826 break;
827 case 0x19:
828 if (ABCIsConst)
829 Res = And(Nand(A, B), Xnor(B, C));
830 break;
831 case 0x1a:
832 if (ABCIsConst)
833 Res = Xor(A, Or(And(A, B), C));
834 break;
835 case 0x1b:
836 if (ABCIsConst)
837 Res = Xor(A, Or(Xnor(A, B), C));
838 break;
839 case 0x1c:
840 if (ABCIsConst)
841 Res = Xor(A, Or(And(A, C), B));
842 break;
843 case 0x1d:
844 if (ABCIsConst)
845 Res = Xor(A, Or(Xnor(A, C), B));
846 break;
847 case 0x1e:
848 if (ABCIsConst)
849 Res = Xor(A, Or(B, C));
850 break;
851 case 0x1f:
852 if (ABCIsConst)
853 Res = Nand(A, Or(B, C));
854 break;
855 case 0x20:
856 if (ABCIsConst)
857 Res = Nor(Nand(A, C), B);
858 break;
859 case 0x21:
860 if (ABCIsConst)
861 Res = Nor(Xor(A, C), B);
862 break;
863 case 0x22:
864 if (BCIsConst)
865 Res = Nor(B, Not(C));
866 break;
867 case 0x23:
868 if (ABCIsConst)
869 Res = Nor(B, Nor(C, Not(A)));
870 break;
871 case 0x24:
872 if (ABCIsConst)
873 Res = Nor(Xnor(A, B), Xor(A, C));
874 break;
875 case 0x25:
876 if (ABCIsConst)
877 Res = Xor(A, Nand(Nand(A, B), C));
878 break;
879 case 0x26:
880 if (ABCIsConst)
881 Res = And(Nand(A, B), Xor(B, C));
882 break;
883 case 0x27:
884 if (ABCIsConst)
885 Res = Xor(Or(Xnor(A, B), C), B);
886 break;
887 case 0x28:
888 if (ABCIsConst)
889 Res = And(Xor(A, B), C);
890 break;
891 case 0x29:
892 if (ABCIsConst)
893 Res = Xor(Xor(A, B), Nor(And(A, B), C));
894 break;
895 case 0x2a:
896 if (ABCIsConst)
897 Res = And(Nand(A, B), C);
898 break;
899 case 0x2b:
900 if (ABCIsConst)
901 Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
902 break;
903 case 0x2c:
904 if (ABCIsConst)
905 Res = Nor(Xnor(A, B), Nor(B, C));
906 break;
907 case 0x2d:
908 if (ABCIsConst)
909 Res = Xor(A, Or(B, Not(C)));
910 break;
911 case 0x2e:
912 if (ABCIsConst)
913 Res = Xor(A, Or(Xor(A, C), B));
914 break;
915 case 0x2f:
916 if (ABCIsConst)
917 Res = Nand(A, Or(B, Not(C)));
918 break;
919 case 0x30:
920 if (ABIsConst)
921 Res = Nor(B, Not(A));
922 break;
923 case 0x31:
924 if (ABCIsConst)
925 Res = Nor(Nor(A, Not(C)), B);
926 break;
927 case 0x32:
928 if (ABCIsConst)
929 Res = Nor(Nor(A, C), B);
930 break;
931 case 0x33:
932 Res = Not(B);
933 break;
934 case 0x34:
935 if (ABCIsConst)
936 Res = And(Xor(A, B), Nand(B, C));
937 break;
938 case 0x35:
939 if (ABCIsConst)
940 Res = Xor(B, Or(A, Xnor(B, C)));
941 break;
942 case 0x36:
943 if (ABCIsConst)
944 Res = Xor(Or(A, C), B);
945 break;
946 case 0x37:
947 if (ABCIsConst)
948 Res = Nand(Or(A, C), B);
949 break;
950 case 0x38:
951 if (ABCIsConst)
952 Res = Nor(Xnor(A, B), Nor(A, C));
953 break;
954 case 0x39:
955 if (ABCIsConst)
956 Res = Xor(Or(A, Not(C)), B);
957 break;
958 case 0x3a:
959 if (ABCIsConst)
960 Res = Xor(B, Or(A, Xor(B, C)));
961 break;
962 case 0x3b:
963 if (ABCIsConst)
964 Res = Nand(Or(A, Not(C)), B);
965 break;
966 case 0x3c:
967 Res = Xor(A, B);
968 break;
969 case 0x3d:
970 if (ABCIsConst)
971 Res = Xor(A, Or(Nor(A, C), B));
972 break;
973 case 0x3e:
974 if (ABCIsConst)
975 Res = Xor(A, Or(Nor(A, Not(C)), B));
976 break;
977 case 0x3f:
978 if (ABIsConst)
979 Res = Nand(A, B);
980 break;
981 case 0x40:
982 if (ABCIsConst)
983 Res = Nor(Nand(A, B), C);
984 break;
985 case 0x41:
986 if (ABCIsConst)
987 Res = Nor(Xor(A, B), C);
988 break;
989 case 0x42:
990 if (ABCIsConst)
991 Res = Nor(Xor(A, B), Xnor(A, C));
992 break;
993 case 0x43:
994 if (ABCIsConst)
995 Res = Xor(A, Nand(Nand(A, C), B));
996 break;
997 case 0x44:
998 if (BCIsConst)
999 Res = Nor(C, Not(B));
1000 break;
1001 case 0x45:
1002 if (ABCIsConst)
1003 Res = Nor(Nor(B, Not(A)), C);
1004 break;
1005 case 0x46:
1006 if (ABCIsConst)
1007 Res = Xor(Or(And(A, C), B), C);
1008 break;
1009 case 0x47:
1010 if (ABCIsConst)
1011 Res = Xor(Or(Xnor(A, C), B), C);
1012 break;
1013 case 0x48:
1014 if (ABCIsConst)
1015 Res = And(Xor(A, C), B);
1016 break;
1017 case 0x49:
1018 if (ABCIsConst)
1019 Res = Xor(Or(Xnor(A, B), And(A, C)), C);
1020 break;
1021 case 0x4a:
1022 if (ABCIsConst)
1023 Res = Nor(Xnor(A, C), Nor(B, C));
1024 break;
1025 case 0x4b:
1026 if (ABCIsConst)
1027 Res = Xor(A, Or(C, Not(B)));
1028 break;
1029 case 0x4c:
1030 if (ABCIsConst)
1031 Res = And(Nand(A, C), B);
1032 break;
1033 case 0x4d:
1034 if (ABCIsConst)
1035 Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
1036 break;
1037 case 0x4e:
1038 if (ABCIsConst)
1039 Res = Xor(A, Or(Xor(A, B), C));
1040 break;
1041 case 0x4f:
1042 if (ABCIsConst)
1043 Res = Nand(A, Nand(B, Not(C)));
1044 break;
1045 case 0x50:
1046 if (ACIsConst)
1047 Res = Nor(C, Not(A));
1048 break;
1049 case 0x51:
1050 if (ABCIsConst)
1051 Res = Nor(Nor(A, Not(B)), C);
1052 break;
1053 case 0x52:
1054 if (ABCIsConst)
1055 Res = And(Xor(A, C), Nand(B, C));
1056 break;
1057 case 0x53:
1058 if (ABCIsConst)
1059 Res = Xor(Or(Xnor(B, C), A), C);
1060 break;
1061 case 0x54:
1062 if (ABCIsConst)
1063 Res = Nor(Nor(A, B), C);
1064 break;
1065 case 0x55:
1066 Res = Not(C);
1067 break;
1068 case 0x56:
1069 if (ABCIsConst)
1070 Res = Xor(Or(A, B), C);
1071 break;
1072 case 0x57:
1073 if (ABCIsConst)
1074 Res = Nand(Or(A, B), C);
1075 break;
1076 case 0x58:
1077 if (ABCIsConst)
1078 Res = Nor(Nor(A, B), Xnor(A, C));
1079 break;
1080 case 0x59:
1081 if (ABCIsConst)
1082 Res = Xor(Or(A, Not(B)), C);
1083 break;
1084 case 0x5a:
1085 Res = Xor(A, C);
1086 break;
1087 case 0x5b:
1088 if (ABCIsConst)
1089 Res = Xor(A, Or(Nor(A, B), C));
1090 break;
1091 case 0x5c:
1092 if (ABCIsConst)
1093 Res = Xor(Or(Xor(B, C), A), C);
1094 break;
1095 case 0x5d:
1096 if (ABCIsConst)
1097 Res = Nand(Or(A, Not(B)), C);
1098 break;
1099 case 0x5e:
1100 if (ABCIsConst)
1101 Res = Xor(A, Or(Nor(A, Not(B)), C));
1102 break;
1103 case 0x5f:
1104 if (ACIsConst)
1105 Res = Nand(A, C);
1106 break;
1107 case 0x60:
1108 if (ABCIsConst)
1109 Res = And(A, Xor(B, C));
1110 break;
1111 case 0x61:
1112 if (ABCIsConst)
1113 Res = Xor(Or(Xnor(A, B), And(B, C)), C);
1114 break;
1115 case 0x62:
1116 if (ABCIsConst)
1117 Res = Nor(Nor(A, C), Xnor(B, C));
1118 break;
1119 case 0x63:
1120 if (ABCIsConst)
1121 Res = Xor(B, Or(C, Not(A)));
1122 break;
1123 case 0x64:
1124 if (ABCIsConst)
1125 Res = Nor(Nor(A, B), Xnor(B, C));
1126 break;
1127 case 0x65:
1128 if (ABCIsConst)
1129 Res = Xor(Or(B, Not(A)), C);
1130 break;
1131 case 0x66:
1132 Res = Xor(B, C);
1133 break;
1134 case 0x67:
1135 if (ABCIsConst)
1136 Res = Or(Nor(A, B), Xor(B, C));
1137 break;
1138 case 0x68:
1139 if (ABCIsConst)
1140 Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
1141 break;
1142 case 0x69:
1143 if (ABCIsConst)
1144 Res = Xor(Xnor(A, B), C);
1145 break;
1146 case 0x6a:
1147 if (ABCIsConst)
1148 Res = Xor(And(A, B), C);
1149 break;
1150 case 0x6b:
1151 if (ABCIsConst)
1152 Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
1153 break;
1154 case 0x6c:
1155 if (ABCIsConst)
1156 Res = Xor(And(A, C), B);
1157 break;
1158 case 0x6d:
1159 if (ABCIsConst)
1160 Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
1161 break;
1162 case 0x6e:
1163 if (ABCIsConst)
1164 Res = Or(Nor(A, Not(B)), Xor(B, C));
1165 break;
1166 case 0x6f:
1167 if (ABCIsConst)
1168 Res = Nand(A, Xnor(B, C));
1169 break;
1170 case 0x70:
1171 if (ABCIsConst)
1172 Res = And(A, Nand(B, C));
1173 break;
1174 case 0x71:
1175 if (ABCIsConst)
1176 Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
1177 break;
1178 case 0x72:
1179 if (ABCIsConst)
1180 Res = Xor(Or(Xor(A, B), C), B);
1181 break;
1182 case 0x73:
1183 if (ABCIsConst)
1184 Res = Nand(Nand(A, Not(C)), B);
1185 break;
1186 case 0x74:
1187 if (ABCIsConst)
1188 Res = Xor(Or(Xor(A, C), B), C);
1189 break;
1190 case 0x75:
1191 if (ABCIsConst)
1192 Res = Nand(Nand(A, Not(B)), C);
1193 break;
1194 case 0x76:
1195 if (ABCIsConst)
1196 Res = Xor(B, Or(Nor(B, Not(A)), C));
1197 break;
1198 case 0x77:
1199 if (BCIsConst)
1200 Res = Nand(B, C);
1201 break;
1202 case 0x78:
1203 if (ABCIsConst)
1204 Res = Xor(A, And(B, C));
1205 break;
1206 case 0x79:
1207 if (ABCIsConst)
1208 Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
1209 break;
1210 case 0x7a:
1211 if (ABCIsConst)
1212 Res = Or(Xor(A, C), Nor(B, Not(A)));
1213 break;
1214 case 0x7b:
1215 if (ABCIsConst)
1216 Res = Nand(Xnor(A, C), B);
1217 break;
1218 case 0x7c:
1219 if (ABCIsConst)
1220 Res = Or(Xor(A, B), Nor(C, Not(A)));
1221 break;
1222 case 0x7d:
1223 if (ABCIsConst)
1224 Res = Nand(Xnor(A, B), C);
1225 break;
1226 case 0x7e:
1227 if (ABCIsConst)
1228 Res = Or(Xor(A, B), Xor(A, C));
1229 break;
1230 case 0x7f:
1231 if (ABCIsConst)
1232 Res = Nand(And(A, B), C);
1233 break;
1234 case 0x80:
1235 if (ABCIsConst)
1236 Res = And(And(A, B), C);
1237 break;
1238 case 0x81:
1239 if (ABCIsConst)
1240 Res = Nor(Xor(A, B), Xor(A, C));
1241 break;
1242 case 0x82:
1243 if (ABCIsConst)
1244 Res = And(Xnor(A, B), C);
1245 break;
1246 case 0x83:
1247 if (ABCIsConst)
1248 Res = Nor(Xor(A, B), Nor(C, Not(A)));
1249 break;
1250 case 0x84:
1251 if (ABCIsConst)
1252 Res = And(Xnor(A, C), B);
1253 break;
1254 case 0x85:
1255 if (ABCIsConst)
1256 Res = Nor(Xor(A, C), Nor(B, Not(A)));
1257 break;
1258 case 0x86:
1259 if (ABCIsConst)
1260 Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
1261 break;
1262 case 0x87:
1263 if (ABCIsConst)
1264 Res = Xor(A, Nand(B, C));
1265 break;
1266 case 0x88:
1267 Res = And(B, C);
1268 break;
1269 case 0x89:
1270 if (ABCIsConst)
1271 Res = Xor(B, Nor(Nor(B, Not(A)), C));
1272 break;
1273 case 0x8a:
1274 if (ABCIsConst)
1275 Res = And(Nand(A, Not(B)), C);
1276 break;
1277 case 0x8b:
1278 if (ABCIsConst)
1279 Res = Xor(Nor(Xor(A, C), B), C);
1280 break;
1281 case 0x8c:
1282 if (ABCIsConst)
1283 Res = And(Nand(A, Not(C)), B);
1284 break;
1285 case 0x8d:
1286 if (ABCIsConst)
1287 Res = Xor(Nor(Xor(A, B), C), B);
1288 break;
1289 case 0x8e:
1290 if (ABCIsConst)
1291 Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
1292 break;
1293 case 0x8f:
1294 if (ABCIsConst)
1295 Res = Nand(A, Nand(B, C));
1296 break;
1297 case 0x90:
1298 if (ABCIsConst)
1299 Res = And(A, Xnor(B, C));
1300 break;
1301 case 0x91:
1302 if (ABCIsConst)
1303 Res = Nor(Nor(A, Not(B)), Xor(B, C));
1304 break;
1305 case 0x92:
1306 if (ABCIsConst)
1307 Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
1308 break;
1309 case 0x93:
1310 if (ABCIsConst)
1311 Res = Xor(Nand(A, C), B);
1312 break;
1313 case 0x94:
1314 if (ABCIsConst)
1315 Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
1316 break;
1317 case 0x95:
1318 if (ABCIsConst)
1319 Res = Xor(Nand(A, B), C);
1320 break;
1321 case 0x96:
1322 if (ABCIsConst)
1323 Res = Xor(Xor(A, B), C);
1324 break;
1325 case 0x97:
1326 if (ABCIsConst)
1327 Res = Xor(Xor(A, B), Or(Nor(A, B), C));
1328 break;
1329 case 0x98:
1330 if (ABCIsConst)
1331 Res = Nor(Nor(A, B), Xor(B, C));
1332 break;
1333 case 0x99:
1334 if (BCIsConst)
1335 Res = Xnor(B, C);
1336 break;
1337 case 0x9a:
1338 if (ABCIsConst)
1339 Res = Xor(Nor(B, Not(A)), C);
1340 break;
1341 case 0x9b:
1342 if (ABCIsConst)
1343 Res = Or(Nor(A, B), Xnor(B, C));
1344 break;
1345 case 0x9c:
1346 if (ABCIsConst)
1347 Res = Xor(B, Nor(C, Not(A)));
1348 break;
1349 case 0x9d:
1350 if (ABCIsConst)
1351 Res = Or(Nor(A, C), Xnor(B, C));
1352 break;
1353 case 0x9e:
1354 if (ABCIsConst)
1355 Res = Xor(And(Xor(A, B), Nand(B, C)), C);
1356 break;
1357 case 0x9f:
1358 if (ABCIsConst)
1359 Res = Nand(A, Xor(B, C));
1360 break;
1361 case 0xa0:
1362 Res = And(A, C);
1363 break;
1364 case 0xa1:
1365 if (ABCIsConst)
1366 Res = Xor(A, Nor(Nor(A, Not(B)), C));
1367 break;
1368 case 0xa2:
1369 if (ABCIsConst)
1370 Res = And(Or(A, Not(B)), C);
1371 break;
1372 case 0xa3:
1373 if (ABCIsConst)
1374 Res = Xor(Nor(Xor(B, C), A), C);
1375 break;
1376 case 0xa4:
1377 if (ABCIsConst)
1378 Res = Xor(A, Nor(Nor(A, B), C));
1379 break;
1380 case 0xa5:
1381 if (ACIsConst)
1382 Res = Xnor(A, C);
1383 break;
1384 case 0xa6:
1385 if (ABCIsConst)
1386 Res = Xor(Nor(A, Not(B)), C);
1387 break;
1388 case 0xa7:
1389 if (ABCIsConst)
1390 Res = Or(Nor(A, B), Xnor(A, C));
1391 break;
1392 case 0xa8:
1393 if (ABCIsConst)
1394 Res = And(Or(A, B), C);
1395 break;
1396 case 0xa9:
1397 if (ABCIsConst)
1398 Res = Xor(Nor(A, B), C);
1399 break;
1400 case 0xaa:
1401 Res = C;
1402 break;
1403 case 0xab:
1404 if (ABCIsConst)
1405 Res = Or(Nor(A, B), C);
1406 break;
1407 case 0xac:
1408 if (ABCIsConst)
1409 Res = Xor(Nor(Xnor(B, C), A), C);
1410 break;
1411 case 0xad:
1412 if (ABCIsConst)
1413 Res = Or(Xnor(A, C), And(B, C));
1414 break;
1415 case 0xae:
1416 if (ABCIsConst)
1417 Res = Or(Nor(A, Not(B)), C);
1418 break;
1419 case 0xaf:
1420 if (ACIsConst)
1421 Res = Or(C, Not(A));
1422 break;
1423 case 0xb0:
1424 if (ABCIsConst)
1425 Res = And(A, Nand(B, Not(C)));
1426 break;
1427 case 0xb1:
1428 if (ABCIsConst)
1429 Res = Xor(A, Nor(Xor(A, B), C));
1430 break;
1431 case 0xb2:
1432 if (ABCIsConst)
1433 Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
1434 break;
1435 case 0xb3:
1436 if (ABCIsConst)
1437 Res = Nand(Nand(A, C), B);
1438 break;
1439 case 0xb4:
1440 if (ABCIsConst)
1441 Res = Xor(A, Nor(C, Not(B)));
1442 break;
1443 case 0xb5:
1444 if (ABCIsConst)
1445 Res = Or(Xnor(A, C), Nor(B, C));
1446 break;
1447 case 0xb6:
1448 if (ABCIsConst)
1449 Res = Xor(And(Xor(A, B), Nand(A, C)), C);
1450 break;
1451 case 0xb7:
1452 if (ABCIsConst)
1453 Res = Nand(Xor(A, C), B);
1454 break;
1455 case 0xb8:
1456 if (ABCIsConst)
1457 Res = Xor(Nor(Xnor(A, C), B), C);
1458 break;
1459 case 0xb9:
1460 if (ABCIsConst)
1461 Res = Xor(Nor(And(A, C), B), C);
1462 break;
1463 case 0xba:
1464 if (ABCIsConst)
1465 Res = Or(Nor(B, Not(A)), C);
1466 break;
1467 case 0xbb:
1468 if (BCIsConst)
1469 Res = Or(C, Not(B));
1470 break;
1471 case 0xbc:
1472 if (ABCIsConst)
1473 Res = Xor(A, And(Nand(A, C), B));
1474 break;
1475 case 0xbd:
1476 if (ABCIsConst)
1477 Res = Or(Xor(A, B), Xnor(A, C));
1478 break;
1479 case 0xbe:
1480 if (ABCIsConst)
1481 Res = Or(Xor(A, B), C);
1482 break;
1483 case 0xbf:
1484 if (ABCIsConst)
1485 Res = Or(Nand(A, B), C);
1486 break;
1487 case 0xc0:
1488 Res = And(A, B);
1489 break;
1490 case 0xc1:
1491 if (ABCIsConst)
1492 Res = Xor(A, Nor(Nor(A, Not(C)), B));
1493 break;
1494 case 0xc2:
1495 if (ABCIsConst)
1496 Res = Xor(A, Nor(Nor(A, C), B));
1497 break;
1498 case 0xc3:
1499 if (ABIsConst)
1500 Res = Xnor(A, B);
1501 break;
1502 case 0xc4:
1503 if (ABCIsConst)
1504 Res = And(Or(A, Not(C)), B);
1505 break;
1506 case 0xc5:
1507 if (ABCIsConst)
1508 Res = Xor(B, Nor(A, Xor(B, C)));
1509 break;
1510 case 0xc6:
1511 if (ABCIsConst)
1512 Res = Xor(Nor(A, Not(C)), B);
1513 break;
1514 case 0xc7:
1515 if (ABCIsConst)
1516 Res = Or(Xnor(A, B), Nor(A, C));
1517 break;
1518 case 0xc8:
1519 if (ABCIsConst)
1520 Res = And(Or(A, C), B);
1521 break;
1522 case 0xc9:
1523 if (ABCIsConst)
1524 Res = Xor(Nor(A, C), B);
1525 break;
1526 case 0xca:
1527 if (ABCIsConst)
1528 Res = Xor(B, Nor(A, Xnor(B, C)));
1529 break;
1530 case 0xcb:
1531 if (ABCIsConst)
1532 Res = Or(Xnor(A, B), And(B, C));
1533 break;
1534 case 0xcc:
1535 Res = B;
1536 break;
1537 case 0xcd:
1538 if (ABCIsConst)
1539 Res = Or(Nor(A, C), B);
1540 break;
1541 case 0xce:
1542 if (ABCIsConst)
1543 Res = Or(Nor(A, Not(C)), B);
1544 break;
1545 case 0xcf:
1546 if (ABIsConst)
1547 Res = Or(B, Not(A));
1548 break;
1549 case 0xd0:
1550 if (ABCIsConst)
1551 Res = And(A, Or(B, Not(C)));
1552 break;
1553 case 0xd1:
1554 if (ABCIsConst)
1555 Res = Xor(A, Nor(Xor(A, C), B));
1556 break;
1557 case 0xd2:
1558 if (ABCIsConst)
1559 Res = Xor(A, Nor(B, Not(C)));
1560 break;
1561 case 0xd3:
1562 if (ABCIsConst)
1563 Res = Or(Xnor(A, B), Nor(B, C));
1564 break;
1565 case 0xd4:
1566 if (ABCIsConst)
1567 Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
1568 break;
1569 case 0xd5:
1570 if (ABCIsConst)
1571 Res = Nand(Nand(A, B), C);
1572 break;
1573 case 0xd6:
1574 if (ABCIsConst)
1575 Res = Xor(Xor(A, B), Or(And(A, B), C));
1576 break;
1577 case 0xd7:
1578 if (ABCIsConst)
1579 Res = Nand(Xor(A, B), C);
1580 break;
1581 case 0xd8:
1582 if (ABCIsConst)
1583 Res = Xor(Nor(Xnor(A, B), C), B);
1584 break;
1585 case 0xd9:
1586 if (ABCIsConst)
1587 Res = Or(And(A, B), Xnor(B, C));
1588 break;
1589 case 0xda:
1590 if (ABCIsConst)
1591 Res = Xor(A, And(Nand(A, B), C));
1592 break;
1593 case 0xdb:
1594 if (ABCIsConst)
1595 Res = Or(Xnor(A, B), Xor(A, C));
1596 break;
1597 case 0xdc:
1598 if (ABCIsConst)
1599 Res = Or(B, Nor(C, Not(A)));
1600 break;
1601 case 0xdd:
1602 if (BCIsConst)
1603 Res = Or(B, Not(C));
1604 break;
1605 case 0xde:
1606 if (ABCIsConst)
1607 Res = Or(Xor(A, C), B);
1608 break;
1609 case 0xdf:
1610 if (ABCIsConst)
1611 Res = Or(Nand(A, C), B);
1612 break;
1613 case 0xe0:
1614 if (ABCIsConst)
1615 Res = And(A, Or(B, C));
1616 break;
1617 case 0xe1:
1618 if (ABCIsConst)
1619 Res = Xor(A, Nor(B, C));
1620 break;
1621 case 0xe2:
1622 if (ABCIsConst)
1623 Res = Xor(A, Nor(Xnor(A, C), B));
1624 break;
1625 case 0xe3:
1626 if (ABCIsConst)
1627 Res = Xor(A, Nor(And(A, C), B));
1628 break;
1629 case 0xe4:
1630 if (ABCIsConst)
1631 Res = Xor(A, Nor(Xnor(A, B), C));
1632 break;
1633 case 0xe5:
1634 if (ABCIsConst)
1635 Res = Xor(A, Nor(And(A, B), C));
1636 break;
1637 case 0xe6:
1638 if (ABCIsConst)
1639 Res = Or(And(A, B), Xor(B, C));
1640 break;
1641 case 0xe7:
1642 if (ABCIsConst)
1643 Res = Or(Xnor(A, B), Xnor(A, C));
1644 break;
1645 case 0xe8:
1646 if (ABCIsConst)
1647 Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
1648 break;
1649 case 0xe9:
1650 if (ABCIsConst)
1651 Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
1652 break;
1653 case 0xea:
1654 if (ABCIsConst)
1655 Res = Or(And(A, B), C);
1656 break;
1657 case 0xeb:
1658 if (ABCIsConst)
1659 Res = Or(Xnor(A, B), C);
1660 break;
1661 case 0xec:
1662 if (ABCIsConst)
1663 Res = Or(And(A, C), B);
1664 break;
1665 case 0xed:
1666 if (ABCIsConst)
1667 Res = Or(Xnor(A, C), B);
1668 break;
1669 case 0xee:
1670 Res = Or(B, C);
1671 break;
1672 case 0xef:
1673 if (ABCIsConst)
1674 Res = Nand(A, Nor(B, C));
1675 break;
1676 case 0xf0:
1677 Res = A;
1678 break;
1679 case 0xf1:
1680 if (ABCIsConst)
1681 Res = Or(A, Nor(B, C));
1682 break;
1683 case 0xf2:
1684 if (ABCIsConst)
1685 Res = Or(A, Nor(B, Not(C)));
1686 break;
1687 case 0xf3:
1688 if (ABIsConst)
1689 Res = Or(A, Not(B));
1690 break;
1691 case 0xf4:
1692 if (ABCIsConst)
1693 Res = Or(A, Nor(C, Not(B)));
1694 break;
1695 case 0xf5:
1696 if (ACIsConst)
1697 Res = Or(A, Not(C));
1698 break;
1699 case 0xf6:
1700 if (ABCIsConst)
1701 Res = Or(A, Xor(B, C));
1702 break;
1703 case 0xf7:
1704 if (ABCIsConst)
1705 Res = Or(A, Nand(B, C));
1706 break;
1707 case 0xf8:
1708 if (ABCIsConst)
1709 Res = Or(A, And(B, C));
1710 break;
1711 case 0xf9:
1712 if (ABCIsConst)
1713 Res = Or(A, Xnor(B, C));
1714 break;
1715 case 0xfa:
1716 Res = Or(A, C);
1717 break;
1718 case 0xfb:
1719 if (ABCIsConst)
1720 Res = Nand(Nor(A, C), B);
1721 break;
1722 case 0xfc:
1723 Res = Or(A, B);
1724 break;
1725 case 0xfd:
1726 if (ABCIsConst)
1727 Res = Nand(Nor(A, B), C);
1728 break;
1729 case 0xfe:
1730 if (ABCIsConst)
1731 Res = Or(Or(A, B), C);
1732 break;
1733 case 0xff:
1734 Res = {Constant::getAllOnesValue(Ty), 0xff};
1735 break;
1736 }
1738 assert((Res.first == nullptr || Res.second == Imm) &&
1739 "Simplification of ternary logic does not verify!");
1740 return Res.first;
1741 }
1743 static Value *simplifyX86insertps(const IntrinsicInst &II,
1744 InstCombiner::BuilderTy &Builder) {
1745 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
1746 if (!CInt)
1747 return nullptr;
1749 auto *VecTy = cast<FixedVectorType>(II.getType());
1750 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
1752 // The immediate permute control byte looks like this:
1753 // [3:0] - zero mask for each 32-bit lane
1754 // [5:4] - select one 32-bit destination lane
1755 // [7:6] - select one 32-bit source lane
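// For example (sketch), an immediate of 0x10 (source lane 0, destination lane 1,
// no zeroing) becomes: shufflevector %op0, %op1, <i32 0, i32 4, i32 2, i32 3>.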
1757 uint8_t Imm = CInt->getZExtValue();
1758 uint8_t ZMask = Imm & 0xf;
1759 uint8_t DestLane = (Imm >> 4) & 0x3;
1760 uint8_t SourceLane = (Imm >> 6) & 0x3;
1762 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
1764 // If all zero mask bits are set, this was just a weird way to
1765 // generate a zero vector.
1766 if (ZMask == 0xf)
1767 return ZeroVector;
1769 // Initialize by passing all of the first source bits through.
1770 int ShuffleMask[4] = {0, 1, 2, 3};
1772 // We may replace the second operand with the zero vector.
1773 Value *V1 = II.getArgOperand(1);
1775 if (ZMask) {
1776 // If the zero mask is being used with a single input or the zero mask
1777 // overrides the destination lane, this is a shuffle with the zero vector.
1778 if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
1779 (ZMask & (1 << DestLane))) {
1780 V1 = ZeroVector;
1781 // We may still move 32-bits of the first source vector from one lane
1782 // to another.
1783 ShuffleMask[DestLane] = SourceLane;
1784 // The zero mask may override the previous insert operation.
1785 for (unsigned i = 0; i < 4; ++i)
1786 if ((ZMask >> i) & 0x1)
1787 ShuffleMask[i] = i + 4;
1788 } else {
1789 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
1790 return nullptr;
1791 }
1792 } else {
1793 // Replace the selected destination lane with the selected source lane.
1794 ShuffleMask[DestLane] = SourceLane + 4;
1795 }
1797 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
1798 }
1800 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
1801 /// or conversion to a shuffle vector.
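/// e.g. (sketch) an EXTRQI of length 8 at bit index 8 extracts bits [15:8]: byte 1 of the
/// source lands in byte 0 of the result and bytes 1-7 are zeroed, which lowering can
/// express as a byte shuffle against a zero vector.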
1802 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
1803 ConstantInt *CILength, ConstantInt *CIIndex,
1804 InstCombiner::BuilderTy &Builder) {
1805 auto LowConstantHighUndef = [&](uint64_t Val) {
1806 Type *IntTy64 = Type::getInt64Ty(II.getContext());
1807 Constant *Args[] = {ConstantInt::get(IntTy64, Val),
1808 UndefValue::get(IntTy64)};
1809 return ConstantVector::get(Args);
1810 };
1812 // See if we're dealing with constant values.
1813 auto *C0 = dyn_cast<Constant>(Op0);
1814 auto *CI0 =
1815 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1816 : nullptr;
1818 // Attempt to constant fold.
1819 if (CILength && CIIndex) {
1820 // From AMD documentation: "The bit index and field length are each six
1821 // bits in length other bits of the field are ignored."
1822 APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
1823 APInt APLength = CILength->getValue().zextOrTrunc(6);
1825 unsigned Index = APIndex.getZExtValue();
1827 // From AMD documentation: "a value of zero in the field length is
1828 // defined as length of 64".
1829 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1831 // From AMD documentation: "If the sum of the bit index + length field
1832 // is greater than 64, the results are undefined".
1833 unsigned End = Index + Length;
1835 // Note that both field index and field length are 8-bit quantities.
1836 // Since variables 'Index' and 'Length' are unsigned values
1837 // obtained from zero-extending field index and field length
1838 // respectively, their sum should never wrap around.
1839 if (End > 64)
1840 return UndefValue::get(II.getType());
1842 // If we are inserting whole bytes, we can convert this to a shuffle.
1843 // Lowering can recognize EXTRQI shuffle masks.
1844 if ((Length % 8) == 0 && (Index % 8) == 0) {
1845 // Convert bit indices to byte indices.
1846 Length /= 8;
1847 Index /= 8;
1849 Type *IntTy8 = Type::getInt8Ty(II.getContext());
1850 auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1852 SmallVector<int, 16> ShuffleMask;
1853 for (int i = 0; i != (int)Length; ++i)
1854 ShuffleMask.push_back(i + Index);
1855 for (int i = Length; i != 8; ++i)
1856 ShuffleMask.push_back(i + 16);
1857 for (int i = 8; i != 16; ++i)
1858 ShuffleMask.push_back(-1);
1860 Value *SV = Builder.CreateShuffleVector(
1861 Builder.CreateBitCast(Op0, ShufTy),
1862 ConstantAggregateZero::get(ShufTy), ShuffleMask);
1863 return Builder.CreateBitCast(SV, II.getType());
1864 }
1866 // Constant Fold - shift Index'th bit to lowest position and mask off
1867 // Length bits.
1868 if (CI0) {
1869 APInt Elt = CI0->getValue();
1870 Elt.lshrInPlace(Index);
1871 Elt = Elt.zextOrTrunc(Length);
1872 return LowConstantHighUndef(Elt.getZExtValue());
1873 }
1875 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
1876 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
1877 Value *Args[] = {Op0, CILength, CIIndex};
1878 return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_extrqi, {}, Args);
1879 }
1880 }
1882 // Constant Fold - extraction from zero is always {zero, undef}.
1883 if (CI0 && CI0->isZero())
1884 return LowConstantHighUndef(0);
1886 return nullptr;
1887 }
1889 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
1890 /// folding or conversion to a shuffle vector.
1891 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
1892 APInt APLength, APInt APIndex,
1893 InstCombiner::BuilderTy &Builder) {
1894 // From AMD documentation: "The bit index and field length are each six bits
1895 // in length other bits of the field are ignored."
1896 APIndex = APIndex.zextOrTrunc(6);
1897 APLength = APLength.zextOrTrunc(6);
1899 // Attempt to constant fold.
1900 unsigned Index = APIndex.getZExtValue();
1902 // From AMD documentation: "a value of zero in the field length is
1903 // defined as length of 64".
1904 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1906 // From AMD documentation: "If the sum of the bit index + length field
1907 // is greater than 64, the results are undefined".
1908 unsigned End = Index + Length;
1910 // Note that both field index and field length are 8-bit quantities.
1911 // Since variables 'Index' and 'Length' are unsigned values
1912 // obtained from zero-extending field index and field length
1913 // respectively, their sum should never wrap around.
1914 if (End > 64)
1915 return UndefValue::get(II.getType());
1917 // If we are inserting whole bytes, we can convert this to a shuffle.
1918 // Lowering can recognize INSERTQI shuffle masks.
1919 if ((Length % 8) == 0 && (Index % 8) == 0) {
1920 // Convert bit indices to byte indices.
1921 Length /= 8;
1922 Index /= 8;
1924 Type *IntTy8 = Type::getInt8Ty(II.getContext());
1925 auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1927 SmallVector<int, 16> ShuffleMask;
1928 for (int i = 0; i != (int)Index; ++i)
1929 ShuffleMask.push_back(i);
1930 for (int i = 0; i != (int)Length; ++i)
1931 ShuffleMask.push_back(i + 16);
1932 for (int i = Index + Length; i != 8; ++i)
1933 ShuffleMask.push_back(i);
1934 for (int i = 8; i != 16; ++i)
1935 ShuffleMask.push_back(-1);
1937 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
1938 Builder.CreateBitCast(Op1, ShufTy),
1939 ShuffleMask);
1940 return Builder.CreateBitCast(SV, II.getType());
1941 }
1943 // See if we're dealing with constant values.
1944 auto *C0 = dyn_cast<Constant>(Op0);
1945 auto *C1 = dyn_cast<Constant>(Op1);
1946 auto *CI00 =
1947 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1948 : nullptr;
1949 auto *CI10 =
1950 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1951 : nullptr;
1953 // Constant Fold - insert bottom Length bits starting at the Index'th bit.
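// For illustration (arbitrary values): with Index == 8, Length == 16,
// V00 == 0xFFFFFFFFFFFFFFFF and V10 == 0xABCD, Mask == 0xFFFF00, so the
// folded low element is 0xFFFFFFFFFFABCDFF (the high element stays undef).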
1954 if (CI00 && CI10) {
1955 APInt V00 = CI00->getValue();
1956 APInt V10 = CI10->getValue();
1957 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
1958 V00 = V00 & ~Mask;
1959 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
1960 APInt Val = V00 | V10;
1961 Type *IntTy64 = Type::getInt64Ty(II.getContext());
1962 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
1963 UndefValue::get(IntTy64)};
1964 return ConstantVector::get(Args);
1967 // If we were an INSERTQ call, we'll save demanded elements if we convert to
1968 // INSERTQI.
1969 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
1970 Type *IntTy8 = Type::getInt8Ty(II.getContext());
1971 Constant *CILength = ConstantInt::get(IntTy8, Length, false);
1972 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
1974 Value *Args[] = {Op0, Op1, CILength, CIIndex};
1975 return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_insertqi, {}, Args);
1978 return nullptr;
1981 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
1982 static Value *simplifyX86pshufb(const IntrinsicInst &II,
1983 InstCombiner::BuilderTy &Builder) {
1984 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1985 if (!V)
1986 return nullptr;
1988 auto *VecTy = cast<FixedVectorType>(II.getType());
1989 unsigned NumElts = VecTy->getNumElements();
1990 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
1991 "Unexpected number of elements in shuffle mask!");
1993 // Construct a shuffle mask from constant integers or UNDEFs.
1994 int Indexes[64];
1996 // Each byte in the shuffle control mask forms an index to permute the
1997 // corresponding byte in the destination operand.
1998 for (unsigned I = 0; I < NumElts; ++I) {
1999 Constant *COp = V->getAggregateElement(I);
2000 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2001 return nullptr;
2003 if (isa<UndefValue>(COp)) {
2004 Indexes[I] = -1;
2005 continue;
2008 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
2010 // If the most significant bit (bit[7]) of each byte of the shuffle
2011 // control mask is set, then zero is written in the result byte.
2012 // The zero vector is in the right-hand side of the resulting
2013 // shufflevector.
2015 // The value of each index for the high 128-bit lane is the least
2016 // significant 4 bits of the respective shuffle control byte.
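// For illustration (arbitrary control bytes): in a 32-byte AVX2 shuffle, a
// control byte of 0x83 at any position selects the zero operand (MSB set),
// while a control byte of 0x03 at position I == 20 selects source byte
// (3 & 0xF) + (20 & 0xF0) == 19, staying within the same 128-bit lane.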
2017 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
2018 Indexes[I] = Index;
2021 auto V1 = II.getArgOperand(0);
2022 auto V2 = Constant::getNullValue(VecTy);
2023 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
2026 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
2027 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
2028 InstCombiner::BuilderTy &Builder) {
2029 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2030 if (!V)
2031 return nullptr;
2033 auto *VecTy = cast<FixedVectorType>(II.getType());
2034 unsigned NumElts = VecTy->getNumElements();
2035 bool IsPD = VecTy->getScalarType()->isDoubleTy();
2036 unsigned NumLaneElts = IsPD ? 2 : 4;
2037 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
2039 // Construct a shuffle mask from constant integers or UNDEFs.
2040 int Indexes[16];
2042 // The intrinsics only read one or two bits; clear the rest.
2043 for (unsigned I = 0; I < NumElts; ++I) {
2044 Constant *COp = V->getAggregateElement(I);
2045 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2046 return nullptr;
2048 if (isa<UndefValue>(COp)) {
2049 Indexes[I] = -1;
2050 continue;
2053 APInt Index = cast<ConstantInt>(COp)->getValue();
2054 Index = Index.zextOrTrunc(32).getLoBits(2);
2056 // The PD variants use bit 1 to select the per-lane element index, so
2057 // shift it down to convert to a generic shuffle mask index.
2058 if (IsPD)
2059 Index.lshrInPlace(1);
2061 // The _256 variants are a bit trickier since the mask bits always index
2062 // into the corresponding 128-bit half. In order to convert to a generic
2063 // shuffle, we have to make that explicit.
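// For illustration (arbitrary control value): for vpermilvar.pd.256,
// element I == 3 with a control value of 2 keeps low bits 0b10, shifts to
// 1 for the PD case, then adds the lane base (3 / 2) * 2 == 2, giving a
// final shuffle index of 3.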
2064 Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
2066 Indexes[I] = Index.getZExtValue();
2069 auto V1 = II.getArgOperand(0);
2070 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
2073 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
2074 static Value *simplifyX86vpermv(const IntrinsicInst &II,
2075 InstCombiner::BuilderTy &Builder) {
2076 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2077 if (!V)
2078 return nullptr;
2080 auto *VecTy = cast<FixedVectorType>(II.getType());
2081 unsigned Size = VecTy->getNumElements();
2082 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
2083 "Unexpected shuffle mask size");
2085 // Construct a shuffle mask from constant integers or UNDEFs.
2086 int Indexes[64];
2088 for (unsigned I = 0; I < Size; ++I) {
2089 Constant *COp = V->getAggregateElement(I);
2090 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2091 return nullptr;
2093 if (isa<UndefValue>(COp)) {
2094 Indexes[I] = -1;
2095 continue;
2098 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2099 Index &= Size - 1;
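// For illustration: with v8i32 VPERMD, an out-of-range index such as 9
// wraps to 9 & 7 == 1, matching the hardware's use of only the low index
// bits.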
2100 Indexes[I] = Index;
2103 auto V1 = II.getArgOperand(0);
2104 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
2107 /// Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant.
2108 static Value *simplifyX86vpermv3(const IntrinsicInst &II,
2109 InstCombiner::BuilderTy &Builder) {
2110 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2111 if (!V)
2112 return nullptr;
2114 auto *VecTy = cast<FixedVectorType>(II.getType());
2115 unsigned Size = VecTy->getNumElements();
2116 assert((Size == 2 || Size == 4 || Size == 8 || Size == 16 || Size == 32 ||
2117 Size == 64) &&
2118 "Unexpected shuffle mask size");
2120 // Construct a shuffle mask from constant integers or UNDEFs.
2121 int Indexes[64];
2123 for (unsigned I = 0; I < Size; ++I) {
2124 Constant *COp = V->getAggregateElement(I);
2125 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2126 return nullptr;
2128 if (isa<UndefValue>(COp)) {
2129 Indexes[I] = -1;
2130 continue;
2133 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2134 Index &= (2 * Size) - 1;
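// For illustration: with v8i32 vpermi2var, a masked index of 11 selects
// element 3 of the second data operand, since indices 0..7 address the
// first data operand and 8..15 address the second.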
2135 Indexes[I] = Index;
2138 auto V1 = II.getArgOperand(0);
2139 auto V2 = II.getArgOperand(2);
2140 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size));
2143 // Simplify VPERMV/VPERMV3 mask - only demand the active index bits.
2144 static bool simplifyX86VPERMMask(Instruction *II, bool IsBinary,
2145 InstCombiner &IC) {
2146 auto *VecTy = cast<FixedVectorType>(II->getType());
2147 unsigned EltSizeInBits = VecTy->getScalarSizeInBits();
2148 unsigned NumElts = VecTy->getNumElements();
2149 assert(isPowerOf2_32(NumElts) && isPowerOf2_32(EltSizeInBits) &&
2150 "Unexpected shuffle mask size");
2152 unsigned IdxSizeInBits = Log2_32(IsBinary ? (2 * NumElts) : NumElts);
2153 APInt DemandedMask = APInt::getLowBitsSet(EltSizeInBits, IdxSizeInBits);
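// For illustration: a unary v8i32 VPERMD has IdxSizeInBits == 3, so only
// the low 3 bits (0b111) of each 32-bit mask element are demanded.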
2155 KnownBits KnownMask(EltSizeInBits);
2156 return IC.SimplifyDemandedBits(II, /*OpNo=*/1, DemandedMask, KnownMask);
2159 std::optional<Instruction *>
2160 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
2161 auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
2162 unsigned DemandedWidth) {
2163 APInt UndefElts(Width, 0);
2164 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
2165 return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
2168 Intrinsic::ID IID = II.getIntrinsicID();
2169 switch (IID) {
2170 case Intrinsic::x86_bmi_bextr_32:
2171 case Intrinsic::x86_bmi_bextr_64:
2172 case Intrinsic::x86_tbm_bextri_u32:
2173 case Intrinsic::x86_tbm_bextri_u64:
2174 // If the RHS is a constant we can try some simplifications.
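// The control operand encodes the start bit in bits[7:0] and the field
// length in bits[15:8]. For illustration (arbitrary values):
// bextr(0x12345678, 0x0804) == (0x12345678 >> 4) & 0xFF == 0x67.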
2175 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2176 uint64_t Shift = C->getZExtValue();
2177 uint64_t Length = (Shift >> 8) & 0xff;
2178 Shift &= 0xff;
2179 unsigned BitWidth = II.getType()->getIntegerBitWidth();
2180 // If the length is 0 or the shift is out of range, replace with zero.
2181 if (Length == 0 || Shift >= BitWidth) {
2182 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2184 // If the LHS is also a constant, we can completely constant fold this.
2185 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2186 uint64_t Result = InC->getZExtValue() >> Shift;
2187 if (Length > BitWidth)
2188 Length = BitWidth;
2189 Result &= maskTrailingOnes<uint64_t>(Length);
2190 return IC.replaceInstUsesWith(II,
2191 ConstantInt::get(II.getType(), Result));
2193 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2194 // are only masking bits that a shift already cleared?
2196 break;
2198 case Intrinsic::x86_bmi_bzhi_32:
2199 case Intrinsic::x86_bmi_bzhi_64:
2200 // If the RHS is a constant we can try some simplifications.
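// BZHI keeps the low Index bits and clears the rest. For illustration
// (arbitrary values): bzhi(0xFF, 4) == 0x0F, and any index of at least the
// bit width leaves the source unchanged.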
2201 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2202 uint64_t Index = C->getZExtValue() & 0xff;
2203 unsigned BitWidth = II.getType()->getIntegerBitWidth();
2204 if (Index >= BitWidth) {
2205 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2207 if (Index == 0) {
2208 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2210 // If the LHS is also a constant, we can completely constant fold this.
2211 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2212 uint64_t Result = InC->getZExtValue();
2213 Result &= maskTrailingOnes<uint64_t>(Index);
2214 return IC.replaceInstUsesWith(II,
2215 ConstantInt::get(II.getType(), Result));
2217 // TODO should we convert this to an AND if the RHS is constant?
2219 break;
2220 case Intrinsic::x86_bmi_pext_32:
2221 case Intrinsic::x86_bmi_pext_64:
2222 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2223 if (MaskC->isNullValue()) {
2224 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2226 if (MaskC->isAllOnesValue()) {
2227 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2230 unsigned MaskIdx, MaskLen;
2231 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2232 // Any single contiguous sequence of 1s anywhere in the mask simply
2233 // describes a subset of the input bits shifted to the appropriate
2234 // position. Replace with the straightforward IR.
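// For illustration (arbitrary mask): with Mask == 0b01110000 (MaskIdx == 4,
// MaskLen == 3), pext(x, Mask) becomes (x & 0b01110000) >> 4.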
2235 Value *Input = II.getArgOperand(0);
2236 Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2237 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2238 Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2239 return IC.replaceInstUsesWith(II, Shifted);
2242 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2243 uint64_t Src = SrcC->getZExtValue();
2244 uint64_t Mask = MaskC->getZExtValue();
2245 uint64_t Result = 0;
2246 uint64_t BitToSet = 1;
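// For illustration (arbitrary values): pext(0b11011010, 0b01010101)
// gathers the source bits at positions 0, 2, 4 and 6 (0, 0, 1, 1) into the
// low bits, producing 0b1100.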
2248 while (Mask) {
2249 // Isolate lowest set bit.
2250 uint64_t BitToTest = Mask & -Mask;
2251 if (BitToTest & Src)
2252 Result |= BitToSet;
2254 BitToSet <<= 1;
2255 // Clear lowest set bit.
2256 Mask &= Mask - 1;
2259 return IC.replaceInstUsesWith(II,
2260 ConstantInt::get(II.getType(), Result));
2263 break;
2264 case Intrinsic::x86_bmi_pdep_32:
2265 case Intrinsic::x86_bmi_pdep_64:
2266 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2267 if (MaskC->isNullValue()) {
2268 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2270 if (MaskC->isAllOnesValue()) {
2271 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2274 unsigned MaskIdx, MaskLen;
2275 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2276 // Any single contiguous sequence of 1s anywhere in the mask simply
2277 // describes a subset of the input bits shifted to the appropriate
2278 // position. Replace with the straightforward IR.
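// For illustration (arbitrary mask): with Mask == 0b01110000 (MaskIdx == 4),
// pdep(x, Mask) becomes (x << 4) & 0b01110000.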
2279 Value *Input = II.getArgOperand(0);
2280 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2281 Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2282 Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2283 return IC.replaceInstUsesWith(II, Masked);
2286 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2287 uint64_t Src = SrcC->getZExtValue();
2288 uint64_t Mask = MaskC->getZExtValue();
2289 uint64_t Result = 0;
2290 uint64_t BitToTest = 1;
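// For illustration (arbitrary values): pdep(0b1101, 0b01010101) scatters
// the low source bits to the mask's set positions 0, 2, 4 and 6, producing
// 0b01010001.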
2292 while (Mask) {
2293 // Isolate lowest set bit.
2294 uint64_t BitToSet = Mask & -Mask;
2295 if (BitToTest & Src)
2296 Result |= BitToSet;
2298 BitToTest <<= 1;
2299 // Clear lowest set bit.
2300 Mask &= Mask - 1;
2303 return IC.replaceInstUsesWith(II,
2304 ConstantInt::get(II.getType(), Result));
2307 break;
2309 case Intrinsic::x86_sse_cvtss2si:
2310 case Intrinsic::x86_sse_cvtss2si64:
2311 case Intrinsic::x86_sse_cvttss2si:
2312 case Intrinsic::x86_sse_cvttss2si64:
2313 case Intrinsic::x86_sse2_cvtsd2si:
2314 case Intrinsic::x86_sse2_cvtsd2si64:
2315 case Intrinsic::x86_sse2_cvttsd2si:
2316 case Intrinsic::x86_sse2_cvttsd2si64:
2317 case Intrinsic::x86_avx512_vcvtss2si32:
2318 case Intrinsic::x86_avx512_vcvtss2si64:
2319 case Intrinsic::x86_avx512_vcvtss2usi32:
2320 case Intrinsic::x86_avx512_vcvtss2usi64:
2321 case Intrinsic::x86_avx512_vcvtsd2si32:
2322 case Intrinsic::x86_avx512_vcvtsd2si64:
2323 case Intrinsic::x86_avx512_vcvtsd2usi32:
2324 case Intrinsic::x86_avx512_vcvtsd2usi64:
2325 case Intrinsic::x86_avx512_cvttss2si:
2326 case Intrinsic::x86_avx512_cvttss2si64:
2327 case Intrinsic::x86_avx512_cvttss2usi:
2328 case Intrinsic::x86_avx512_cvttss2usi64:
2329 case Intrinsic::x86_avx512_cvttsd2si:
2330 case Intrinsic::x86_avx512_cvttsd2si64:
2331 case Intrinsic::x86_avx512_cvttsd2usi:
2332 case Intrinsic::x86_avx512_cvttsd2usi64: {
2333 // These intrinsics only demand the 0th element of their input vectors. If
2334 // we can simplify the input based on that, do so now.
2335 Value *Arg = II.getArgOperand(0);
2336 unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
2337 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
2338 return IC.replaceOperand(II, 0, V);
2340 break;
2343 case Intrinsic::x86_mmx_pmovmskb:
2344 case Intrinsic::x86_sse_movmsk_ps:
2345 case Intrinsic::x86_sse2_movmsk_pd:
2346 case Intrinsic::x86_sse2_pmovmskb_128:
2347 case Intrinsic::x86_avx_movmsk_pd_256:
2348 case Intrinsic::x86_avx_movmsk_ps_256:
2349 case Intrinsic::x86_avx2_pmovmskb:
2350 if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
2351 return IC.replaceInstUsesWith(II, V);
2353 break;
2355 case Intrinsic::x86_sse_comieq_ss:
2356 case Intrinsic::x86_sse_comige_ss:
2357 case Intrinsic::x86_sse_comigt_ss:
2358 case Intrinsic::x86_sse_comile_ss:
2359 case Intrinsic::x86_sse_comilt_ss:
2360 case Intrinsic::x86_sse_comineq_ss:
2361 case Intrinsic::x86_sse_ucomieq_ss:
2362 case Intrinsic::x86_sse_ucomige_ss:
2363 case Intrinsic::x86_sse_ucomigt_ss:
2364 case Intrinsic::x86_sse_ucomile_ss:
2365 case Intrinsic::x86_sse_ucomilt_ss:
2366 case Intrinsic::x86_sse_ucomineq_ss:
2367 case Intrinsic::x86_sse2_comieq_sd:
2368 case Intrinsic::x86_sse2_comige_sd:
2369 case Intrinsic::x86_sse2_comigt_sd:
2370 case Intrinsic::x86_sse2_comile_sd:
2371 case Intrinsic::x86_sse2_comilt_sd:
2372 case Intrinsic::x86_sse2_comineq_sd:
2373 case Intrinsic::x86_sse2_ucomieq_sd:
2374 case Intrinsic::x86_sse2_ucomige_sd:
2375 case Intrinsic::x86_sse2_ucomigt_sd:
2376 case Intrinsic::x86_sse2_ucomile_sd:
2377 case Intrinsic::x86_sse2_ucomilt_sd:
2378 case Intrinsic::x86_sse2_ucomineq_sd:
2379 case Intrinsic::x86_avx512_vcomi_ss:
2380 case Intrinsic::x86_avx512_vcomi_sd:
2381 case Intrinsic::x86_avx512_mask_cmp_ss:
2382 case Intrinsic::x86_avx512_mask_cmp_sd: {
2383 // These intrinsics only demand the 0th element of their input vectors. If
2384 // we can simplify the input based on that, do so now.
2385 bool MadeChange = false;
2386 Value *Arg0 = II.getArgOperand(0);
2387 Value *Arg1 = II.getArgOperand(1);
2388 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
2389 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2390 IC.replaceOperand(II, 0, V);
2391 MadeChange = true;
2393 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2394 IC.replaceOperand(II, 1, V);
2395 MadeChange = true;
2397 if (MadeChange) {
2398 return &II;
2400 break;
2403 case Intrinsic::x86_avx512_add_ps_512:
2404 case Intrinsic::x86_avx512_div_ps_512:
2405 case Intrinsic::x86_avx512_mul_ps_512:
2406 case Intrinsic::x86_avx512_sub_ps_512:
2407 case Intrinsic::x86_avx512_add_pd_512:
2408 case Intrinsic::x86_avx512_div_pd_512:
2409 case Intrinsic::x86_avx512_mul_pd_512:
2410 case Intrinsic::x86_avx512_sub_pd_512:
2411 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2412 // IR operations.
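// For illustration: llvm.x86.avx512.add.ps.512(x, y, i32 4) becomes a plain
// fadd on <16 x float>, since rounding mode 4 means "use the current MXCSR
// rounding direction".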
2413 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2414 if (R->getValue() == 4) {
2415 Value *Arg0 = II.getArgOperand(0);
2416 Value *Arg1 = II.getArgOperand(1);
2418 Value *V;
2419 switch (IID) {
2420 default:
2421 llvm_unreachable("Case stmts out of sync!");
2422 case Intrinsic::x86_avx512_add_ps_512:
2423 case Intrinsic::x86_avx512_add_pd_512:
2424 V = IC.Builder.CreateFAdd(Arg0, Arg1);
2425 break;
2426 case Intrinsic::x86_avx512_sub_ps_512:
2427 case Intrinsic::x86_avx512_sub_pd_512:
2428 V = IC.Builder.CreateFSub(Arg0, Arg1);
2429 break;
2430 case Intrinsic::x86_avx512_mul_ps_512:
2431 case Intrinsic::x86_avx512_mul_pd_512:
2432 V = IC.Builder.CreateFMul(Arg0, Arg1);
2433 break;
2434 case Intrinsic::x86_avx512_div_ps_512:
2435 case Intrinsic::x86_avx512_div_pd_512:
2436 V = IC.Builder.CreateFDiv(Arg0, Arg1);
2437 break;
2440 return IC.replaceInstUsesWith(II, V);
2443 break;
2445 case Intrinsic::x86_avx512_mask_add_ss_round:
2446 case Intrinsic::x86_avx512_mask_div_ss_round:
2447 case Intrinsic::x86_avx512_mask_mul_ss_round:
2448 case Intrinsic::x86_avx512_mask_sub_ss_round:
2449 case Intrinsic::x86_avx512_mask_add_sd_round:
2450 case Intrinsic::x86_avx512_mask_div_sd_round:
2451 case Intrinsic::x86_avx512_mask_mul_sd_round:
2452 case Intrinsic::x86_avx512_mask_sub_sd_round:
2453 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2454 // IR operations.
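// For illustration: mask.add.ss.round(a, b, passthru, k, 4) becomes
// "low lane = k[0] ? a[0] + b[0] : passthru[0]" inserted back into a, with
// the upper lanes taken from a unchanged.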
2455 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
2456 if (R->getValue() == 4) {
2457 // Extract the low elements as scalars.
2458 Value *Arg0 = II.getArgOperand(0);
2459 Value *Arg1 = II.getArgOperand(1);
2460 Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
2461 Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
2463 Value *V;
2464 switch (IID) {
2465 default:
2466 llvm_unreachable("Case stmts out of sync!");
2467 case Intrinsic::x86_avx512_mask_add_ss_round:
2468 case Intrinsic::x86_avx512_mask_add_sd_round:
2469 V = IC.Builder.CreateFAdd(LHS, RHS);
2470 break;
2471 case Intrinsic::x86_avx512_mask_sub_ss_round:
2472 case Intrinsic::x86_avx512_mask_sub_sd_round:
2473 V = IC.Builder.CreateFSub(LHS, RHS);
2474 break;
2475 case Intrinsic::x86_avx512_mask_mul_ss_round:
2476 case Intrinsic::x86_avx512_mask_mul_sd_round:
2477 V = IC.Builder.CreateFMul(LHS, RHS);
2478 break;
2479 case Intrinsic::x86_avx512_mask_div_ss_round:
2480 case Intrinsic::x86_avx512_mask_div_sd_round:
2481 V = IC.Builder.CreateFDiv(LHS, RHS);
2482 break;
2485 // Handle the masking aspect of the intrinsic.
2486 Value *Mask = II.getArgOperand(3);
2487 auto *C = dyn_cast<ConstantInt>(Mask);
2488 // We don't need a select if we know the mask bit is a 1.
2489 if (!C || !C->getValue()[0]) {
2490 // Cast the mask to an i1 vector and then extract the lowest element.
2491 auto *MaskTy = FixedVectorType::get(
2492 IC.Builder.getInt1Ty(),
2493 cast<IntegerType>(Mask->getType())->getBitWidth());
2494 Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
2495 Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
2496 // Extract the lowest element from the passthru operand.
2497 Value *Passthru =
2498 IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
2499 V = IC.Builder.CreateSelect(Mask, V, Passthru);
2502 // Insert the result back into the original argument 0.
2503 V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
2505 return IC.replaceInstUsesWith(II, V);
2508 break;
2510 // Constant fold ashr( <A x Bi>, Ci ).
2511 // Constant fold lshr( <A x Bi>, Ci ).
2512 // Constant fold shl( <A x Bi>, Ci ).
2513 case Intrinsic::x86_sse2_psrai_d:
2514 case Intrinsic::x86_sse2_psrai_w:
2515 case Intrinsic::x86_avx2_psrai_d:
2516 case Intrinsic::x86_avx2_psrai_w:
2517 case Intrinsic::x86_avx512_psrai_q_128:
2518 case Intrinsic::x86_avx512_psrai_q_256:
2519 case Intrinsic::x86_avx512_psrai_d_512:
2520 case Intrinsic::x86_avx512_psrai_q_512:
2521 case Intrinsic::x86_avx512_psrai_w_512:
2522 case Intrinsic::x86_sse2_psrli_d:
2523 case Intrinsic::x86_sse2_psrli_q:
2524 case Intrinsic::x86_sse2_psrli_w:
2525 case Intrinsic::x86_avx2_psrli_d:
2526 case Intrinsic::x86_avx2_psrli_q:
2527 case Intrinsic::x86_avx2_psrli_w:
2528 case Intrinsic::x86_avx512_psrli_d_512:
2529 case Intrinsic::x86_avx512_psrli_q_512:
2530 case Intrinsic::x86_avx512_psrli_w_512:
2531 case Intrinsic::x86_sse2_pslli_d:
2532 case Intrinsic::x86_sse2_pslli_q:
2533 case Intrinsic::x86_sse2_pslli_w:
2534 case Intrinsic::x86_avx2_pslli_d:
2535 case Intrinsic::x86_avx2_pslli_q:
2536 case Intrinsic::x86_avx2_pslli_w:
2537 case Intrinsic::x86_avx512_pslli_d_512:
2538 case Intrinsic::x86_avx512_pslli_q_512:
2539 case Intrinsic::x86_avx512_pslli_w_512:
2540 if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2541 return IC.replaceInstUsesWith(II, V);
2543 break;
2545 case Intrinsic::x86_sse2_psra_d:
2546 case Intrinsic::x86_sse2_psra_w:
2547 case Intrinsic::x86_avx2_psra_d:
2548 case Intrinsic::x86_avx2_psra_w:
2549 case Intrinsic::x86_avx512_psra_q_128:
2550 case Intrinsic::x86_avx512_psra_q_256:
2551 case Intrinsic::x86_avx512_psra_d_512:
2552 case Intrinsic::x86_avx512_psra_q_512:
2553 case Intrinsic::x86_avx512_psra_w_512:
2554 case Intrinsic::x86_sse2_psrl_d:
2555 case Intrinsic::x86_sse2_psrl_q:
2556 case Intrinsic::x86_sse2_psrl_w:
2557 case Intrinsic::x86_avx2_psrl_d:
2558 case Intrinsic::x86_avx2_psrl_q:
2559 case Intrinsic::x86_avx2_psrl_w:
2560 case Intrinsic::x86_avx512_psrl_d_512:
2561 case Intrinsic::x86_avx512_psrl_q_512:
2562 case Intrinsic::x86_avx512_psrl_w_512:
2563 case Intrinsic::x86_sse2_psll_d:
2564 case Intrinsic::x86_sse2_psll_q:
2565 case Intrinsic::x86_sse2_psll_w:
2566 case Intrinsic::x86_avx2_psll_d:
2567 case Intrinsic::x86_avx2_psll_q:
2568 case Intrinsic::x86_avx2_psll_w:
2569 case Intrinsic::x86_avx512_psll_d_512:
2570 case Intrinsic::x86_avx512_psll_q_512:
2571 case Intrinsic::x86_avx512_psll_w_512: {
2572 if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2573 return IC.replaceInstUsesWith(II, V);
2576 // SSE2/AVX2 shifts use only the first 64 bits of the 128-bit vector
2577 // operand to compute the shift amount.
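// For illustration: for psll.d the count operand is <4 x i32>, but only
// elements 0..1 (the low 64 bits) are demanded, so the upper elements can
// be simplified away.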
2578 Value *Arg1 = II.getArgOperand(1);
2579 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
2580 "Unexpected packed shift size");
2581 unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
2583 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
2584 return IC.replaceOperand(II, 1, V);
2586 break;
2589 case Intrinsic::x86_avx2_psllv_d:
2590 case Intrinsic::x86_avx2_psllv_d_256:
2591 case Intrinsic::x86_avx2_psllv_q:
2592 case Intrinsic::x86_avx2_psllv_q_256:
2593 case Intrinsic::x86_avx512_psllv_d_512:
2594 case Intrinsic::x86_avx512_psllv_q_512:
2595 case Intrinsic::x86_avx512_psllv_w_128:
2596 case Intrinsic::x86_avx512_psllv_w_256:
2597 case Intrinsic::x86_avx512_psllv_w_512:
2598 case Intrinsic::x86_avx2_psrav_d:
2599 case Intrinsic::x86_avx2_psrav_d_256:
2600 case Intrinsic::x86_avx512_psrav_q_128:
2601 case Intrinsic::x86_avx512_psrav_q_256:
2602 case Intrinsic::x86_avx512_psrav_d_512:
2603 case Intrinsic::x86_avx512_psrav_q_512:
2604 case Intrinsic::x86_avx512_psrav_w_128:
2605 case Intrinsic::x86_avx512_psrav_w_256:
2606 case Intrinsic::x86_avx512_psrav_w_512:
2607 case Intrinsic::x86_avx2_psrlv_d:
2608 case Intrinsic::x86_avx2_psrlv_d_256:
2609 case Intrinsic::x86_avx2_psrlv_q:
2610 case Intrinsic::x86_avx2_psrlv_q_256:
2611 case Intrinsic::x86_avx512_psrlv_d_512:
2612 case Intrinsic::x86_avx512_psrlv_q_512:
2613 case Intrinsic::x86_avx512_psrlv_w_128:
2614 case Intrinsic::x86_avx512_psrlv_w_256:
2615 case Intrinsic::x86_avx512_psrlv_w_512:
2616 if (Value *V = simplifyX86varShift(II, IC.Builder)) {
2617 return IC.replaceInstUsesWith(II, V);
2619 break;
2621 case Intrinsic::x86_sse2_packssdw_128:
2622 case Intrinsic::x86_sse2_packsswb_128:
2623 case Intrinsic::x86_avx2_packssdw:
2624 case Intrinsic::x86_avx2_packsswb:
2625 case Intrinsic::x86_avx512_packssdw_512:
2626 case Intrinsic::x86_avx512_packsswb_512:
2627 if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
2628 return IC.replaceInstUsesWith(II, V);
2630 break;
2632 case Intrinsic::x86_sse2_packuswb_128:
2633 case Intrinsic::x86_sse41_packusdw:
2634 case Intrinsic::x86_avx2_packusdw:
2635 case Intrinsic::x86_avx2_packuswb:
2636 case Intrinsic::x86_avx512_packusdw_512:
2637 case Intrinsic::x86_avx512_packuswb_512:
2638 if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
2639 return IC.replaceInstUsesWith(II, V);
2641 break;
2643 case Intrinsic::x86_sse2_pmulh_w:
2644 case Intrinsic::x86_avx2_pmulh_w:
2645 case Intrinsic::x86_avx512_pmulh_w_512:
2646 if (Value *V = simplifyX86pmulh(II, IC.Builder, true, false)) {
2647 return IC.replaceInstUsesWith(II, V);
2649 break;
2651 case Intrinsic::x86_sse2_pmulhu_w:
2652 case Intrinsic::x86_avx2_pmulhu_w:
2653 case Intrinsic::x86_avx512_pmulhu_w_512:
2654 if (Value *V = simplifyX86pmulh(II, IC.Builder, false, false)) {
2655 return IC.replaceInstUsesWith(II, V);
2657 break;
2659 case Intrinsic::x86_ssse3_pmul_hr_sw_128:
2660 case Intrinsic::x86_avx2_pmul_hr_sw:
2661 case Intrinsic::x86_avx512_pmul_hr_sw_512:
2662 if (Value *V = simplifyX86pmulh(II, IC.Builder, true, true)) {
2663 return IC.replaceInstUsesWith(II, V);
2665 break;
2667 case Intrinsic::x86_sse2_pmadd_wd:
2668 case Intrinsic::x86_avx2_pmadd_wd:
2669 case Intrinsic::x86_avx512_pmaddw_d_512:
2670 if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) {
2671 return IC.replaceInstUsesWith(II, V);
2673 break;
2675 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
2676 case Intrinsic::x86_avx2_pmadd_ub_sw:
2677 case Intrinsic::x86_avx512_pmaddubs_w_512:
2678 if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) {
2679 return IC.replaceInstUsesWith(II, V);
2681 break;
2683 case Intrinsic::x86_pclmulqdq:
2684 case Intrinsic::x86_pclmulqdq_256:
2685 case Intrinsic::x86_pclmulqdq_512: {
2686 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2687 unsigned Imm = C->getZExtValue();
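// Bit 0 of the immediate selects the low (0) or high (1) i64 element of
// each 128-bit pair of Arg0; bit 4 does the same for Arg1. For
// illustration: Imm == 0x01 demands only the odd elements of Arg0 and the
// even elements of Arg1.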
2689 bool MadeChange = false;
2690 Value *Arg0 = II.getArgOperand(0);
2691 Value *Arg1 = II.getArgOperand(1);
2692 unsigned VWidth =
2693 cast<FixedVectorType>(Arg0->getType())->getNumElements();
2695 APInt UndefElts1(VWidth, 0);
2696 APInt DemandedElts1 =
2697 APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
2698 if (Value *V =
2699 IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
2700 IC.replaceOperand(II, 0, V);
2701 MadeChange = true;
2704 APInt UndefElts2(VWidth, 0);
2705 APInt DemandedElts2 =
2706 APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
2707 if (Value *V =
2708 IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
2709 IC.replaceOperand(II, 1, V);
2710 MadeChange = true;
2713 // If either input's demanded element is undef, the result is zero.
2714 if (DemandedElts1.isSubsetOf(UndefElts1) ||
2715 DemandedElts2.isSubsetOf(UndefElts2)) {
2716 return IC.replaceInstUsesWith(II,
2717 ConstantAggregateZero::get(II.getType()));
2720 if (MadeChange) {
2721 return &II;
2724 break;
2727 case Intrinsic::x86_sse41_insertps:
2728 if (Value *V = simplifyX86insertps(II, IC.Builder)) {
2729 return IC.replaceInstUsesWith(II, V);
2731 break;
2733 case Intrinsic::x86_sse4a_extrq: {
2734 Value *Op0 = II.getArgOperand(0);
2735 Value *Op1 = II.getArgOperand(1);
2736 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2737 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2738 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2739 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2740 VWidth1 == 16 && "Unexpected operand sizes");
2742 // See if we're dealing with constant values.
2743 auto *C1 = dyn_cast<Constant>(Op1);
2744 auto *CILength =
2745 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
2746 : nullptr;
2747 auto *CIIndex =
2748 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2749 : nullptr;
2751 // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
2752 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2753 return IC.replaceInstUsesWith(II, V);
2756 // EXTRQ only uses the lowest 64 bits of the first 128-bit vector
2757 // operand and the lowest 16 bits of the second.
2758 bool MadeChange = false;
2759 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2760 IC.replaceOperand(II, 0, V);
2761 MadeChange = true;
2763 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
2764 IC.replaceOperand(II, 1, V);
2765 MadeChange = true;
2767 if (MadeChange) {
2768 return &II;
2770 break;
2773 case Intrinsic::x86_sse4a_extrqi: {
2774 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
2775 // bits of the lower 64-bits. The upper 64-bits are undefined.
2776 Value *Op0 = II.getArgOperand(0);
2777 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2778 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2779 "Unexpected operand size");
2781 // See if we're dealing with constant values.
2782 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
2783 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
2785 // Attempt to simplify to a constant or shuffle vector.
2786 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2787 return IC.replaceInstUsesWith(II, V);
2790 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
2791 // operand.
2792 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2793 return IC.replaceOperand(II, 0, V);
2795 break;
2798 case Intrinsic::x86_sse4a_insertq: {
2799 Value *Op0 = II.getArgOperand(0);
2800 Value *Op1 = II.getArgOperand(1);
2801 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2802 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2803 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2804 cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
2805 "Unexpected operand size");
2807 // See if we're dealing with constant values.
2808 auto *C1 = dyn_cast<Constant>(Op1);
2809 auto *CI11 =
2810 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2811 : nullptr;
2813 // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
2814 if (CI11) {
2815 const APInt &V11 = CI11->getValue();
2816 APInt Len = V11.zextOrTrunc(6);
2817 APInt Idx = V11.lshr(8).zextOrTrunc(6);
2818 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2819 return IC.replaceInstUsesWith(II, V);
2823 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
2824 // operand.
2825 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2826 return IC.replaceOperand(II, 0, V);
2828 break;
2831 case Intrinsic::x86_sse4a_insertqi: {
2832 // INSERTQI: Extract lowest Length bits from lower half of second source and
2833 // insert over first source starting at Index bit. The upper 64-bits are
2834 // undefined.
2835 Value *Op0 = II.getArgOperand(0);
2836 Value *Op1 = II.getArgOperand(1);
2837 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2838 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2839 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2840 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2841 VWidth1 == 2 && "Unexpected operand sizes");
2843 // See if we're dealing with constant values.
2844 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
2845 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
2847 // Attempt to simplify to a constant or shuffle vector.
2848 if (CILength && CIIndex) {
2849 APInt Len = CILength->getValue().zextOrTrunc(6);
2850 APInt Idx = CIIndex->getValue().zextOrTrunc(6);
2851 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2852 return IC.replaceInstUsesWith(II, V);
2856 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
2857 // operands.
2858 bool MadeChange = false;
2859 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2860 IC.replaceOperand(II, 0, V);
2861 MadeChange = true;
2863 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
2864 IC.replaceOperand(II, 1, V);
2865 MadeChange = true;
2867 if (MadeChange) {
2868 return &II;
2870 break;
2873 case Intrinsic::x86_sse41_pblendvb:
2874 case Intrinsic::x86_sse41_blendvps:
2875 case Intrinsic::x86_sse41_blendvpd:
2876 case Intrinsic::x86_avx_blendv_ps_256:
2877 case Intrinsic::x86_avx_blendv_pd_256:
2878 case Intrinsic::x86_avx2_pblendvb: {
2879 // fold (blend A, A, Mask) -> A
2880 Value *Op0 = II.getArgOperand(0);
2881 Value *Op1 = II.getArgOperand(1);
2882 Value *Mask = II.getArgOperand(2);
2883 if (Op0 == Op1) {
2884 return IC.replaceInstUsesWith(II, Op0);
2887 // Zero Mask - select 1st argument.
2888 if (isa<ConstantAggregateZero>(Mask)) {
2889 return IC.replaceInstUsesWith(II, Op0);
2892 // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
2893 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2894 Constant *NewSelector =
2895 getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
2896 return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2899 Mask = InstCombiner::peekThroughBitcast(Mask);
2901 // Peek through a one-use shuffle - VectorCombine should have simplified
2902 // this for cases where we're splitting wider vectors to use blendv
2903 // intrinsics.
2904 Value *MaskSrc = nullptr;
2905 ArrayRef<int> ShuffleMask;
2906 if (match(Mask, m_OneUse(m_Shuffle(m_Value(MaskSrc), m_Undef(),
2907 m_Mask(ShuffleMask))))) {
2908 // Bail if the shuffle was irregular or contains undefs.
2909 int NumElts = cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2910 if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) ||
2911 any_of(ShuffleMask,
2912 [NumElts](int M) { return M < 0 || M >= NumElts; }))
2913 break;
2914 Mask = InstCombiner::peekThroughBitcast(MaskSrc);
2917 // Convert to a vector select if we can bypass casts and find a boolean
2918 // vector condition value.
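// For illustration: blendvps(x, y, bitcast(sext(<4 x i1> %c))) becomes
// "select <4 x i1> %c, <4 x float> y, <4 x float> x", since a set sign bit
// picks the second operand.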
2919 Value *BoolVec;
2920 if (match(Mask, m_SExt(m_Value(BoolVec))) &&
2921 BoolVec->getType()->isVectorTy() &&
2922 BoolVec->getType()->getScalarSizeInBits() == 1) {
2923 auto *MaskTy = cast<FixedVectorType>(Mask->getType());
2924 auto *OpTy = cast<FixedVectorType>(II.getType());
2925 unsigned NumMaskElts = MaskTy->getNumElements();
2926 unsigned NumOperandElts = OpTy->getNumElements();
2928 // If we peeked through a shuffle, reapply the shuffle to the bool vector.
2929 if (MaskSrc) {
2930 unsigned NumMaskSrcElts =
2931 cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2932 NumMaskElts = (ShuffleMask.size() * NumMaskElts) / NumMaskSrcElts;
2933 // Multiple mask bits map to the same operand element - bail out.
2934 if (NumMaskElts > NumOperandElts)
2935 break;
2936 SmallVector<int> ScaledMask;
2937 if (!llvm::scaleShuffleMaskElts(NumMaskElts, ShuffleMask, ScaledMask))
2938 break;
2939 BoolVec = IC.Builder.CreateShuffleVector(BoolVec, ScaledMask);
2940 MaskTy = FixedVectorType::get(MaskTy->getElementType(), NumMaskElts);
2942 assert(MaskTy->getPrimitiveSizeInBits() ==
2943 OpTy->getPrimitiveSizeInBits() &&
2944 "Not expecting mask and operands with different sizes");
2946 if (NumMaskElts == NumOperandElts) {
2947 return SelectInst::Create(BoolVec, Op1, Op0);
2950 // If the mask has fewer elements than the operands, each mask bit maps to
2951 // multiple elements of the operands. Bitcast back and forth.
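// For illustration: a <2 x i1> condition driving a <16 x i8> pblendvb is
// handled by bitcasting both operands to <2 x i64>, selecting, and
// bitcasting the result back to <16 x i8>.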
2952 if (NumMaskElts < NumOperandElts) {
2953 Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
2954 Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
2955 Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
2956 return new BitCastInst(Sel, II.getType());
2960 break;
2963 case Intrinsic::x86_ssse3_pshuf_b_128:
2964 case Intrinsic::x86_avx2_pshuf_b:
2965 case Intrinsic::x86_avx512_pshuf_b_512: {
2966 if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
2967 return IC.replaceInstUsesWith(II, V);
2970 KnownBits KnownMask(8);
2971 if (IC.SimplifyDemandedBits(&II, 1, APInt(8, 0b10001111), KnownMask))
2972 return &II;
2973 break;
2976 case Intrinsic::x86_avx_vpermilvar_ps:
2977 case Intrinsic::x86_avx_vpermilvar_ps_256:
2978 case Intrinsic::x86_avx512_vpermilvar_ps_512: {
2979 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2980 return IC.replaceInstUsesWith(II, V);
2983 KnownBits KnownMask(32);
2984 if (IC.SimplifyDemandedBits(&II, 1, APInt(32, 0b00011), KnownMask))
2985 return &II;
2986 break;
2989 case Intrinsic::x86_avx_vpermilvar_pd:
2990 case Intrinsic::x86_avx_vpermilvar_pd_256:
2991 case Intrinsic::x86_avx512_vpermilvar_pd_512: {
2992 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2993 return IC.replaceInstUsesWith(II, V);
2996 KnownBits KnownMask(64);
2997 if (IC.SimplifyDemandedBits(&II, 1, APInt(64, 0b00010), KnownMask))
2998 return &II;
2999 break;
3002 case Intrinsic::x86_avx2_permd:
3003 case Intrinsic::x86_avx2_permps:
3004 case Intrinsic::x86_avx512_permvar_df_256:
3005 case Intrinsic::x86_avx512_permvar_df_512:
3006 case Intrinsic::x86_avx512_permvar_di_256:
3007 case Intrinsic::x86_avx512_permvar_di_512:
3008 case Intrinsic::x86_avx512_permvar_hi_128:
3009 case Intrinsic::x86_avx512_permvar_hi_256:
3010 case Intrinsic::x86_avx512_permvar_hi_512:
3011 case Intrinsic::x86_avx512_permvar_qi_128:
3012 case Intrinsic::x86_avx512_permvar_qi_256:
3013 case Intrinsic::x86_avx512_permvar_qi_512:
3014 case Intrinsic::x86_avx512_permvar_sf_512:
3015 case Intrinsic::x86_avx512_permvar_si_512:
3016 if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
3017 return IC.replaceInstUsesWith(II, V);
3019 if (simplifyX86VPERMMask(&II, /*IsBinary=*/false, IC))
3020 return &II;
3021 break;
3023 case Intrinsic::x86_avx512_vpermi2var_d_128:
3024 case Intrinsic::x86_avx512_vpermi2var_d_256:
3025 case Intrinsic::x86_avx512_vpermi2var_d_512:
3026 case Intrinsic::x86_avx512_vpermi2var_hi_128:
3027 case Intrinsic::x86_avx512_vpermi2var_hi_256:
3028 case Intrinsic::x86_avx512_vpermi2var_hi_512:
3029 case Intrinsic::x86_avx512_vpermi2var_pd_128:
3030 case Intrinsic::x86_avx512_vpermi2var_pd_256:
3031 case Intrinsic::x86_avx512_vpermi2var_pd_512:
3032 case Intrinsic::x86_avx512_vpermi2var_ps_128:
3033 case Intrinsic::x86_avx512_vpermi2var_ps_256:
3034 case Intrinsic::x86_avx512_vpermi2var_ps_512:
3035 case Intrinsic::x86_avx512_vpermi2var_q_128:
3036 case Intrinsic::x86_avx512_vpermi2var_q_256:
3037 case Intrinsic::x86_avx512_vpermi2var_q_512:
3038 case Intrinsic::x86_avx512_vpermi2var_qi_128:
3039 case Intrinsic::x86_avx512_vpermi2var_qi_256:
3040 case Intrinsic::x86_avx512_vpermi2var_qi_512:
3041 if (Value *V = simplifyX86vpermv3(II, IC.Builder)) {
3042 return IC.replaceInstUsesWith(II, V);
3044 if (simplifyX86VPERMMask(&II, /*IsBinary=*/true, IC))
3045 return &II;
3046 break;
3048 case Intrinsic::x86_avx_maskload_ps:
3049 case Intrinsic::x86_avx_maskload_pd:
3050 case Intrinsic::x86_avx_maskload_ps_256:
3051 case Intrinsic::x86_avx_maskload_pd_256:
3052 case Intrinsic::x86_avx2_maskload_d:
3053 case Intrinsic::x86_avx2_maskload_q:
3054 case Intrinsic::x86_avx2_maskload_d_256:
3055 case Intrinsic::x86_avx2_maskload_q_256:
3056 if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
3057 return I;
3059 break;
3061 case Intrinsic::x86_sse2_maskmov_dqu:
3062 case Intrinsic::x86_avx_maskstore_ps:
3063 case Intrinsic::x86_avx_maskstore_pd:
3064 case Intrinsic::x86_avx_maskstore_ps_256:
3065 case Intrinsic::x86_avx_maskstore_pd_256:
3066 case Intrinsic::x86_avx2_maskstore_d:
3067 case Intrinsic::x86_avx2_maskstore_q:
3068 case Intrinsic::x86_avx2_maskstore_d_256:
3069 case Intrinsic::x86_avx2_maskstore_q_256:
3070 if (simplifyX86MaskedStore(II, IC)) {
3071 return nullptr;
3073 break;
3075 case Intrinsic::x86_addcarry_32:
3076 case Intrinsic::x86_addcarry_64:
3077 if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
3078 return IC.replaceInstUsesWith(II, V);
3080 break;
3082 case Intrinsic::x86_avx512_pternlog_d_128:
3083 case Intrinsic::x86_avx512_pternlog_d_256:
3084 case Intrinsic::x86_avx512_pternlog_d_512:
3085 case Intrinsic::x86_avx512_pternlog_q_128:
3086 case Intrinsic::x86_avx512_pternlog_q_256:
3087 case Intrinsic::x86_avx512_pternlog_q_512:
3088 if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
3089 return IC.replaceInstUsesWith(II, V);
3091 break;
3092 default:
3093 break;
3095 return std::nullopt;
3098 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
3099 InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
3100 bool &KnownBitsComputed) const {
3101 switch (II.getIntrinsicID()) {
3102 default:
3103 break;
3104 case Intrinsic::x86_mmx_pmovmskb:
3105 case Intrinsic::x86_sse_movmsk_ps:
3106 case Intrinsic::x86_sse2_movmsk_pd:
3107 case Intrinsic::x86_sse2_pmovmskb_128:
3108 case Intrinsic::x86_avx_movmsk_ps_256:
3109 case Intrinsic::x86_avx_movmsk_pd_256:
3110 case Intrinsic::x86_avx2_pmovmskb: {
3111 // MOVMSK copies the vector elements' sign bits to the low bits
3112 // and zeros the high bits.
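// For illustration: pmovmskb.128 produces a 16-bit mask in an i32, so bits
// [15:0] may be set and bits [31:16] are known zero; if none of the low 16
// bits are demanded the call folds to 0.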
3113 unsigned ArgWidth;
3114 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
3115 ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
3116 } else {
3117 auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
3118 ArgWidth = ArgType->getNumElements();
3121 // If we don't need any of the low bits then return zero;
3122 // we know that DemandedMask is non-zero already.
3123 APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
3124 Type *VTy = II.getType();
3125 if (DemandedElts.isZero()) {
3126 return ConstantInt::getNullValue(VTy);
3129 // We know that the upper bits are set to zero.
3130 Known.Zero.setBitsFrom(ArgWidth);
3131 KnownBitsComputed = true;
3132 break;
3135 return std::nullopt;
3138 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
3139 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
3140 APInt &UndefElts2, APInt &UndefElts3,
3141 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3142 simplifyAndSetOp) const {
3143 unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
3144 switch (II.getIntrinsicID()) {
3145 default:
3146 break;
3147 case Intrinsic::x86_xop_vfrcz_ss:
3148 case Intrinsic::x86_xop_vfrcz_sd:
3149 // The instructions for these intrinsics are specified to zero the upper
3150 // bits, not pass them through like other scalar intrinsics. So we
3151 // shouldn't just use Arg0 if DemandedElts[0] is clear, as we do for other
3152 // intrinsics. Instead we should return a zero vector.
3153 if (!DemandedElts[0]) {
3154 IC.addToWorklist(&II);
3155 return ConstantAggregateZero::get(II.getType());
3158 // Only the lower element is used.
3159 DemandedElts = 1;
3160 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3162 // Only the lower element is undefined. The high elements are zero.
3163 UndefElts = UndefElts[0];
3164 break;
3166 // Unary scalar-as-vector operations that work column-wise.
3167 case Intrinsic::x86_sse_rcp_ss:
3168 case Intrinsic::x86_sse_rsqrt_ss:
3169 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3171 // If lowest element of a scalar op isn't used then use Arg0.
3172 if (!DemandedElts[0]) {
3173 IC.addToWorklist(&II);
3174 return II.getArgOperand(0);
3176 // TODO: If only the low element is used, lower SQRT to FSQRT (with
3177 // rounding/exception checks).
3178 break;
3180 // Binary scalar-as-vector operations that work column-wise. The high
3181 // elements come from operand 0. The low element is a function of both
3182 // operands.
3183 case Intrinsic::x86_sse_min_ss:
3184 case Intrinsic::x86_sse_max_ss:
3185 case Intrinsic::x86_sse_cmp_ss:
3186 case Intrinsic::x86_sse2_min_sd:
3187 case Intrinsic::x86_sse2_max_sd:
3188 case Intrinsic::x86_sse2_cmp_sd: {
3189 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3191 // If lowest element of a scalar op isn't used then use Arg0.
3192 if (!DemandedElts[0]) {
3193 IC.addToWorklist(&II);
3194 return II.getArgOperand(0);
3197 // Only lower element is used for operand 1.
3198 DemandedElts = 1;
3199 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3201 // Lower element is undefined if both lower elements are undefined.
3202 // Consider things like undef&0. The result is known zero, not undef.
3203 if (!UndefElts2[0])
3204 UndefElts.clearBit(0);
3206 break;
3209 // Binary scalar-as-vector operations that work column-wise. The high
3210 // elements come from operand 0 and the low element comes from operand 1.
3211 case Intrinsic::x86_sse41_round_ss:
3212 case Intrinsic::x86_sse41_round_sd: {
3213 // Don't use the low element of operand 0.
3214 APInt DemandedElts2 = DemandedElts;
3215 DemandedElts2.clearBit(0);
3216 simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
3218 // If lowest element of a scalar op isn't used then use Arg0.
3219 if (!DemandedElts[0]) {
3220 IC.addToWorklist(&II);
3221 return II.getArgOperand(0);
3224 // Only lower element is used for operand 1.
3225 DemandedElts = 1;
3226 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3228 // Take the high undef elements from operand 0 and take the lower element
3229 // from operand 1.
3230 UndefElts.clearBit(0);
3231 UndefElts |= UndefElts2[0];
3232 break;
3235 // Three input scalar-as-vector operations that work column-wise. The high
3236 // elements come from operand 0 and the low element is a function of all
3237 // three inputs.
3238 case Intrinsic::x86_avx512_mask_add_ss_round:
3239 case Intrinsic::x86_avx512_mask_div_ss_round:
3240 case Intrinsic::x86_avx512_mask_mul_ss_round:
3241 case Intrinsic::x86_avx512_mask_sub_ss_round:
3242 case Intrinsic::x86_avx512_mask_max_ss_round:
3243 case Intrinsic::x86_avx512_mask_min_ss_round:
3244 case Intrinsic::x86_avx512_mask_add_sd_round:
3245 case Intrinsic::x86_avx512_mask_div_sd_round:
3246 case Intrinsic::x86_avx512_mask_mul_sd_round:
3247 case Intrinsic::x86_avx512_mask_sub_sd_round:
3248 case Intrinsic::x86_avx512_mask_max_sd_round:
3249 case Intrinsic::x86_avx512_mask_min_sd_round:
3250 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3252 // If lowest element of a scalar op isn't used then use Arg0.
3253 if (!DemandedElts[0]) {
3254 IC.addToWorklist(&II);
3255 return II.getArgOperand(0);
3258 // Only the lower element is used for operands 1 and 2.
3259 DemandedElts = 1;
3260 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3261 simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
3263 // Lower element is undefined if all three lower elements are undefined.
3264 // Consider things like undef&0. The result is known zero, not undef.
3265 if (!UndefElts2[0] || !UndefElts3[0])
3266 UndefElts.clearBit(0);
3267 break;
3269 // TODO: Add fmaddsub support?
3270 case Intrinsic::x86_sse3_addsub_pd:
3271 case Intrinsic::x86_sse3_addsub_ps:
3272 case Intrinsic::x86_avx_addsub_pd_256:
3273 case Intrinsic::x86_avx_addsub_ps_256: {
3274 // If none of the even or none of the odd lanes are required, turn this
3275 // into a generic FP math instruction.
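// For illustration: if only the even lanes of addsub.ps are demanded (the
// lanes that subtract), the intrinsic becomes a single fsub; odd lanes only
// becomes a single fadd.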
3276 APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
3277 APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
3278 bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
3279 bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
3280 if (IsSubOnly || IsAddOnly) {
3281 assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
3282 IRBuilderBase::InsertPointGuard Guard(IC.Builder);
3283 IC.Builder.SetInsertPoint(&II);
3284 Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
3285 return IC.Builder.CreateBinOp(
3286 IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
3289 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3290 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3291 UndefElts &= UndefElts2;
3292 break;
3295 // General per-element vector operations.
3296 case Intrinsic::x86_avx2_psllv_d:
3297 case Intrinsic::x86_avx2_psllv_d_256:
3298 case Intrinsic::x86_avx2_psllv_q:
3299 case Intrinsic::x86_avx2_psllv_q_256:
3300 case Intrinsic::x86_avx2_psrlv_d:
3301 case Intrinsic::x86_avx2_psrlv_d_256:
3302 case Intrinsic::x86_avx2_psrlv_q:
3303 case Intrinsic::x86_avx2_psrlv_q_256:
3304 case Intrinsic::x86_avx2_psrav_d:
3305 case Intrinsic::x86_avx2_psrav_d_256: {
3306 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3307 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3308 UndefElts &= UndefElts2;
3309 break;
3312 case Intrinsic::x86_sse2_pmulh_w:
3313 case Intrinsic::x86_avx2_pmulh_w:
3314 case Intrinsic::x86_avx512_pmulh_w_512:
3315 case Intrinsic::x86_sse2_pmulhu_w:
3316 case Intrinsic::x86_avx2_pmulhu_w:
3317 case Intrinsic::x86_avx512_pmulhu_w_512:
3318 case Intrinsic::x86_ssse3_pmul_hr_sw_128:
3319 case Intrinsic::x86_avx2_pmul_hr_sw:
3320 case Intrinsic::x86_avx512_pmul_hr_sw_512: {
3321 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3322 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3323 // NOTE: mulh(undef,undef) != undef.
3324 break;
3327 case Intrinsic::x86_sse2_packssdw_128:
3328 case Intrinsic::x86_sse2_packsswb_128:
3329 case Intrinsic::x86_sse2_packuswb_128:
3330 case Intrinsic::x86_sse41_packusdw:
3331 case Intrinsic::x86_avx2_packssdw:
3332 case Intrinsic::x86_avx2_packsswb:
3333 case Intrinsic::x86_avx2_packusdw:
3334 case Intrinsic::x86_avx2_packuswb:
3335 case Intrinsic::x86_avx512_packssdw_512:
3336 case Intrinsic::x86_avx512_packsswb_512:
3337 case Intrinsic::x86_avx512_packusdw_512:
3338 case Intrinsic::x86_avx512_packuswb_512: {
3339 auto *Ty0 = II.getArgOperand(0)->getType();
3340 unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
3341 assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
3343 unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
3344 unsigned VWidthPerLane = VWidth / NumLanes;
3345 unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
3347 // Per lane, pack the elements of the first input and then the second.
3348 // e.g.
3349 // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
3350 // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
3351 for (int OpNum = 0; OpNum != 2; ++OpNum) {
3352 APInt OpDemandedElts(InnerVWidth, 0);
3353 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3354 unsigned LaneIdx = Lane * VWidthPerLane;
3355 for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
3356 unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
3357 if (DemandedElts[Idx])
3358 OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
3362 // Demand elements from the operand.
3363 APInt OpUndefElts(InnerVWidth, 0);
3364 simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
3366 // Pack the operand's UNDEF elements, one lane at a time.
3367 OpUndefElts = OpUndefElts.zext(VWidth);
3368 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3369 APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
3370 LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
3371 LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
3372 UndefElts |= LaneElts;
3375 break;
3378 case Intrinsic::x86_sse2_pmadd_wd:
3379 case Intrinsic::x86_avx2_pmadd_wd:
3380 case Intrinsic::x86_avx512_pmaddw_d_512:
3381 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
3382 case Intrinsic::x86_avx2_pmadd_ub_sw:
3383 case Intrinsic::x86_avx512_pmaddubs_w_512: {
3384 // PMADD - demand both src elements that map to each dst element.
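// For illustration: for pmaddwd, result element i is
// a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1], so demanding result element 1
// demands source elements 2 and 3 of both operands.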
3385 auto *ArgTy = II.getArgOperand(0)->getType();
3386 unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements();
3387 assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
3388 APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth);
3389 APInt Op0UndefElts(InnerVWidth, 0);
3390 APInt Op1UndefElts(InnerVWidth, 0);
3391 simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts);
3392 simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts);
3393 // NOTE: madd(undef,undef) != undef.
3394 break;
3397 // PSHUFB
3398 case Intrinsic::x86_ssse3_pshuf_b_128:
3399 case Intrinsic::x86_avx2_pshuf_b:
3400 case Intrinsic::x86_avx512_pshuf_b_512:
3401 // PERMILVAR
3402 case Intrinsic::x86_avx_vpermilvar_ps:
3403 case Intrinsic::x86_avx_vpermilvar_ps_256:
3404 case Intrinsic::x86_avx512_vpermilvar_ps_512:
3405 case Intrinsic::x86_avx_vpermilvar_pd:
3406 case Intrinsic::x86_avx_vpermilvar_pd_256:
3407 case Intrinsic::x86_avx512_vpermilvar_pd_512:
3408 // PERMV
3409 case Intrinsic::x86_avx2_permd:
3410 case Intrinsic::x86_avx2_permps: {
3411 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
3412 break;
3415 // SSE4A instructions leave the upper 64-bits of the 128-bit result
3416 // in an undefined state.
3417 case Intrinsic::x86_sse4a_extrq:
3418 case Intrinsic::x86_sse4a_extrqi:
3419 case Intrinsic::x86_sse4a_insertq:
3420 case Intrinsic::x86_sse4a_insertqi:
3421 UndefElts.setHighBits(VWidth / 2);
3422 break;
3424 return std::nullopt;