//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "x86tti"
/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,
                                      Constant::getNullValue(IntTy), V, DL);
  assert(V && "Vector must be foldable");
  return V;
}
/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask, DL);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
  if (match(Mask, m_SExt(m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}
// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
    // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
    // the LLVM intrinsic definition for the pointer argument.
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    // The pass-through vector for an x86 masked load is a zero vector.
    CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
        II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
    return IC.replaceInstUsesWith(II, NewMaskedLoad);
  }

  return nullptr;
}
// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Zero Mask - this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

    // 'Replace uses' doesn't work for stores. Erase the original masked store.
    IC.eraseInstFromFunction(II);
    return true;
  }

  return false;
}
static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(Vec->getType());
  Type *SVT = VT->getElementType();
  Type *AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift. If its guaranteed to be out of range, logical shifts combine
  // to zero and arithmetic shifts are clamped to (BitWidth - 1).
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits =
        llvm::computeKnownBits(Amt, II.getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits = llvm::computeKnownBits(
        Amt, DemandedLower, II.getDataLayout());
    KnownBits KnownUpperBits = llvm::computeKnownBits(
        Amt, DemandedUpper, II.getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
  auto *CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
  // operand to compute the shift amount.
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isZero())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}
// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(II.getType());
  Type *SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
  KnownBits KnownAmt =
      llvm::computeKnownBits(Amt, II.getDataLayout());
  if (KnownAmt.getMaxValue().ult(BitWidth)) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}
static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getZero(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}
static Value *simplifyX86pmulh(IntrinsicInst &II,
                               InstCombiner::BuilderTy &Builder, bool IsSigned,
                               bool IsRounding) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  auto *ResTy = cast<FixedVectorType>(II.getType());
  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  assert(ArgTy == ResTy && ResTy->getScalarSizeInBits() == 16 &&
         "Unexpected PMULH types");
  assert((!IsRounding || IsSigned) && "PMULHRS instruction must be signed");

  // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
  if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  // Multiply by zero.
  if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  // Multiply by one.
  if (!IsRounding) {
    if (match(Arg0, m_One()))
      return IsSigned ? Builder.CreateAShr(Arg1, 15)
                      : ConstantAggregateZero::get(ResTy);
    if (match(Arg1, m_One()))
      return IsSigned ? Builder.CreateAShr(Arg0, 15)
                      : ConstantAggregateZero::get(ResTy);
  }

  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Extend to twice the width and multiply.
  auto Cast =
      IsSigned ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
  auto *ExtTy = FixedVectorType::getExtendedElementVectorType(ArgTy);
  Value *LHS = Builder.CreateCast(Cast, Arg0, ExtTy);
  Value *RHS = Builder.CreateCast(Cast, Arg1, ExtTy);
  Value *Mul = Builder.CreateMul(LHS, RHS);

  if (IsRounding) {
    // PMULHRSW: truncate to vXi18 of the most significant bits, add one and
    // extract bits[16:1].
    auto *RndEltTy = IntegerType::get(ExtTy->getContext(), 18);
    auto *RndTy = FixedVectorType::get(RndEltTy, ExtTy);
    Mul = Builder.CreateLShr(Mul, 14);
    Mul = Builder.CreateTrunc(Mul, RndTy);
    Mul = Builder.CreateAdd(Mul, ConstantInt::get(RndTy, 1));
    Mul = Builder.CreateLShr(Mul, 1);
  } else {
    // PMULH/PMULHU: extract the vXi16 most significant bits.
    Mul = Builder.CreateLShr(Mul, 16);
  }

  return Builder.CreateTrunc(Mul, ResTy);
}
static Value *simplifyX86pmadd(IntrinsicInst &II,
                               InstCombiner::BuilderTy &Builder,
                               bool IsPMADDWD) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  auto *ResTy = cast<FixedVectorType>(II.getType());
  [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType());

  unsigned NumDstElts = ResTy->getNumElements();
  assert(ArgTy->getNumElements() == (2 * NumDstElts) &&
         ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) &&
         "Unexpected PMADD types");

  // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
  if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  // Multiply by zero.
  if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Split Lo/Hi elements pairs, extend and add together.
  // PMADDWD(X,Y) =
  // add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1])))
  // PMADDUBSW(X,Y) =
  // sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1])))
  SmallVector<int> LoMask, HiMask;
  for (unsigned I = 0; I != NumDstElts; ++I) {
    LoMask.push_back(2 * I + 0);
    HiMask.push_back(2 * I + 1);
  }

  auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask);
  auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask);
  auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask);
  auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask);

  auto LHSCast =
      IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
  LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy);
  LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy);
  RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy);
  RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy);
  Value *Lo = Builder.CreateMul(LHSLo, RHSLo);
  Value *Hi = Builder.CreateMul(LHSHi, RHSHi);
  return IsPMADDWD
             ? Builder.CreateAdd(Lo, Hi)
             : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});
}
static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  // Preserve previous behavior and give up.
  // TODO: treat as <8 x i8>.
  if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb)
    return nullptr;

  auto *ArgTy = cast<FixedVectorType>(Arg->getType());

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  // %int = bitcast <16 x i1> %cmp to i16
  // %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
  Res = Builder.CreateIsNeg(Res);
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}
static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = PoisonValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}
static Value *simplifyTernarylogic(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {
  auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
  if (!ArgImm || ArgImm->getValue().uge(256))
    return nullptr;

  Value *ArgA = II.getArgOperand(0);
  Value *ArgB = II.getArgOperand(1);
  Value *ArgC = II.getArgOperand(2);

  Type *Ty = II.getType();

  auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
  };
  auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
  };
  auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
  };
  auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateNot(V.first), ~V.second};
  };
  auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
  auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
  auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };

  bool AIsConst = match(ArgA, m_ImmConstant());
  bool BIsConst = match(ArgB, m_ImmConstant());
  bool CIsConst = match(ArgC, m_ImmConstant());

  bool ABIsConst = AIsConst && BIsConst;
  bool ACIsConst = AIsConst && CIsConst;
  bool BCIsConst = BIsConst && CIsConst;
  bool ABCIsConst = AIsConst && BIsConst && CIsConst;

  // Use for verification. Its a big table. Its difficult to go from Imm ->
  // logic ops, but easy to verify that a set of logic ops is correct. We track
  // the logic ops through the second value in the pair. At the end it should
  // equal Imm.
  std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
  std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
  std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
  std::pair<Value *, uint8_t> Res = {nullptr, 0};

  // Currently we only handle cases that convert directly to another instruction
  // or cases where all the ops are constant. This is because we don't properly
  // handle creating ternary ops in the backend, so splitting them here may
  // cause regressions. As the backend improves, uncomment more cases.

  uint8_t Imm = ArgImm->getValue().getZExtValue();
  switch (Imm) {
    Res = {Constant::getNullValue(Ty), 0};
    Res = Nor(Or(A, B), C);
    Res = And(Nor(A, B), C);
    Res = And(Nor(A, C), B);
    Res = Nor(A, Xnor(B, C));
    Res = Nor(A, And(B, C));
    Res = Nor(A, Nand(B, C));
    Res = Nor(A, Xor(B, C));
    Res = Nor(A, Not(C));
    Res = Nor(A, Nor(C, Not(B)));
    Res = Nor(A, Not(B));
    Res = Nor(A, Nor(B, Not(C)));
    Res = Nor(A, Nor(B, C));
    Res = And(A, Nor(B, C));
    Res = Nor(Xnor(A, C), B);
    Res = Nor(And(A, C), B);
    Res = Nor(Xnor(A, B), C);
    Res = Nor(And(A, B), C);
    Res = Xor(Xor(A, B), And(Nand(A, B), C));
    Res = Xor(Or(A, B), Or(Xnor(A, B), C));
    Res = Nor(Xnor(A, B), Xnor(A, C));
    Res = And(Nand(A, B), Xnor(B, C));
    Res = Xor(A, Or(And(A, B), C));
    Res = Xor(A, Or(Xnor(A, B), C));
    Res = Xor(A, Or(And(A, C), B));
    Res = Xor(A, Or(Xnor(A, C), B));
    Res = Xor(A, Or(B, C));
    Res = Nand(A, Or(B, C));
    Res = Nor(Nand(A, C), B);
    Res = Nor(Xor(A, C), B);
    Res = Nor(B, Not(C));
    Res = Nor(B, Nor(C, Not(A)));
    Res = Nor(Xnor(A, B), Xor(A, C));
    Res = Xor(A, Nand(Nand(A, B), C));
    Res = And(Nand(A, B), Xor(B, C));
    Res = Xor(Or(Xnor(A, B), C), B);
    Res = And(Xor(A, B), C);
    Res = Xor(Xor(A, B), Nor(And(A, B), C));
    Res = And(Nand(A, B), C);
    Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
    Res = Nor(Xnor(A, B), Nor(B, C));
    Res = Xor(A, Or(B, Not(C)));
    Res = Xor(A, Or(Xor(A, C), B));
    Res = Nand(A, Or(B, Not(C)));
    Res = Nor(B, Not(A));
    Res = Nor(Nor(A, Not(C)), B);
    Res = Nor(Nor(A, C), B);
    Res = And(Xor(A, B), Nand(B, C));
    Res = Xor(B, Or(A, Xnor(B, C)));
    Res = Xor(Or(A, C), B);
    Res = Nand(Or(A, C), B);
    Res = Nor(Xnor(A, B), Nor(A, C));
    Res = Xor(Or(A, Not(C)), B);
    Res = Xor(B, Or(A, Xor(B, C)));
    Res = Nand(Or(A, Not(C)), B);
    Res = Xor(A, Or(Nor(A, C), B));
    Res = Xor(A, Or(Nor(A, Not(C)), B));
    Res = Nor(Nand(A, B), C);
    Res = Nor(Xor(A, B), C);
    Res = Nor(Xor(A, B), Xnor(A, C));
    Res = Xor(A, Nand(Nand(A, C), B));
    Res = Nor(C, Not(B));
    Res = Nor(Nor(B, Not(A)), C);
    Res = Xor(Or(And(A, C), B), C);
    Res = Xor(Or(Xnor(A, C), B), C);
    Res = And(Xor(A, C), B);
    Res = Xor(Or(Xnor(A, B), And(A, C)), C);
    Res = Nor(Xnor(A, C), Nor(B, C));
    Res = Xor(A, Or(C, Not(B)));
    Res = And(Nand(A, C), B);
    Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
    Res = Xor(A, Or(Xor(A, B), C));
    Res = Nand(A, Nand(B, Not(C)));
    Res = Nor(C, Not(A));
    Res = Nor(Nor(A, Not(B)), C);
    Res = And(Xor(A, C), Nand(B, C));
    Res = Xor(Or(Xnor(B, C), A), C);
    Res = Nor(Nor(A, B), C);
    Res = Xor(Or(A, B), C);
    Res = Nand(Or(A, B), C);
    Res = Nor(Nor(A, B), Xnor(A, C));
    Res = Xor(Or(A, Not(B)), C);
    Res = Xor(A, Or(Nor(A, B), C));
    Res = Xor(Or(Xor(B, C), A), C);
    Res = Nand(Or(A, Not(B)), C);
    Res = Xor(A, Or(Nor(A, Not(B)), C));
    Res = And(A, Xor(B, C));
    Res = Xor(Or(Xnor(A, B), And(B, C)), C);
    Res = Nor(Nor(A, C), Xnor(B, C));
    Res = Xor(B, Or(C, Not(A)));
    Res = Nor(Nor(A, B), Xnor(B, C));
    Res = Xor(Or(B, Not(A)), C);
    Res = Or(Nor(A, B), Xor(B, C));
    Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
    Res = Xor(Xnor(A, B), C);
    Res = Xor(And(A, B), C);
    Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
    Res = Xor(And(A, C), B);
    Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
    Res = Or(Nor(A, Not(B)), Xor(B, C));
    Res = Nand(A, Xnor(B, C));
    Res = And(A, Nand(B, C));
    Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
    Res = Xor(Or(Xor(A, B), C), B);
    Res = Nand(Nand(A, Not(C)), B);
    Res = Xor(Or(Xor(A, C), B), C);
    Res = Nand(Nand(A, Not(B)), C);
    Res = Xor(B, Or(Nor(B, Not(A)), C));
    Res = Xor(A, And(B, C));
    Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
    Res = Or(Xor(A, C), Nor(B, Not(A)));
    Res = Nand(Xnor(A, C), B);
    Res = Or(Xor(A, B), Nor(C, Not(A)));
    Res = Nand(Xnor(A, B), C);
    Res = Or(Xor(A, B), Xor(A, C));
    Res = Nand(And(A, B), C);
    Res = And(And(A, B), C);
    Res = Nor(Xor(A, B), Xor(A, C));
    Res = And(Xnor(A, B), C);
    Res = Nor(Xor(A, B), Nor(C, Not(A)));
    Res = And(Xnor(A, C), B);
    Res = Nor(Xor(A, C), Nor(B, Not(A)));
    Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
    Res = Xor(A, Nand(B, C));
    Res = Xor(B, Nor(Nor(B, Not(A)), C));
    Res = And(Nand(A, Not(B)), C);
    Res = Xor(Nor(Xor(A, C), B), C);
    Res = And(Nand(A, Not(C)), B);
    Res = Xor(Nor(Xor(A, B), C), B);
    Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
    Res = Nand(A, Nand(B, C));
    Res = And(A, Xnor(B, C));
    Res = Nor(Nor(A, Not(B)), Xor(B, C));
    Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
    Res = Xor(Nand(A, C), B);
    Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
    Res = Xor(Nand(A, B), C);
    Res = Xor(Xor(A, B), C);
    Res = Xor(Xor(A, B), Or(Nor(A, B), C));
    Res = Nor(Nor(A, B), Xor(B, C));
    Res = Xor(Nor(B, Not(A)), C);
    Res = Or(Nor(A, B), Xnor(B, C));
    Res = Xor(B, Nor(C, Not(A)));
    Res = Or(Nor(A, C), Xnor(B, C));
    Res = Xor(And(Xor(A, B), Nand(B, C)), C);
    Res = Nand(A, Xor(B, C));
    Res = Xor(A, Nor(Nor(A, Not(B)), C));
    Res = And(Or(A, Not(B)), C);
    Res = Xor(Nor(Xor(B, C), A), C);
    Res = Xor(A, Nor(Nor(A, B), C));
    Res = Xor(Nor(A, Not(B)), C);
    Res = Or(Nor(A, B), Xnor(A, C));
    Res = And(Or(A, B), C);
    Res = Xor(Nor(A, B), C);
    Res = Or(Nor(A, B), C);
    Res = Xor(Nor(Xnor(B, C), A), C);
    Res = Or(Xnor(A, C), And(B, C));
    Res = Or(Nor(A, Not(B)), C);
    Res = Or(C, Not(A));
    Res = And(A, Nand(B, Not(C)));
    Res = Xor(A, Nor(Xor(A, B), C));
    Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
    Res = Nand(Nand(A, C), B);
    Res = Xor(A, Nor(C, Not(B)));
    Res = Or(Xnor(A, C), Nor(B, C));
    Res = Xor(And(Xor(A, B), Nand(A, C)), C);
    Res = Nand(Xor(A, C), B);
    Res = Xor(Nor(Xnor(A, C), B), C);
    Res = Xor(Nor(And(A, C), B), C);
    Res = Or(Nor(B, Not(A)), C);
    Res = Or(C, Not(B));
    Res = Xor(A, And(Nand(A, C), B));
    Res = Or(Xor(A, B), Xnor(A, C));
    Res = Or(Xor(A, B), C);
    Res = Or(Nand(A, B), C);
    Res = Xor(A, Nor(Nor(A, Not(C)), B));
    Res = Xor(A, Nor(Nor(A, C), B));
    Res = And(Or(A, Not(C)), B);
    Res = Xor(B, Nor(A, Xor(B, C)));
    Res = Xor(Nor(A, Not(C)), B);
    Res = Or(Xnor(A, B), Nor(A, C));
    Res = And(Or(A, C), B);
    Res = Xor(Nor(A, C), B);
    Res = Xor(B, Nor(A, Xnor(B, C)));
    Res = Or(Xnor(A, B), And(B, C));
    Res = Or(Nor(A, C), B);
    Res = Or(Nor(A, Not(C)), B);
    Res = Or(B, Not(A));
    Res = And(A, Or(B, Not(C)));
    Res = Xor(A, Nor(Xor(A, C), B));
    Res = Xor(A, Nor(B, Not(C)));
    Res = Or(Xnor(A, B), Nor(B, C));
    Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
    Res = Nand(Nand(A, B), C);
    Res = Xor(Xor(A, B), Or(And(A, B), C));
    Res = Nand(Xor(A, B), C);
    Res = Xor(Nor(Xnor(A, B), C), B);
    Res = Or(And(A, B), Xnor(B, C));
    Res = Xor(A, And(Nand(A, B), C));
    Res = Or(Xnor(A, B), Xor(A, C));
    Res = Or(B, Nor(C, Not(A)));
    Res = Or(B, Not(C));
    Res = Or(Xor(A, C), B);
    Res = Or(Nand(A, C), B);
    Res = And(A, Or(B, C));
    Res = Xor(A, Nor(B, C));
    Res = Xor(A, Nor(Xnor(A, C), B));
    Res = Xor(A, Nor(And(A, C), B));
    Res = Xor(A, Nor(Xnor(A, B), C));
    Res = Xor(A, Nor(And(A, B), C));
    Res = Or(And(A, B), Xor(B, C));
    Res = Or(Xnor(A, B), Xnor(A, C));
    Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
    Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
    Res = Or(And(A, B), C);
    Res = Or(Xnor(A, B), C);
    Res = Or(And(A, C), B);
    Res = Or(Xnor(A, C), B);
    Res = Nand(A, Nor(B, C));
    Res = Or(A, Nor(B, C));
    Res = Or(A, Nor(B, Not(C)));
    Res = Or(A, Not(B));
    Res = Or(A, Nor(C, Not(B)));
    Res = Or(A, Not(C));
    Res = Or(A, Xor(B, C));
    Res = Or(A, Nand(B, C));
    Res = Or(A, And(B, C));
    Res = Or(A, Xnor(B, C));
    Res = Nand(Nor(A, C), B);
    Res = Nand(Nor(A, B), C);
    Res = Or(Or(A, B), C);
    Res = {Constant::getAllOnesValue(Ty), 0xff};
  }
  assert((Res.first == nullptr || Res.second == Imm) &&
         "Simplification of ternary logic does not verify!");
  return Res.first;
}
static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //    [3:0] - zero mask for each 32-bit lane
  //    [5:4] - select one 32-bit destination lane
  //    [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  int ShuffleMask[4] = {0, 1, 2, 3};

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}
/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are inserting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      auto *ShufTy = FixedVectorType::get(IntTy8, 16);

      SmallVector<int, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ShuffleMask);
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // remaining bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_extrqi, {}, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}
/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    auto *ShufTy = FixedVectorType::get(IntTy8, 16);

    SmallVector<int, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(i);
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(i + 16);
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(i);
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(-1);

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ShuffleMask);
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *C1 = dyn_cast<Constant>(Op1);
  auto *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  auto *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_insertqi, {}, Args);
  }

  return nullptr;
}
/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shuffle.

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
}
/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants uses bit 1 to select per-lane element index, so
    // shift down to convert to generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128 half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = Index.getZExtValue();
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
}
/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
}
/// Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant.
static Value *simplifyX86vpermv3(const IntrinsicInst &II,
                                 InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 2 || Size == 4 || Size == 8 || Size == 16 || Size == 32 ||
          Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= (2 * Size) - 1;
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = II.getArgOperand(2);
  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size));
}
// Simplify VPERMV/VPERMV3 mask - only demand the active index bits.
static bool simplifyX86VPERMMask(Instruction *II, bool IsBinary,
                                 InstCombiner &IC) {
  auto *VecTy = cast<FixedVectorType>(II->getType());
  unsigned EltSizeInBits = VecTy->getScalarSizeInBits();
  unsigned NumElts = VecTy->getNumElements();
  assert(isPowerOf2_32(NumElts) && isPowerOf2_32(EltSizeInBits) &&
         "Unexpected shuffle mask size");

  unsigned IdxSizeInBits = Log2_32(IsBinary ? (2 * NumElts) : NumElts);
  APInt DemandedMask = APInt::getLowBitsSet(EltSizeInBits, IdxSizeInBits);

  KnownBits KnownMask(EltSizeInBits);
  return IC.SimplifyDemandedBits(II, /*OpNo=*/1, DemandedMask, KnownMask);
}
2159 std::optional
<Instruction
*>
2160 X86TTIImpl::instCombineIntrinsic(InstCombiner
&IC
, IntrinsicInst
&II
) const {
2161 auto SimplifyDemandedVectorEltsLow
= [&IC
](Value
*Op
, unsigned Width
,
2162 unsigned DemandedWidth
) {
2163 APInt
UndefElts(Width
, 0);
2164 APInt DemandedElts
= APInt::getLowBitsSet(Width
, DemandedWidth
);
2165 return IC
.SimplifyDemandedVectorElts(Op
, DemandedElts
, UndefElts
);
2168 Intrinsic::ID IID
= II
.getIntrinsicID();
2170 case Intrinsic::x86_bmi_bextr_32
:
2171 case Intrinsic::x86_bmi_bextr_64
:
2172 case Intrinsic::x86_tbm_bextri_u32
:
2173 case Intrinsic::x86_tbm_bextri_u64
:
2174 // If the RHS is a constant we can try some simplifications.
2175 if (auto *C
= dyn_cast
<ConstantInt
>(II
.getArgOperand(1))) {
2176 uint64_t Shift
= C
->getZExtValue();
2177 uint64_t Length
= (Shift
>> 8) & 0xff;
2179 unsigned BitWidth
= II
.getType()->getIntegerBitWidth();
2180 // If the length is 0 or the shift is out of range, replace with zero.
2181 if (Length
== 0 || Shift
>= BitWidth
) {
2182 return IC
.replaceInstUsesWith(II
, ConstantInt::get(II
.getType(), 0));
2184 // If the LHS is also a constant, we can completely constant fold this.
2185 if (auto *InC
= dyn_cast
<ConstantInt
>(II
.getArgOperand(0))) {
2186 uint64_t Result
= InC
->getZExtValue() >> Shift
;
2187 if (Length
> BitWidth
)
2189 Result
&= maskTrailingOnes
<uint64_t>(Length
);
2190 return IC
.replaceInstUsesWith(II
,
2191 ConstantInt::get(II
.getType(), Result
));
2193 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2194 // are only masking bits that a shift already cleared?
2198 case Intrinsic::x86_bmi_bzhi_32
:
2199 case Intrinsic::x86_bmi_bzhi_64
:
2200 // If the RHS is a constant we can try some simplifications.
2201 if (auto *C
= dyn_cast
<ConstantInt
>(II
.getArgOperand(1))) {
2202 uint64_t Index
= C
->getZExtValue() & 0xff;
2203 unsigned BitWidth
= II
.getType()->getIntegerBitWidth();
2204 if (Index
>= BitWidth
) {
2205 return IC
.replaceInstUsesWith(II
, II
.getArgOperand(0));
2208 return IC
.replaceInstUsesWith(II
, ConstantInt::get(II
.getType(), 0));
2210 // If the LHS is also a constant, we can completely constant fold this.
2211 if (auto *InC
= dyn_cast
<ConstantInt
>(II
.getArgOperand(0))) {
2212 uint64_t Result
= InC
->getZExtValue();
2213 Result
&= maskTrailingOnes
<uint64_t>(Index
);
2214 return IC
.replaceInstUsesWith(II
,
2215 ConstantInt::get(II
.getType(), Result
));
2217 // TODO should we convert this to an AND if the RHS is constant?
2220 case Intrinsic::x86_bmi_pext_32
:
2221 case Intrinsic::x86_bmi_pext_64
:
2222 if (auto *MaskC
= dyn_cast
<ConstantInt
>(II
.getArgOperand(1))) {
2223 if (MaskC
->isNullValue()) {
2224 return IC
.replaceInstUsesWith(II
, ConstantInt::get(II
.getType(), 0));
2226 if (MaskC
->isAllOnesValue()) {
2227 return IC
.replaceInstUsesWith(II
, II
.getArgOperand(0));
2230 unsigned MaskIdx
, MaskLen
;
2231 if (MaskC
->getValue().isShiftedMask(MaskIdx
, MaskLen
)) {
2232 // any single contingous sequence of 1s anywhere in the mask simply
2233 // describes a subset of the input bits shifted to the appropriate
2234 // position. Replace with the straight forward IR.
2235 Value
*Input
= II
.getArgOperand(0);
2236 Value
*Masked
= IC
.Builder
.CreateAnd(Input
, II
.getArgOperand(1));
2237 Value
*ShiftAmt
= ConstantInt::get(II
.getType(), MaskIdx
);
2238 Value
*Shifted
= IC
.Builder
.CreateLShr(Masked
, ShiftAmt
);
2239 return IC
.replaceInstUsesWith(II
, Shifted
);
2242 if (auto *SrcC
= dyn_cast
<ConstantInt
>(II
.getArgOperand(0))) {
2243 uint64_t Src
= SrcC
->getZExtValue();
2244 uint64_t Mask
= MaskC
->getZExtValue();
2245 uint64_t Result
= 0;
2246 uint64_t BitToSet
= 1;
2249 // Isolate lowest set bit.
2250 uint64_t BitToTest
= Mask
& -Mask
;
2251 if (BitToTest
& Src
)
2255 // Clear lowest set bit.
2259 return IC
.replaceInstUsesWith(II
,
2260 ConstantInt::get(II
.getType(), Result
));
2264 case Intrinsic::x86_bmi_pdep_32
:
2265 case Intrinsic::x86_bmi_pdep_64
:
2266 if (auto *MaskC
= dyn_cast
<ConstantInt
>(II
.getArgOperand(1))) {
2267 if (MaskC
->isNullValue()) {
2268 return IC
.replaceInstUsesWith(II
, ConstantInt::get(II
.getType(), 0));
2270 if (MaskC
->isAllOnesValue()) {
2271 return IC
.replaceInstUsesWith(II
, II
.getArgOperand(0));
2274 unsigned MaskIdx
, MaskLen
;
2275 if (MaskC
->getValue().isShiftedMask(MaskIdx
, MaskLen
)) {
2276 // any single contingous sequence of 1s anywhere in the mask simply
2277 // describes a subset of the input bits shifted to the appropriate
2278 // position. Replace with the straight forward IR.
2279 Value
*Input
= II
.getArgOperand(0);
2280 Value
*ShiftAmt
= ConstantInt::get(II
.getType(), MaskIdx
);
2281 Value
*Shifted
= IC
.Builder
.CreateShl(Input
, ShiftAmt
);
2282 Value
*Masked
= IC
.Builder
.CreateAnd(Shifted
, II
.getArgOperand(1));
2283 return IC
.replaceInstUsesWith(II
, Masked
);
2286 if (auto *SrcC
= dyn_cast
<ConstantInt
>(II
.getArgOperand(0))) {
2287 uint64_t Src
= SrcC
->getZExtValue();
2288 uint64_t Mask
= MaskC
->getZExtValue();
2289 uint64_t Result
= 0;
2290 uint64_t BitToTest
= 1;
2293 // Isolate lowest set bit.
2294 uint64_t BitToSet
= Mask
& -Mask
;
2295 if (BitToTest
& Src
)
2299 // Clear lowest set bit;
2303 return IC
.replaceInstUsesWith(II
,
2304 ConstantInt::get(II
.getType(), Result
));

  case Intrinsic::x86_sse_cvtss2si:
  case Intrinsic::x86_sse_cvtss2si64:
  case Intrinsic::x86_sse_cvttss2si:
  case Intrinsic::x86_sse_cvttss2si64:
  case Intrinsic::x86_sse2_cvtsd2si:
  case Intrinsic::x86_sse2_cvtsd2si64:
  case Intrinsic::x86_sse2_cvttsd2si:
  case Intrinsic::x86_sse2_cvttsd2si64:
  case Intrinsic::x86_avx512_vcvtss2si32:
  case Intrinsic::x86_avx512_vcvtss2si64:
  case Intrinsic::x86_avx512_vcvtss2usi32:
  case Intrinsic::x86_avx512_vcvtss2usi64:
  case Intrinsic::x86_avx512_vcvtsd2si32:
  case Intrinsic::x86_avx512_vcvtsd2si64:
  case Intrinsic::x86_avx512_vcvtsd2usi32:
  case Intrinsic::x86_avx512_vcvtsd2usi64:
  case Intrinsic::x86_avx512_cvttss2si:
  case Intrinsic::x86_avx512_cvttss2si64:
  case Intrinsic::x86_avx512_cvttss2usi:
  case Intrinsic::x86_avx512_cvttss2usi64:
  case Intrinsic::x86_avx512_cvttsd2si:
  case Intrinsic::x86_avx512_cvttsd2si64:
  case Intrinsic::x86_avx512_cvttsd2usi:
  case Intrinsic::x86_avx512_cvttsd2usi64: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    Value *Arg = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }
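
  // The scalar conversions above read only lane 0, so demanded-elements
  // simplification can, for example, strip an insertelement that only writes
  // one of the upper lanes of the source vector.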

  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx2_pmovmskb:
    if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomineq_sd:
  case Intrinsic::x86_avx512_vcomi_ss:
  case Intrinsic::x86_avx512_vcomi_sd:
  case Intrinsic::x86_avx512_mask_cmp_ss:
  case Intrinsic::x86_avx512_mask_cmp_sd: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    bool MadeChange = false;
    Value *Arg0 = II.getArgOperand(0);
    Value *Arg1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_avx512_add_ps_512:
  case Intrinsic::x86_avx512_div_ps_512:
  case Intrinsic::x86_avx512_mul_ps_512:
  case Intrinsic::x86_avx512_sub_ps_512:
  case Intrinsic::x86_avx512_add_pd_512:
  case Intrinsic::x86_avx512_div_pd_512:
  case Intrinsic::x86_avx512_mul_pd_512:
  case Intrinsic::x86_avx512_sub_pd_512:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      if (R->getValue() == 4) {
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);

        Value *V;
        switch (II.getIntrinsicID()) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_add_ps_512:
        case Intrinsic::x86_avx512_add_pd_512:
          V = IC.Builder.CreateFAdd(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_sub_ps_512:
        case Intrinsic::x86_avx512_sub_pd_512:
          V = IC.Builder.CreateFSub(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_mul_ps_512:
        case Intrinsic::x86_avx512_mul_pd_512:
          V = IC.Builder.CreateFMul(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_div_ps_512:
        case Intrinsic::x86_avx512_div_pd_512:
          V = IC.Builder.CreateFDiv(Arg0, Arg1);
          break;
        }

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;
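
  // Rounding mode 4 is _MM_FROUND_CUR_DIRECTION, so e.g.
  //   @llvm.x86.avx512.add.ps.512(%a, %b, i32 4)
  // has ordinary IEEE semantics in the current rounding mode and is emitted as
  // a plain fadd above.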

  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
      if (R->getValue() == 4) {
        // Extract the elements as scalars.
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);
        Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
        Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);

        Value *V;
        switch (II.getIntrinsicID()) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_mask_add_ss_round:
        case Intrinsic::x86_avx512_mask_add_sd_round:
          V = IC.Builder.CreateFAdd(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_sub_ss_round:
        case Intrinsic::x86_avx512_mask_sub_sd_round:
          V = IC.Builder.CreateFSub(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_mul_ss_round:
        case Intrinsic::x86_avx512_mask_mul_sd_round:
          V = IC.Builder.CreateFMul(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_div_ss_round:
        case Intrinsic::x86_avx512_mask_div_sd_round:
          V = IC.Builder.CreateFDiv(LHS, RHS);
          break;
        }

        // Handle the masking aspect of the intrinsic.
        Value *Mask = II.getArgOperand(3);
        auto *C = dyn_cast<ConstantInt>(Mask);
        // We don't need a select if we know the mask bit is a 1.
        if (!C || !C->getValue()[0]) {
          // Cast the mask to an i1 vector and then extract the lowest element.
          auto *MaskTy = FixedVectorType::get(
              IC.Builder.getInt1Ty(),
              cast<IntegerType>(Mask->getType())->getBitWidth());
          Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
          Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
          // Extract the lowest element from the passthru operand.
          Value *Passthru =
              IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
          V = IC.Builder.CreateSelect(Mask, V, Passthru);
        }

        // Insert the result back into the original argument 0.
        V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;
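
  // For the masked scalar ops above, when mask bit 0 is not known to be set
  // the scalar result is selected against element 0 of the passthru operand
  // (operand 2), e.g. select i1 %mask0, double %sub, double %passthru0,
  // before being reinserted into lane 0 of Arg0.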

  // Constant fold ashr( <A x Bi>, Ci ).
  // Constant fold lshr( <A x Bi>, Ci ).
  // Constant fold shl( <A x Bi>, Ci ).
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512: {
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
    // operand to compute the shift amount.
    Value *Arg1 = II.getArgOperand(1);
    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
           "Unexpected packed shift size");
    unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();

    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
      return IC.replaceOperand(II, 1, V);
    }
    break;
  }

  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    if (Value *V = simplifyX86varShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_pmulh_w:
  case Intrinsic::x86_avx2_pmulh_w:
  case Intrinsic::x86_avx512_pmulh_w_512:
    if (Value *V = simplifyX86pmulh(II, IC.Builder, true, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_pmulhu_w:
  case Intrinsic::x86_avx2_pmulhu_w:
  case Intrinsic::x86_avx512_pmulhu_w_512:
    if (Value *V = simplifyX86pmulh(II, IC.Builder, false, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_ssse3_pmul_hr_sw_128:
  case Intrinsic::x86_avx2_pmul_hr_sw:
  case Intrinsic::x86_avx512_pmul_hr_sw_512:
    if (Value *V = simplifyX86pmulh(II, IC.Builder, true, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_pmadd_wd:
  case Intrinsic::x86_avx2_pmadd_wd:
  case Intrinsic::x86_avx512_pmaddw_d_512:
    if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
  case Intrinsic::x86_avx2_pmadd_ub_sw:
  case Intrinsic::x86_avx512_pmaddubs_w_512:
    if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_pclmulqdq:
  case Intrinsic::x86_pclmulqdq_256:
  case Intrinsic::x86_pclmulqdq_512: {
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      unsigned Imm = C->getZExtValue();

      bool MadeChange = false;
      Value *Arg0 = II.getArgOperand(0);
      Value *Arg1 = II.getArgOperand(1);
      unsigned VWidth =
          cast<FixedVectorType>(Arg0->getType())->getNumElements();

      APInt UndefElts1(VWidth, 0);
      APInt DemandedElts1 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
        IC.replaceOperand(II, 0, V);
        MadeChange = true;
      }

      APInt UndefElts2(VWidth, 0);
      APInt DemandedElts2 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
        IC.replaceOperand(II, 1, V);
        MadeChange = true;
      }

      // If either input's demanded elements are all undef, the result is zero.
      if (DemandedElts1.isSubsetOf(UndefElts1) ||
          DemandedElts2.isSubsetOf(UndefElts2)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantAggregateZero::get(II.getType()));
      }

      if (MadeChange) {
        return &II;
      }
    }
    break;
  }
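
  // The PCLMULQDQ immediate selects the quadwords to multiply: bit 0 picks
  // element 0 or 1 of Arg0 and bit 4 picks element 0 or 1 of Arg1 (per 128-bit
  // lane), which is why only those elements are demanded above.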

  case Intrinsic::x86_sse41_insertps:
    if (Value *V = simplifyX86insertps(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse4a_extrq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 16 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CILength =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
           : nullptr;
    auto *CIIndex =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
    // operand and the lowest 16-bits of the second.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse4a_extrqi: {
    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
    // bits of the lower 64-bits. The upper 64-bits are undefined.
    Value *Op0 = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));

    // Attempt to simplify to a constant or shuffle vector.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CI11 =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
    if (CI11) {
      const APInt &V11 = CI11->getValue();
      APInt Len = V11.zextOrTrunc(6);
      APInt Idx = V11.lshr(8).zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertqi: {
    // INSERTQI: Extract lowest Length bits from lower half of second source
    // and insert over first source starting at Index bit. The upper 64-bits
    // are undefined.
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 2 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));

    // Attempt to simplify to a constant or shuffle vector.
    if (CILength && CIIndex) {
      APInt Len = CILength->getValue().zextOrTrunc(6);
      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
    // operands.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse41_pblendvb:
  case Intrinsic::x86_sse41_blendvps:
  case Intrinsic::x86_sse41_blendvpd:
  case Intrinsic::x86_avx_blendv_ps_256:
  case Intrinsic::x86_avx_blendv_pd_256:
  case Intrinsic::x86_avx2_pblendvb: {
    // fold (blend A, A, Mask) -> A
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Mask = II.getArgOperand(2);
    if (Op0 == Op1) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Zero Mask - select 1st argument.
    if (isa<ConstantAggregateZero>(Mask)) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
      Constant *NewSelector =
          getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
    }

    Mask = InstCombiner::peekThroughBitcast(Mask);

    // Peek through a one-use shuffle - VectorCombine should have simplified
    // this for cases where we're splitting wider vectors to use blendv
    // intrinsics.
    Value *MaskSrc = nullptr;
    ArrayRef<int> ShuffleMask;
    if (match(Mask, m_OneUse(m_Shuffle(m_Value(MaskSrc), m_Undef(),
                                       m_Mask(ShuffleMask))))) {
      // Bail if the shuffle was irregular or contains undefs.
      int NumElts = cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
      if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) ||
          any_of(ShuffleMask,
                 [NumElts](int M) { return M < 0 || M >= NumElts; }))
        break;
      Mask = InstCombiner::peekThroughBitcast(MaskSrc);
    }

    // Convert to a vector select if we can bypass casts and find a boolean
    // vector condition value.
    Value *BoolVec;
    if (match(Mask, m_SExt(m_Value(BoolVec))) &&
        BoolVec->getType()->isVectorTy() &&
        BoolVec->getType()->getScalarSizeInBits() == 1) {
      auto *MaskTy = cast<FixedVectorType>(Mask->getType());
      auto *OpTy = cast<FixedVectorType>(II.getType());
      unsigned NumMaskElts = MaskTy->getNumElements();
      unsigned NumOperandElts = OpTy->getNumElements();

      // If we peeked through a shuffle, reapply the shuffle to the bool vector.
      if (MaskSrc) {
        unsigned NumMaskSrcElts =
            cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
        NumMaskElts = (ShuffleMask.size() * NumMaskElts) / NumMaskSrcElts;
        // Multiple mask bits map to the same operand element - bail out.
        if (NumMaskElts > NumOperandElts)
          break;
        SmallVector<int> ScaledMask;
        if (!llvm::scaleShuffleMaskElts(NumMaskElts, ShuffleMask, ScaledMask))
          break;
        BoolVec = IC.Builder.CreateShuffleVector(BoolVec, ScaledMask);
        MaskTy = FixedVectorType::get(MaskTy->getElementType(), NumMaskElts);
      }

      assert(MaskTy->getPrimitiveSizeInBits() ==
                 OpTy->getPrimitiveSizeInBits() &&
             "Not expecting mask and operands with different sizes");

      if (NumMaskElts == NumOperandElts) {
        return SelectInst::Create(BoolVec, Op1, Op0);
      }

      // If the mask has fewer elements than the operands, each mask bit maps
      // to multiple elements of the operands. Bitcast back and forth.
      if (NumMaskElts < NumOperandElts) {
        Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
        Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
        Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
        return new BitCastInst(Sel, II.getType());
      }
    }

    break;
  }
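
  // Illustrative example: a sign-extended <4 x i1> compare used as the blend
  // mask turns
  //   @llvm.x86.sse41.blendvps(%a, %b, bitcast(sext %cmp))
  // into
  //   select <4 x i1> %cmp, <4 x float> %b, <4 x float> %a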

  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512: {
    if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    KnownBits KnownMask(8);
    if (IC.SimplifyDemandedBits(&II, 1, APInt(8, 0b10001111), KnownMask))
      return &II;
    break;
  }
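
  // Only the zeroing bit (bit 7) and the byte-index bits (bits 3:0) of each
  // PSHUFB mask byte affect the result, hence the 0b10001111 demanded mask
  // above.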

  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512: {
    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    KnownBits KnownMask(32);
    if (IC.SimplifyDemandedBits(&II, 1, APInt(32, 0b00011), KnownMask))
      return &II;
    break;
  }

  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512: {
    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    KnownBits KnownMask(64);
    if (IC.SimplifyDemandedBits(&II, 1, APInt(64, 0b00010), KnownMask))
      return &II;
    break;
  }

  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:
    if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    if (simplifyX86VPERMMask(&II, /*IsBinary=*/false, IC))
      return &II;
    break;

  case Intrinsic::x86_avx512_vpermi2var_d_128:
  case Intrinsic::x86_avx512_vpermi2var_d_256:
  case Intrinsic::x86_avx512_vpermi2var_d_512:
  case Intrinsic::x86_avx512_vpermi2var_hi_128:
  case Intrinsic::x86_avx512_vpermi2var_hi_256:
  case Intrinsic::x86_avx512_vpermi2var_hi_512:
  case Intrinsic::x86_avx512_vpermi2var_pd_128:
  case Intrinsic::x86_avx512_vpermi2var_pd_256:
  case Intrinsic::x86_avx512_vpermi2var_pd_512:
  case Intrinsic::x86_avx512_vpermi2var_ps_128:
  case Intrinsic::x86_avx512_vpermi2var_ps_256:
  case Intrinsic::x86_avx512_vpermi2var_ps_512:
  case Intrinsic::x86_avx512_vpermi2var_q_128:
  case Intrinsic::x86_avx512_vpermi2var_q_256:
  case Intrinsic::x86_avx512_vpermi2var_q_512:
  case Intrinsic::x86_avx512_vpermi2var_qi_128:
  case Intrinsic::x86_avx512_vpermi2var_qi_256:
  case Intrinsic::x86_avx512_vpermi2var_qi_512:
    if (Value *V = simplifyX86vpermv3(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    if (simplifyX86VPERMMask(&II, /*IsBinary=*/true, IC))
      return &II;
    break;

  case Intrinsic::x86_avx_maskload_ps:
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
  case Intrinsic::x86_avx_maskload_pd_256:
  case Intrinsic::x86_avx2_maskload_d:
  case Intrinsic::x86_avx2_maskload_q:
  case Intrinsic::x86_avx2_maskload_d_256:
  case Intrinsic::x86_avx2_maskload_q_256:
    if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
      return I;
    }
    break;

  case Intrinsic::x86_sse2_maskmov_dqu:
  case Intrinsic::x86_avx_maskstore_ps:
  case Intrinsic::x86_avx_maskstore_pd:
  case Intrinsic::x86_avx_maskstore_ps_256:
  case Intrinsic::x86_avx_maskstore_pd_256:
  case Intrinsic::x86_avx2_maskstore_d:
  case Intrinsic::x86_avx2_maskstore_q:
  case Intrinsic::x86_avx2_maskstore_d_256:
  case Intrinsic::x86_avx2_maskstore_q_256:
    if (simplifyX86MaskedStore(II, IC)) {
      return nullptr;
    }
    break;

  case Intrinsic::x86_addcarry_32:
  case Intrinsic::x86_addcarry_64:
    if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx512_pternlog_d_128:
  case Intrinsic::x86_avx512_pternlog_d_256:
  case Intrinsic::x86_avx512_pternlog_d_512:
  case Intrinsic::x86_avx512_pternlog_q_128:
  case Intrinsic::x86_avx512_pternlog_q_256:
  case Intrinsic::x86_avx512_pternlog_q_512:
    if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  default:
    break;
  }
  return std::nullopt;
}

std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
    bool &KnownBitsComputed) const {
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx2_pmovmskb: {
    // MOVMSK copies the vector elements' sign bits to the low bits
    // and zeros the high bits.
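    // e.g. for <4 x float> movmsk_ps only the low 4 bits of the i32 result can
    // be non-zero, so a user that only reads the high bits gets a known zero.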
    unsigned ArgWidth;
    if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
      ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
    } else {
      auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
      ArgWidth = ArgType->getNumElements();
    }

    // If we don't need any of the low bits then return zero;
    // we know that DemandedMask is non-zero already.
    APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
    Type *VTy = II.getType();
    if (DemandedElts.isZero()) {
      return ConstantInt::getNullValue(VTy);
    }

    // We know that the upper bits are set to zero.
    Known.Zero.setBitsFrom(ArgWidth);
    KnownBitsComputed = true;
    break;
  }
  }
  return std::nullopt;
}

std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        simplifyAndSetOp) const {
  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_xop_vfrcz_ss:
  case Intrinsic::x86_xop_vfrcz_sd:
    // The instructions for these intrinsics are specified to zero the upper
    // bits, not pass them through like other scalar intrinsics. So we
    // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for other
    // intrinsics. Instead we should return a zero vector.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return ConstantAggregateZero::get(II.getType());
    }

    // Only the lower element is used.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // Only the lower element is undefined. The high elements are zero.
    UndefElts = UndefElts[0];
    break;

  // Unary scalar-as-vector operations that work column-wise.
  case Intrinsic::x86_sse_rcp_ss:
  case Intrinsic::x86_sse_rsqrt_ss:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
    // checks).
    break;

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0. The low element is a function of both
  // operands.
  case Intrinsic::x86_sse_min_ss:
  case Intrinsic::x86_sse_max_ss:
  case Intrinsic::x86_sse_cmp_ss:
  case Intrinsic::x86_sse2_min_sd:
  case Intrinsic::x86_sse2_max_sd:
  case Intrinsic::x86_sse2_cmp_sd: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // Lower element is undefined if both lower elements are undefined.
    // Consider things like undef & 0. The result is known zero, not undef.
    if (!UndefElts2[0])
      UndefElts.clearBit(0);

    break;
  }

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element comes from operand 1.
  case Intrinsic::x86_sse41_round_ss:
  case Intrinsic::x86_sse41_round_sd: {
    // Don't use the low element of operand 0.
    APInt DemandedElts2 = DemandedElts;
    DemandedElts2.clearBit(0);
    simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // Take the high undef elements from operand 0 and take the lower element
    // from operand 1.
    UndefElts.clearBit(0);
    UndefElts |= UndefElts2[0];
    break;
  }

  // Three input scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element is a function of all
  // three inputs.
  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_max_ss_round:
  case Intrinsic::x86_avx512_mask_min_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
  case Intrinsic::x86_avx512_mask_max_sd_round:
  case Intrinsic::x86_avx512_mask_min_sd_round:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only lower element is used for operands 1 and 2.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);

    // Lower element is undefined if all three lower elements are undefined.
    // Consider things like undef & 0. The result is known zero, not undef.
    if (!UndefElts2[0] || !UndefElts3[0])
      UndefElts.clearBit(0);
    break;

  // TODO: Add fmaddsub support?
  case Intrinsic::x86_sse3_addsub_pd:
  case Intrinsic::x86_sse3_addsub_ps:
  case Intrinsic::x86_avx_addsub_pd_256:
  case Intrinsic::x86_avx_addsub_ps_256: {
    // If none of the even or none of the odd lanes are required, turn this
    // into a generic FP math instruction.
    APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
    APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
    bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
    bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
    if (IsSubOnly || IsAddOnly) {
      assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
      IRBuilderBase::InsertPointGuard Guard(IC.Builder);
      IC.Builder.SetInsertPoint(&II);
      Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
      return IC.Builder.CreateBinOp(
          IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
    }

    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

  // General per-element vector operations.
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

  case Intrinsic::x86_sse2_pmulh_w:
  case Intrinsic::x86_avx2_pmulh_w:
  case Intrinsic::x86_avx512_pmulh_w_512:
  case Intrinsic::x86_sse2_pmulhu_w:
  case Intrinsic::x86_avx2_pmulhu_w:
  case Intrinsic::x86_avx512_pmulhu_w_512:
  case Intrinsic::x86_ssse3_pmul_hr_sw_128:
  case Intrinsic::x86_avx2_pmul_hr_sw:
  case Intrinsic::x86_avx512_pmul_hr_sw_512: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    // NOTE: mulh(undef,undef) != undef.
    break;
  }

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512: {
    auto *Ty0 = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
    assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");

    unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
    unsigned VWidthPerLane = VWidth / NumLanes;
    unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;

    // Per lane, pack the elements of the first input and then the second.
    // e.g.
    // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
    // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
    for (int OpNum = 0; OpNum != 2; ++OpNum) {
      APInt OpDemandedElts(InnerVWidth, 0);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        unsigned LaneIdx = Lane * VWidthPerLane;
        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
          unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
          if (DemandedElts[Idx])
            OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
        }
      }

      // Demand elements from the operand.
      APInt OpUndefElts(InnerVWidth, 0);
      simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);

      // Pack the operand's UNDEF elements, one lane at a time.
      OpUndefElts = OpUndefElts.zext(VWidth);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
        LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
        LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
        UndefElts |= LaneElts;
      }
    }
    break;
  }

  case Intrinsic::x86_sse2_pmadd_wd:
  case Intrinsic::x86_avx2_pmadd_wd:
  case Intrinsic::x86_avx512_pmaddw_d_512:
  case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
  case Intrinsic::x86_avx2_pmadd_ub_sw:
  case Intrinsic::x86_avx512_pmaddubs_w_512: {
    // PMADD - demand both src elements that map to each dst element.
    auto *ArgTy = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements();
    assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
    APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth);
    APInt Op0UndefElts(InnerVWidth, 0);
    APInt Op1UndefElts(InnerVWidth, 0);
    simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts);
    simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts);
    // NOTE: madd(undef,undef) != undef.
    break;
  }

  // PSHUFB
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
  // PERMILVAR
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
  // PERMV
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps: {
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  }

  // SSE4A instructions leave the upper 64-bits of the 128-bit result
  // in an undefined state.
  case Intrinsic::x86_sse4a_extrq:
  case Intrinsic::x86_sse4a_extrqi:
  case Intrinsic::x86_sse4a_insertq:
  case Intrinsic::x86_sse4a_insertqi:
    UndefElts.setHighBits(VWidth / 2);
    break;
  }
  return std::nullopt;
}