//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"
/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(Constant *V) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
                            V);
  return V;
}
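// Illustrative example (not from the original source): for a constant mask
// such as <4 x i32> <i32 -1, i32 7, i32 -8, i32 0>, only elements 0 and 2
// have their sign bit set, so the helper above yields
// <4 x i1> <i1 true, i1 false, i1 true, i1 false>.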
/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
  if (PatternMatch::match(
          Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}
// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    // First, cast the x86 intrinsic scalar pointer to a vector pointer to
    // match the LLVM intrinsic definition for the pointer argument.
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    // The pass-through vector for an x86 masked load is a zero vector.
    CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
        II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
    return IC.replaceInstUsesWith(II, NewMaskedLoad);
  }

  return nullptr;
}
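// Sketch of the rewrite above (illustrative IR, not from the original file):
//   %v = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %p, <4 x i32> %m)
// becomes, once %m is known to be a sign-extended bool vector %b,
//   %v = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 1,
//                                                    <4 x i1> %b,
//                                                    <4 x float> zeroinitializer)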
// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Zero Mask - this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

    // 'Replace uses' doesn't work for stores. Erase the original masked store.
    IC.eraseInstFromFunction(II);
    return true;
  }

  return false;
}
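// Sketch of the store rewrite above (illustrative IR, not from the original
// file): with a sign-extended bool-vector mask %b,
//   call void @llvm.x86.avx.maskstore.ps(ptr %p, <4 x i32> %m, <4 x float> %v)
// is replaced by
//   call void @llvm.masked.store.v4f32.p0(<4 x float> %v, ptr %p, i32 1,
//                                         <4 x i1> %b)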
static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(Vec->getType());
  Type *SVT = VT->getElementType();
  Type *AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift. If it's guaranteed to be out of range, logical shifts
  // combine to zero and arithmetic shifts are clamped to (BitWidth - 1).
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits =
        llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits = llvm::computeKnownBits(
        Amt, DemandedLower, II.getModule()->getDataLayout());
    KnownBits KnownUpperBits = llvm::computeKnownBits(
        Amt, DemandedUpper, II.getModule()->getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
  auto *CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
  // operand to compute the shift amount.
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isZero())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);
  return Builder.CreateAShr(Vec, ShiftVec);
}
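// Worked example of the folds above (illustrative, not from the original
// source): for a 32-bit element type, psrli.d by 7 becomes an `lshr` by a
// splat of 7, psrli.d by 33 folds to zeroinitializer, and psrai.d by 33 is
// clamped to an `ashr` by 31 (splatting the sign bit).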
// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(II.getType());
  Type *SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
  KnownBits KnownAmt =
      llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
  if (KnownAmt.getMaxValue().ult(BitWidth)) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);
  return Builder.CreateAShr(Vec, ShiftVec);
}
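// Worked example (illustrative, not from the original source): psrlv.d with a
// constant amount vector <i32 3, i32 8, i32 undef, i32 1> folds to a plain
//   lshr <4 x i32> %v, <i32 3, i32 8, i32 undef, i32 1>
// whereas <i32 3, i32 40, i32 undef, i32 1> is left alone, because lane 40 is
// out of range for i32 and only some lanes would need the zeroing behaviour.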
static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getZero(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}
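// Worked example of the clamping above (illustrative, not from the original
// source): packing i16 constants with PACKSSWB saturates 300 -> 127 and
// -200 -> -128 before the truncate to i8, while PACKUSWB clamps to [0, 255],
// so -200 -> 0 and 300 -> 255.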
static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
  // We can't easily peek through x86_mmx types.
  if (!ArgTy)
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  // %int = bitcast <16 x i1> %cmp to i16
  // %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
  Res = Builder.CreateIsNeg(Res);
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}
static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, PatternMatch::m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = PoisonValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}
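// Sketch of the rewrite above (illustrative IR, not from the original file):
//   %r = call { i8, i64 } @llvm.x86.addcarry.64(i8 0, i64 %a, i64 %b)
// becomes
//   %u   = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
//   %sum = extractvalue { i64, i1 } %u, 0
//   %o   = extractvalue { i64, i1 } %u, 1
//   %ov  = zext i1 %o to i8
// and the pair { i8 %ov, i64 %sum } is rebuilt with insertvalue.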
static Value *simplifyTernarylogic(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {
  auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
  if (!ArgImm || ArgImm->getValue().uge(256))
    return nullptr;

  Value *ArgA = II.getArgOperand(0);
  Value *ArgB = II.getArgOperand(1);
  Value *ArgC = II.getArgOperand(2);

  Type *Ty = II.getType();

  auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
  };
  auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
  };
  auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
  };
  auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateNot(V.first), ~V.second};
  };
  auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
  auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
  auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };

  bool AIsConst = match(ArgA, PatternMatch::m_ImmConstant());
  bool BIsConst = match(ArgB, PatternMatch::m_ImmConstant());
  bool CIsConst = match(ArgC, PatternMatch::m_ImmConstant());

  bool ABIsConst = AIsConst && BIsConst;
  bool ACIsConst = AIsConst && CIsConst;
  bool BCIsConst = BIsConst && CIsConst;
  bool ABCIsConst = AIsConst && BIsConst && CIsConst;

  // Use for verification. It's a big table. It's difficult to go from Imm ->
  // logic ops, but easy to verify that a set of logic ops is correct. We track
  // the logic ops through the second value in the pair. At the end it should
  // equal Imm.
  std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
  std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
  std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
  std::pair<Value *, uint8_t> Res = {nullptr, 0};
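  // Illustrative note (not from the original source): 0xf0, 0xcc and 0xaa are
  // the three columns of the 8-row truth table over (A, B, C), so evaluating a
  // candidate expression on these bytes reproduces its immediate. For example
  // the majority function (A & B) | (A & C) | (B & C) evaluates to
  // (0xf0 & 0xcc) | (0xf0 & 0xaa) | (0xcc & 0xaa) == 0xc0 | 0xa0 | 0x88 ==
  // 0xe8, which is exactly the vpternlog immediate for "majority".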
  // Currently we only handle cases that convert directly to another
  // instruction or cases where all the ops are constant. This is because we
  // don't properly handle creating ternary ops in the backend, so splitting
  // them here may cause regressions. As the backend improves, uncomment more
  // cases.

  uint8_t Imm = ArgImm->getValue().getZExtValue();

  Res = {Constant::getNullValue(Ty), 0};
  Res = Nor(Or(A, B), C);
  Res = And(Nor(A, B), C);
  Res = And(Nor(A, C), B);
  Res = Nor(A, Xnor(B, C));
  Res = Nor(A, And(B, C));
  Res = Nor(A, Nand(B, C));
  Res = Nor(A, Xor(B, C));
  Res = Nor(A, Not(C));
  Res = Nor(A, Nor(C, Not(B)));
  Res = Nor(A, Not(B));
  Res = Nor(A, Nor(B, Not(C)));
  Res = Nor(A, Nor(B, C));
  Res = And(A, Nor(B, C));
  Res = Nor(Xnor(A, C), B);
  Res = Nor(And(A, C), B);
  Res = Nor(Xnor(A, B), C);
  Res = Nor(And(A, B), C);
  Res = Xor(Xor(A, B), And(Nand(A, B), C));
  Res = Xor(Or(A, B), Or(Xnor(A, B), C));
  Res = Nor(Xnor(A, B), Xnor(A, C));
  Res = And(Nand(A, B), Xnor(B, C));
  Res = Xor(A, Or(And(A, B), C));
  Res = Xor(A, Or(Xnor(A, B), C));
  Res = Xor(A, Or(And(A, C), B));
  Res = Xor(A, Or(Xnor(A, C), B));
  Res = Xor(A, Or(B, C));
  Res = Nand(A, Or(B, C));
  Res = Nor(Nand(A, C), B);
  Res = Nor(Xor(A, C), B);
  Res = Nor(B, Not(C));
  Res = Nor(B, Nor(C, Not(A)));
  Res = Nor(Xnor(A, B), Xor(A, C));
  Res = Xor(A, Nand(Nand(A, B), C));
  Res = And(Nand(A, B), Xor(B, C));
  Res = Xor(Or(Xnor(A, B), C), B);
  Res = And(Xor(A, B), C);
  Res = Xor(Xor(A, B), Nor(And(A, B), C));
  Res = And(Nand(A, B), C);
  Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
  Res = Nor(Xnor(A, B), Nor(B, C));
  Res = Xor(A, Or(B, Not(C)));
  Res = Xor(A, Or(Xor(A, C), B));
  Res = Nand(A, Or(B, Not(C)));
  Res = Nor(B, Not(A));
  Res = Nor(Nor(A, Not(C)), B);
  Res = Nor(Nor(A, C), B);
  Res = And(Xor(A, B), Nand(B, C));
  Res = Xor(B, Or(A, Xnor(B, C)));
  Res = Xor(Or(A, C), B);
  Res = Nand(Or(A, C), B);
  Res = Nor(Xnor(A, B), Nor(A, C));
  Res = Xor(Or(A, Not(C)), B);
  Res = Xor(B, Or(A, Xor(B, C)));
  Res = Nand(Or(A, Not(C)), B);
  Res = Xor(A, Or(Nor(A, C), B));
  Res = Xor(A, Or(Nor(A, Not(C)), B));
  Res = Nor(Nand(A, B), C);
  Res = Nor(Xor(A, B), C);
  Res = Nor(Xor(A, B), Xnor(A, C));
  Res = Xor(A, Nand(Nand(A, C), B));
  Res = Nor(C, Not(B));
  Res = Nor(Nor(B, Not(A)), C);
  Res = Xor(Or(And(A, C), B), C);
  Res = Xor(Or(Xnor(A, C), B), C);
  Res = And(Xor(A, C), B);
  Res = Xor(Or(Xnor(A, B), And(A, C)), C);
  Res = Nor(Xnor(A, C), Nor(B, C));
  Res = Xor(A, Or(C, Not(B)));
  Res = And(Nand(A, C), B);
  Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
  Res = Xor(A, Or(Xor(A, B), C));
  Res = Nand(A, Nand(B, Not(C)));
  Res = Nor(C, Not(A));
  Res = Nor(Nor(A, Not(B)), C);
  Res = And(Xor(A, C), Nand(B, C));
  Res = Xor(Or(Xnor(B, C), A), C);
  Res = Nor(Nor(A, B), C);
  Res = Xor(Or(A, B), C);
  Res = Nand(Or(A, B), C);
  Res = Nor(Nor(A, B), Xnor(A, C));
  Res = Xor(Or(A, Not(B)), C);
  Res = Xor(A, Or(Nor(A, B), C));
  Res = Xor(Or(Xor(B, C), A), C);
  Res = Nand(Or(A, Not(B)), C);
  Res = Xor(A, Or(Nor(A, Not(B)), C));
  Res = And(A, Xor(B, C));
  Res = Xor(Or(Xnor(A, B), And(B, C)), C);
  Res = Nor(Nor(A, C), Xnor(B, C));
  Res = Xor(B, Or(C, Not(A)));
  Res = Nor(Nor(A, B), Xnor(B, C));
  Res = Xor(Or(B, Not(A)), C);
  Res = Or(Nor(A, B), Xor(B, C));
  Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
  Res = Xor(Xnor(A, B), C);
  Res = Xor(And(A, B), C);
  Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
  Res = Xor(And(A, C), B);
  Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
  Res = Or(Nor(A, Not(B)), Xor(B, C));
  Res = Nand(A, Xnor(B, C));
  Res = And(A, Nand(B, C));
  Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
  Res = Xor(Or(Xor(A, B), C), B);
  Res = Nand(Nand(A, Not(C)), B);
  Res = Xor(Or(Xor(A, C), B), C);
  Res = Nand(Nand(A, Not(B)), C);
  Res = Xor(B, Or(Nor(B, Not(A)), C));
  Res = Xor(A, And(B, C));
  Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
  Res = Or(Xor(A, C), Nor(B, Not(A)));
  Res = Nand(Xnor(A, C), B);
  Res = Or(Xor(A, B), Nor(C, Not(A)));
  Res = Nand(Xnor(A, B), C);
  Res = Or(Xor(A, B), Xor(A, C));
  Res = Nand(And(A, B), C);
  Res = And(And(A, B), C);
  Res = Nor(Xor(A, B), Xor(A, C));
  Res = And(Xnor(A, B), C);
  Res = Nor(Xor(A, B), Nor(C, Not(A)));
  Res = And(Xnor(A, C), B);
  Res = Nor(Xor(A, C), Nor(B, Not(A)));
  Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
  Res = Xor(A, Nand(B, C));
  Res = Xor(B, Nor(Nor(B, Not(A)), C));
  Res = And(Nand(A, Not(B)), C);
  Res = Xor(Nor(Xor(A, C), B), C);
  Res = And(Nand(A, Not(C)), B);
  Res = Xor(Nor(Xor(A, B), C), B);
  Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
  Res = Nand(A, Nand(B, C));
  Res = And(A, Xnor(B, C));
  Res = Nor(Nor(A, Not(B)), Xor(B, C));
  Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
  Res = Xor(Nand(A, C), B);
  Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
  Res = Xor(Nand(A, B), C);
  Res = Xor(Xor(A, B), C);
  Res = Xor(Xor(A, B), Or(Nor(A, B), C));
  Res = Nor(Nor(A, B), Xor(B, C));
  Res = Xor(Nor(B, Not(A)), C);
  Res = Or(Nor(A, B), Xnor(B, C));
  Res = Xor(B, Nor(C, Not(A)));
  Res = Or(Nor(A, C), Xnor(B, C));
  Res = Xor(And(Xor(A, B), Nand(B, C)), C);
  Res = Nand(A, Xor(B, C));
  Res = Xor(A, Nor(Nor(A, Not(B)), C));
  Res = And(Or(A, Not(B)), C);
  Res = Xor(Nor(Xor(B, C), A), C);
  Res = Xor(A, Nor(Nor(A, B), C));
  Res = Xor(Nor(A, Not(B)), C);
  Res = Or(Nor(A, B), Xnor(A, C));
  Res = And(Or(A, B), C);
  Res = Xor(Nor(A, B), C);
  Res = Or(Nor(A, B), C);
  Res = Xor(Nor(Xnor(B, C), A), C);
  Res = Or(Xnor(A, C), And(B, C));
  Res = Or(Nor(A, Not(B)), C);
  Res = Or(C, Not(A));
  Res = And(A, Nand(B, Not(C)));
  Res = Xor(A, Nor(Xor(A, B), C));
  Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
  Res = Nand(Nand(A, C), B);
  Res = Xor(A, Nor(C, Not(B)));
  Res = Or(Xnor(A, C), Nor(B, C));
  Res = Xor(And(Xor(A, B), Nand(A, C)), C);
  Res = Nand(Xor(A, C), B);
  Res = Xor(Nor(Xnor(A, C), B), C);
  Res = Xor(Nor(And(A, C), B), C);
  Res = Or(Nor(B, Not(A)), C);
  Res = Or(C, Not(B));
  Res = Xor(A, And(Nand(A, C), B));
  Res = Or(Xor(A, B), Xnor(A, C));
  Res = Or(Xor(A, B), C);
  Res = Or(Nand(A, B), C);
  Res = Xor(A, Nor(Nor(A, Not(C)), B));
  Res = Xor(A, Nor(Nor(A, C), B));
  Res = And(Or(A, Not(C)), B);
  Res = Xor(B, Nor(A, Xor(B, C)));
  Res = Xor(Nor(A, Not(C)), B);
  Res = Or(Xnor(A, B), Nor(A, C));
  Res = And(Or(A, C), B);
  Res = Xor(Nor(A, C), B);
  Res = Xor(B, Nor(A, Xnor(B, C)));
  Res = Or(Xnor(A, B), And(B, C));
  Res = Or(Nor(A, C), B);
  Res = Or(Nor(A, Not(C)), B);
  Res = Or(B, Not(A));
  Res = And(A, Or(B, Not(C)));
  Res = Xor(A, Nor(Xor(A, C), B));
  Res = Xor(A, Nor(B, Not(C)));
  Res = Or(Xnor(A, B), Nor(B, C));
  Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
  Res = Nand(Nand(A, B), C);
  Res = Xor(Xor(A, B), Or(And(A, B), C));
  Res = Nand(Xor(A, B), C);
  Res = Xor(Nor(Xnor(A, B), C), B);
  Res = Or(And(A, B), Xnor(B, C));
  Res = Xor(A, And(Nand(A, B), C));
  Res = Or(Xnor(A, B), Xor(A, C));
  Res = Or(B, Nor(C, Not(A)));
  Res = Or(B, Not(C));
  Res = Or(Xor(A, C), B);
  Res = Or(Nand(A, C), B);
  Res = And(A, Or(B, C));
  Res = Xor(A, Nor(B, C));
  Res = Xor(A, Nor(Xnor(A, C), B));
  Res = Xor(A, Nor(And(A, C), B));
  Res = Xor(A, Nor(Xnor(A, B), C));
  Res = Xor(A, Nor(And(A, B), C));
  Res = Or(And(A, B), Xor(B, C));
  Res = Or(Xnor(A, B), Xnor(A, C));
  Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
  Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
  Res = Or(And(A, B), C);
  Res = Or(Xnor(A, B), C);
  Res = Or(And(A, C), B);
  Res = Or(Xnor(A, C), B);
  Res = Nand(A, Nor(B, C));
  Res = Or(A, Nor(B, C));
  Res = Or(A, Nor(B, Not(C)));
  Res = Or(A, Not(B));
  Res = Or(A, Nor(C, Not(B)));
  Res = Or(A, Not(C));
  Res = Or(A, Xor(B, C));
  Res = Or(A, Nand(B, C));
  Res = Or(A, And(B, C));
  Res = Or(A, Xnor(B, C));
  Res = Nand(Nor(A, C), B);
  Res = Nand(Nor(A, B), C);
  Res = Or(Or(A, B), C);
  Res = {Constant::getAllOnesValue(Ty), 0xff};

  assert((Res.first == nullptr || Res.second == Imm) &&
         "Simplification of ternary logic does not verify!");
  return Res.first;
}
static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //    [3:0] - zero mask for each 32-bit lane
  //    [5:4] - select one 32-bit destination lane
  //    [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;
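  // Worked example of the decode above (illustrative, not from the original
  // source): Imm == 0x4A == 0b01'00'1010 gives SourceLane = 1, DestLane = 0
  // and ZMask = 0b1010, i.e. insert source lane 1 into destination lane 0 and
  // zero destination lanes 1 and 3.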
  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  int ShuffleMask[4] = {0, 1, 2, 3};

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}
/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are inserting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      auto *ShufTy = FixedVectorType::get(IntTy8, 16);

      SmallVector<int, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ShuffleMask);
      return Builder.CreateBitCast(SV, II.getType());
    }
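    // Worked example of the shuffle conversion above (illustrative, not from
    // the original source): EXTRQI with Length = 16 and Index = 8 extracts
    // bytes 1-2 of the source, so the v16i8 shuffle mask is
    // <1, 2, 18, 19, 20, 21, 22, 23, undef x 8>, where indices >= 16 select
    // bytes from the zero vector.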
    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}
/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    auto *ShufTy = FixedVectorType::get(IntTy8, 16);

    SmallVector<int, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(i);
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(i + 16);
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(i);
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(-1);

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ShuffleMask);
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *C1 = dyn_cast<Constant>(Op1);
  auto *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  auto *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}
/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shuffle.

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
}
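// Worked example (illustrative, not from the original source): for pshufb on
// <16 x i8> with a constant control byte 0x05 in lane 2, result byte 2 takes
// source byte 5; a control byte with bit 7 set (e.g. 0x83) maps to an index
// >= 16, i.e. a byte from the all-zero second shuffle operand.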
/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = Index.getZExtValue();
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
}
/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
}
std::optional<Instruction *>
X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
                                             unsigned DemandedWidth) {
    APInt UndefElts(Width, 0);
    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
    return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
  };

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::x86_bmi_bextr_32:
  case Intrinsic::x86_bmi_bextr_64:
  case Intrinsic::x86_tbm_bextri_u32:
  case Intrinsic::x86_tbm_bextri_u64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Shift = C->getZExtValue();
      uint64_t Length = (Shift >> 8) & 0xff;
      Shift &= 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      // If the length is 0 or the shift is out of range, replace with zero.
      if (Length == 0 || Shift >= BitWidth) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue() >> Shift;
        if (Length > BitWidth)
          Length = BitWidth;
        Result &= maskTrailingOnes<uint64_t>(Length);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
      // are only masking bits that a shift already cleared?
    }
    break;
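  // Worked example of the constant fold above (illustrative, not from the
  // original source): bextr(0x12345678, ctrl) with start = 4 and length = 8
  // (ctrl = 0x0804) yields (0x12345678 >> 4) & 0xff == 0x67.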
  case Intrinsic::x86_bmi_bzhi_32:
  case Intrinsic::x86_bmi_bzhi_64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Index = C->getZExtValue() & 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      if (Index >= BitWidth) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (Index == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue();
        Result &= maskTrailingOnes<uint64_t>(Index);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we convert this to an AND if the RHS is constant?
    }
    break;
  case Intrinsic::x86_bmi_pext_32:
  case Intrinsic::x86_bmi_pext_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
        return IC.replaceInstUsesWith(II, Shifted);
      }
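      // Worked example of the shifted-mask fold above (illustrative, not from
      // the original source): pext(x, 0x00ff0000) is a single contiguous mask
      // at bit 16, so it simplifies to (x & 0x00ff0000) >> 16.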
      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToSet = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToTest = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;
          BitToSet <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_bmi_pdep_32:
  case Intrinsic::x86_bmi_pdep_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
        return IC.replaceInstUsesWith(II, Masked);
      }
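      // Likewise for pdep (illustrative, not from the original source):
      // pdep(x, 0x00ff0000) simplifies to (x << 16) & 0x00ff0000.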
      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToTest = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToSet = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;
          BitToTest <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_sse_cvtss2si:
  case Intrinsic::x86_sse_cvtss2si64:
  case Intrinsic::x86_sse_cvttss2si:
  case Intrinsic::x86_sse_cvttss2si64:
  case Intrinsic::x86_sse2_cvtsd2si:
  case Intrinsic::x86_sse2_cvtsd2si64:
  case Intrinsic::x86_sse2_cvttsd2si:
  case Intrinsic::x86_sse2_cvttsd2si64:
  case Intrinsic::x86_avx512_vcvtss2si32:
  case Intrinsic::x86_avx512_vcvtss2si64:
  case Intrinsic::x86_avx512_vcvtss2usi32:
  case Intrinsic::x86_avx512_vcvtss2usi64:
  case Intrinsic::x86_avx512_vcvtsd2si32:
  case Intrinsic::x86_avx512_vcvtsd2si64:
  case Intrinsic::x86_avx512_vcvtsd2usi32:
  case Intrinsic::x86_avx512_vcvtsd2usi64:
  case Intrinsic::x86_avx512_cvttss2si:
  case Intrinsic::x86_avx512_cvttss2si64:
  case Intrinsic::x86_avx512_cvttss2usi:
  case Intrinsic::x86_avx512_cvttss2usi64:
  case Intrinsic::x86_avx512_cvttsd2si:
  case Intrinsic::x86_avx512_cvttsd2si64:
  case Intrinsic::x86_avx512_cvttsd2usi:
  case Intrinsic::x86_avx512_cvttsd2usi64: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    Value *Arg = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }
  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx2_pmovmskb:
    if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomineq_sd:
  case Intrinsic::x86_avx512_vcomi_ss:
  case Intrinsic::x86_avx512_vcomi_sd:
  case Intrinsic::x86_avx512_mask_cmp_ss:
  case Intrinsic::x86_avx512_mask_cmp_sd: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    bool MadeChange = false;
    Value *Arg0 = II.getArgOperand(0);
    Value *Arg1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }
  case Intrinsic::x86_avx512_add_ps_512:
  case Intrinsic::x86_avx512_div_ps_512:
  case Intrinsic::x86_avx512_mul_ps_512:
  case Intrinsic::x86_avx512_sub_ps_512:
  case Intrinsic::x86_avx512_add_pd_512:
  case Intrinsic::x86_avx512_div_pd_512:
  case Intrinsic::x86_avx512_mul_pd_512:
  case Intrinsic::x86_avx512_sub_pd_512:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // FP math ops.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      if (R->getValue() == 4) {
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_add_ps_512:
        case Intrinsic::x86_avx512_add_pd_512:
          V = IC.Builder.CreateFAdd(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_sub_ps_512:
        case Intrinsic::x86_avx512_sub_pd_512:
          V = IC.Builder.CreateFSub(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_mul_ps_512:
        case Intrinsic::x86_avx512_mul_pd_512:
          V = IC.Builder.CreateFMul(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_div_ps_512:
        case Intrinsic::x86_avx512_div_pd_512:
          V = IC.Builder.CreateFDiv(Arg0, Arg1);
          break;
        }
        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;
  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // FP math ops.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
      if (R->getValue() == 4) {
        // Extract the element as scalars.
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);
        Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
        Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_mask_add_ss_round:
        case Intrinsic::x86_avx512_mask_add_sd_round:
          V = IC.Builder.CreateFAdd(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_sub_ss_round:
        case Intrinsic::x86_avx512_mask_sub_sd_round:
          V = IC.Builder.CreateFSub(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_mul_ss_round:
        case Intrinsic::x86_avx512_mask_mul_sd_round:
          V = IC.Builder.CreateFMul(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_div_ss_round:
        case Intrinsic::x86_avx512_mask_div_sd_round:
          V = IC.Builder.CreateFDiv(LHS, RHS);
          break;
        }

        // Handle the masking aspect of the intrinsic.
        Value *Mask = II.getArgOperand(3);
        auto *C = dyn_cast<ConstantInt>(Mask);
        // We don't need a select if we know the mask bit is a 1.
        if (!C || !C->getValue()[0]) {
          // Cast the mask to an i1 vector and then extract the lowest element.
          auto *MaskTy = FixedVectorType::get(
              IC.Builder.getInt1Ty(),
              cast<IntegerType>(Mask->getType())->getBitWidth());
          Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
          Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
          // Extract the lowest element from the passthru operand.
          Value *Passthru =
              IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
          V = IC.Builder.CreateSelect(Mask, V, Passthru);
        }

        // Insert the result back into the original argument 0.
        V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  // Constant fold ashr( <A x Bi>, Ci ).
  // Constant fold lshr( <A x Bi>, Ci ).
  // Constant fold shl( <A x Bi>, Ci ).
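  // e.g. (sketch):
  //   @llvm.x86.sse2.psrai.d(%v, i32 3) --> ashr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>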
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512: {
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
    // operand to compute the shift amount.
    Value *Arg1 = II.getArgOperand(1);
    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
           "Unexpected packed shift size");
    unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();

    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
      return IC.replaceOperand(II, 1, V);
    }
    break;
  }
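
  // Per-element (variable) shifts fold to plain IR shifts when the shift
  // amounts are known to be in range, e.g. (sketch):
  //   @llvm.x86.avx2.psllv.d(%v, <i32 1, i32 2, i32 3, i32 4>)
  //     --> shl <4 x i32> %v, <i32 1, i32 2, i32 3, i32 4>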
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    if (Value *V = simplifyX86varShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_pclmulqdq:
  case Intrinsic::x86_pclmulqdq_256:
  case Intrinsic::x86_pclmulqdq_512: {
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      unsigned Imm = C->getZExtValue();
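
      // PCLMULQDQ multiplies one 64-bit half of each source: Imm bit 0 picks
      // the high (1) or low (0) half of operand 0 and Imm bit 4 picks the half
      // of operand 1 (per 128-bit lane), so only that element of each pair is
      // demanded below.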
      bool MadeChange = false;
      Value *Arg0 = II.getArgOperand(0);
      Value *Arg1 = II.getArgOperand(1);
      unsigned VWidth =
          cast<FixedVectorType>(Arg0->getType())->getNumElements();

      APInt UndefElts1(VWidth, 0);
      APInt DemandedElts1 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
        IC.replaceOperand(II, 0, V);
        MadeChange = true;
      }

      APInt UndefElts2(VWidth, 0);
      APInt DemandedElts2 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
        IC.replaceOperand(II, 1, V);
        MadeChange = true;
      }

      // If either input elements are undef, the result is zero.
      if (DemandedElts1.isSubsetOf(UndefElts1) ||
          DemandedElts2.isSubsetOf(UndefElts2)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantAggregateZero::get(II.getType()));
      }

      if (MadeChange) {
        return &II;
      }
    }
    break;
  }

  case Intrinsic::x86_sse41_insertps:
    if (Value *V = simplifyX86insertps(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse4a_extrq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 16 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CILength =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
           : nullptr;
    auto *CIIndex =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
    // operand and the lowest 16-bits of the second.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse4a_extrqi: {
    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
    // bits of the lower 64-bits. The upper 64-bits are undefined.
    Value *Op0 = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));

    // Attempt to simplify to a constant or shuffle vector.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CI11 =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
    if (CI11) {
      const APInt &V11 = CI11->getValue();
      APInt Len = V11.zextOrTrunc(6);
      APInt Idx = V11.lshr(8).zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertqi: {
    // INSERTQI: Extract lowest Length bits from lower half of second source and
    // insert over first source starting at Index bit. The upper 64-bits are
    // undefined.
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 2 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));

    // Attempt to simplify to a constant or shuffle vector.
    if (CILength && CIIndex) {
      APInt Len = CILength->getValue().zextOrTrunc(6);
      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
    // operands.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }
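
  // BLENDV picks between the two sources per element based on the sign bit of
  // the mask, so with a sign-extended bool mask it is just a select,
  // e.g. (sketch):
  //   blendvps(%a, %b, sext(<4 x i1> %c)) --> select <4 x i1> %c, %b, %a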
  case Intrinsic::x86_sse41_pblendvb:
  case Intrinsic::x86_sse41_blendvps:
  case Intrinsic::x86_sse41_blendvpd:
  case Intrinsic::x86_avx_blendv_ps_256:
  case Intrinsic::x86_avx_blendv_pd_256:
  case Intrinsic::x86_avx2_pblendvb: {
    // fold (blend A, A, Mask) -> A
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Mask = II.getArgOperand(2);
    if (Op0 == Op1) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Zero Mask - select 1st argument.
    if (isa<ConstantAggregateZero>(Mask)) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
      Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
    }

    // Convert to a vector select if we can bypass casts and find a boolean
    // vector condition value.
    Value *BoolVec;
    Mask = InstCombiner::peekThroughBitcast(Mask);
    if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
        BoolVec->getType()->isVectorTy() &&
        BoolVec->getType()->getScalarSizeInBits() == 1) {
      assert(Mask->getType()->getPrimitiveSizeInBits() ==
                 II.getType()->getPrimitiveSizeInBits() &&
             "Not expecting mask and operands with different sizes");

      unsigned NumMaskElts =
          cast<FixedVectorType>(Mask->getType())->getNumElements();
      unsigned NumOperandElts =
          cast<FixedVectorType>(II.getType())->getNumElements();
      if (NumMaskElts == NumOperandElts) {
        return SelectInst::Create(BoolVec, Op1, Op0);
      }

      // If the mask has fewer elements than the operands, each mask bit maps to
      // multiple elements of the operands. Bitcast back and forth.
      if (NumMaskElts < NumOperandElts) {
        Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
        Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
        Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
        return new BitCastInst(Sel, II.getType());
      }
    }
    break;
  }
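
  // A constant PSHUFB mask can be lowered to a target-independent
  // shufflevector; mask bytes with the top bit set select zero.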
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
    if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:
    if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_maskload_ps:
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
  case Intrinsic::x86_avx_maskload_pd_256:
  case Intrinsic::x86_avx2_maskload_d:
  case Intrinsic::x86_avx2_maskload_q:
  case Intrinsic::x86_avx2_maskload_d_256:
  case Intrinsic::x86_avx2_maskload_q_256:
    if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
      return I;
    }
    break;

  case Intrinsic::x86_sse2_maskmov_dqu:
  case Intrinsic::x86_avx_maskstore_ps:
  case Intrinsic::x86_avx_maskstore_pd:
  case Intrinsic::x86_avx_maskstore_ps_256:
  case Intrinsic::x86_avx_maskstore_pd_256:
  case Intrinsic::x86_avx2_maskstore_d:
  case Intrinsic::x86_avx2_maskstore_q:
  case Intrinsic::x86_avx2_maskstore_d_256:
  case Intrinsic::x86_avx2_maskstore_q_256:
    if (simplifyX86MaskedStore(II, IC)) {
      return nullptr;
    }
    break;
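
  // ADDCARRY with a known-zero carry-in is just an unsigned add plus its
  // overflow bit, e.g. (sketch):
  //   @llvm.x86.addcarry.32(i8 0, %x, %y) --> @llvm.uadd.with.overflow.i32(%x, %y)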
  case Intrinsic::x86_addcarry_32:
  case Intrinsic::x86_addcarry_64:
    if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
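
  // The 8-bit truth-table immediate can often be rewritten as plain logic ops,
  // e.g. (sketch): pternlog(%a, %b, %c, i32 0x80) --> and %a, (and %b, %c)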
  case Intrinsic::x86_avx512_pternlog_d_128:
  case Intrinsic::x86_avx512_pternlog_d_256:
  case Intrinsic::x86_avx512_pternlog_d_512:
  case Intrinsic::x86_avx512_pternlog_q_128:
  case Intrinsic::x86_avx512_pternlog_q_256:
  case Intrinsic::x86_avx512_pternlog_q_512:
    if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }
  return std::nullopt;
}

std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
    bool &KnownBitsComputed) const {
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx2_pmovmskb: {
    // MOVMSK copies the vector elements' sign bits to the low bits
    // and zeros the high bits.
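    // e.g. (sketch) for a <4 x float> movmsk only bits 0..3 of the result can
    // be non-zero, so a user that demands only higher bits folds to zero.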
    unsigned ArgWidth;
    if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
      ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
    } else {
      auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
      ArgWidth = ArgType->getNumElements();
    }

    // If we don't need any of the low bits then return zero;
    // we know that DemandedMask is non-zero already.
    APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
    Type *VTy = II.getType();
    if (DemandedElts.isZero()) {
      return ConstantInt::getNullValue(VTy);
    }

    // We know that the upper bits are set to zero.
    Known.Zero.setBitsFrom(ArgWidth);
    KnownBitsComputed = true;
    break;
  }
  }
  return std::nullopt;
}

std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        simplifyAndSetOp) const {
  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_xop_vfrcz_ss:
  case Intrinsic::x86_xop_vfrcz_sd:
    // The instructions for these intrinsics are specified to zero the upper
    // bits rather than pass them through like other scalar intrinsics. So we
    // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for other
    // intrinsics. Instead we should return a zero vector.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return ConstantAggregateZero::get(II.getType());
    }

    // Only the lower element is used.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // Only the lower element is undefined. The high elements are zero.
    UndefElts = UndefElts[0];
    break;

  // Unary scalar-as-vector operations that work column-wise.
  case Intrinsic::x86_sse_rcp_ss:
  case Intrinsic::x86_sse_rsqrt_ss:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
    // checks).
    break;

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0. The low element is a function of both
  // operands.
  case Intrinsic::x86_sse_min_ss:
  case Intrinsic::x86_sse_max_ss:
  case Intrinsic::x86_sse_cmp_ss:
  case Intrinsic::x86_sse2_min_sd:
  case Intrinsic::x86_sse2_max_sd:
  case Intrinsic::x86_sse2_cmp_sd: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // Lower element is undefined if both lower elements are undefined.
    // Consider things like undef & 0. The result is known zero, not undef.
    if (!UndefElts2[0])
      UndefElts.clearBit(0);
    break;
  }

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element comes from operand 1.
  case Intrinsic::x86_sse41_round_ss:
  case Intrinsic::x86_sse41_round_sd: {
    // Don't use the low element of operand 0.
    APInt DemandedElts2 = DemandedElts;
    DemandedElts2.clearBit(0);
    simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // Take the high undef elements from operand 0 and take the lower element
    // from operand 1.
    UndefElts.clearBit(0);
    UndefElts |= UndefElts2[0];
    break;
  }

  // Three input scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element is a function of all
  // three inputs.
  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_max_ss_round:
  case Intrinsic::x86_avx512_mask_min_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
  case Intrinsic::x86_avx512_mask_max_sd_round:
  case Intrinsic::x86_avx512_mask_min_sd_round:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only lower element is used for operands 1 and 2.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);

    // Lower element is undefined if all three lower elements are undefined.
    // Consider things like undef & 0. The result is known zero, not undef.
    if (!UndefElts2[0] || !UndefElts3[0])
      UndefElts.clearBit(0);
    break;

  // TODO: Add fmaddsub support?
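  // For ADDSUB the even result lanes are subtractions and the odd lanes are
  // additions, e.g. (sketch) if only even lanes are demanded the whole op
  // folds to a single fsub.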
  case Intrinsic::x86_sse3_addsub_pd:
  case Intrinsic::x86_sse3_addsub_ps:
  case Intrinsic::x86_avx_addsub_pd_256:
  case Intrinsic::x86_avx_addsub_ps_256: {
    // If none of the even or none of the odd lanes are required, turn this
    // into a generic FP math instruction.
    APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
    APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
    bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
    bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
    if (IsSubOnly || IsAddOnly) {
      assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
      IRBuilderBase::InsertPointGuard Guard(IC.Builder);
      IC.Builder.SetInsertPoint(&II);
      Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
      return IC.Builder.CreateBinOp(
          IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
    }

    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

  // General per-element vector operations.
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512: {
    auto *Ty0 = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
    assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");

    unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
    unsigned VWidthPerLane = VWidth / NumLanes;
    unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;

    // Per lane, pack the elements of the first input and then the second.
    // e.g.
    // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
    // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
    for (int OpNum = 0; OpNum != 2; ++OpNum) {
      APInt OpDemandedElts(InnerVWidth, 0);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        unsigned LaneIdx = Lane * VWidthPerLane;
        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
          unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
          if (DemandedElts[Idx])
            OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
        }
      }

      // Demand elements from the operand.
      APInt OpUndefElts(InnerVWidth, 0);
      simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);

      // Pack the operand's UNDEF elements, one lane at a time.
      OpUndefElts = OpUndefElts.zext(VWidth);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
        LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
        LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
        UndefElts |= LaneElts;
      }
    }
    break;
  }
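
  // Each result element of these shuffles reads exactly one control element
  // at the same index, so the demanded result elements map directly onto
  // operand 1 (the shuffle/permute control vector).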
  // PSHUFB
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
  // PERMILVAR
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
  // PERMV
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps: {
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  }

  // SSE4A instructions leave the upper 64-bits of the 128-bit result
  // in an undefined state.
  case Intrinsic::x86_sse4a_extrq:
  case Intrinsic::x86_sse4a_extrqi:
  case Intrinsic::x86_sse4a_insertq:
  case Intrinsic::x86_sse4a_insertqi:
    UndefElts.setHighBits(VWidth / 2);
    break;
  }
  return std::nullopt;
}