[llvm-shlib] Fix the version naming style of libLLVM for Windows (#85710)
[llvm-project.git] / llvm / lib / Target / X86 / X86InstCombineIntrinsic.cpp
blobe46fc034cc2696a9ef6de2ed19a57f7ea9734065
1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
16 #include "X86TargetTransformInfo.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsX86.h"
19 #include "llvm/Support/KnownBits.h"
20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
21 #include <optional>
23 using namespace llvm;
25 #define DEBUG_TYPE "x86tti"
27 /// Return a constant boolean vector that has true elements in all positions
28 /// where the input constant data vector has an element with the sign bit set.
29 static Constant *getNegativeIsTrueBoolVec(Constant *V) {
30 VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
31 V = ConstantExpr::getBitCast(V, IntTy);
32 V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
33 V);
34 return V;
37 /// Convert the x86 XMM integer vector mask to a vector of bools based on
38 /// each element's most significant bit (the sign bit).
39 static Value *getBoolVecFromMask(Value *Mask) {
40 // Fold Constant Mask.
41 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
42 return getNegativeIsTrueBoolVec(ConstantMask);
44 // Mask was extended from a boolean vector.
45 Value *ExtMask;
46 if (PatternMatch::match(
47 Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
48 ExtMask->getType()->isIntOrIntVectorTy(1))
49 return ExtMask;
51 return nullptr;
54 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
55 // XMM register mask efficiently, we could transform all x86 masked intrinsics
56 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
57 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
58 Value *Ptr = II.getOperand(0);
59 Value *Mask = II.getOperand(1);
60 Constant *ZeroVec = Constant::getNullValue(II.getType());
62 // Zero Mask - masked load instruction creates a zero vector.
63 if (isa<ConstantAggregateZero>(Mask))
64 return IC.replaceInstUsesWith(II, ZeroVec);
66 // The mask is constant or extended from a bool vector. Convert this x86
67 // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
68 if (Value *BoolMask = getBoolVecFromMask(Mask)) {
69 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
70 // the LLVM intrinsic definition for the pointer argument.
71 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
72 PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
73 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
75 // The pass-through vector for an x86 masked load is a zero vector.
76 CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
77 II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
78 return IC.replaceInstUsesWith(II, NewMaskedLoad);
81 return nullptr;
84 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
85 // XMM register mask efficiently, we could transform all x86 masked intrinsics
86 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
87 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
88 Value *Ptr = II.getOperand(0);
89 Value *Mask = II.getOperand(1);
90 Value *Vec = II.getOperand(2);
92 // Zero Mask - this masked store instruction does nothing.
93 if (isa<ConstantAggregateZero>(Mask)) {
94 IC.eraseInstFromFunction(II);
95 return true;
98 // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
99 // anything else at this level.
100 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
101 return false;
103 // The mask is constant or extended from a bool vector. Convert this x86
104 // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
105 if (Value *BoolMask = getBoolVecFromMask(Mask)) {
106 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
107 PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
108 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
110 IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
112 // 'Replace uses' doesn't work for stores. Erase the original masked store.
113 IC.eraseInstFromFunction(II);
114 return true;
117 return false;
120 static Value *simplifyX86immShift(const IntrinsicInst &II,
121 InstCombiner::BuilderTy &Builder) {
122 bool LogicalShift = false;
123 bool ShiftLeft = false;
124 bool IsImm = false;
126 switch (II.getIntrinsicID()) {
127 default:
128 llvm_unreachable("Unexpected intrinsic!");
129 case Intrinsic::x86_sse2_psrai_d:
130 case Intrinsic::x86_sse2_psrai_w:
131 case Intrinsic::x86_avx2_psrai_d:
132 case Intrinsic::x86_avx2_psrai_w:
133 case Intrinsic::x86_avx512_psrai_q_128:
134 case Intrinsic::x86_avx512_psrai_q_256:
135 case Intrinsic::x86_avx512_psrai_d_512:
136 case Intrinsic::x86_avx512_psrai_q_512:
137 case Intrinsic::x86_avx512_psrai_w_512:
138 IsImm = true;
139 [[fallthrough]];
140 case Intrinsic::x86_sse2_psra_d:
141 case Intrinsic::x86_sse2_psra_w:
142 case Intrinsic::x86_avx2_psra_d:
143 case Intrinsic::x86_avx2_psra_w:
144 case Intrinsic::x86_avx512_psra_q_128:
145 case Intrinsic::x86_avx512_psra_q_256:
146 case Intrinsic::x86_avx512_psra_d_512:
147 case Intrinsic::x86_avx512_psra_q_512:
148 case Intrinsic::x86_avx512_psra_w_512:
149 LogicalShift = false;
150 ShiftLeft = false;
151 break;
152 case Intrinsic::x86_sse2_psrli_d:
153 case Intrinsic::x86_sse2_psrli_q:
154 case Intrinsic::x86_sse2_psrli_w:
155 case Intrinsic::x86_avx2_psrli_d:
156 case Intrinsic::x86_avx2_psrli_q:
157 case Intrinsic::x86_avx2_psrli_w:
158 case Intrinsic::x86_avx512_psrli_d_512:
159 case Intrinsic::x86_avx512_psrli_q_512:
160 case Intrinsic::x86_avx512_psrli_w_512:
161 IsImm = true;
162 [[fallthrough]];
163 case Intrinsic::x86_sse2_psrl_d:
164 case Intrinsic::x86_sse2_psrl_q:
165 case Intrinsic::x86_sse2_psrl_w:
166 case Intrinsic::x86_avx2_psrl_d:
167 case Intrinsic::x86_avx2_psrl_q:
168 case Intrinsic::x86_avx2_psrl_w:
169 case Intrinsic::x86_avx512_psrl_d_512:
170 case Intrinsic::x86_avx512_psrl_q_512:
171 case Intrinsic::x86_avx512_psrl_w_512:
172 LogicalShift = true;
173 ShiftLeft = false;
174 break;
175 case Intrinsic::x86_sse2_pslli_d:
176 case Intrinsic::x86_sse2_pslli_q:
177 case Intrinsic::x86_sse2_pslli_w:
178 case Intrinsic::x86_avx2_pslli_d:
179 case Intrinsic::x86_avx2_pslli_q:
180 case Intrinsic::x86_avx2_pslli_w:
181 case Intrinsic::x86_avx512_pslli_d_512:
182 case Intrinsic::x86_avx512_pslli_q_512:
183 case Intrinsic::x86_avx512_pslli_w_512:
184 IsImm = true;
185 [[fallthrough]];
186 case Intrinsic::x86_sse2_psll_d:
187 case Intrinsic::x86_sse2_psll_q:
188 case Intrinsic::x86_sse2_psll_w:
189 case Intrinsic::x86_avx2_psll_d:
190 case Intrinsic::x86_avx2_psll_q:
191 case Intrinsic::x86_avx2_psll_w:
192 case Intrinsic::x86_avx512_psll_d_512:
193 case Intrinsic::x86_avx512_psll_q_512:
194 case Intrinsic::x86_avx512_psll_w_512:
195 LogicalShift = true;
196 ShiftLeft = true;
197 break;
199 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
201 Value *Vec = II.getArgOperand(0);
202 Value *Amt = II.getArgOperand(1);
203 auto *VT = cast<FixedVectorType>(Vec->getType());
204 Type *SVT = VT->getElementType();
205 Type *AmtVT = Amt->getType();
206 unsigned VWidth = VT->getNumElements();
207 unsigned BitWidth = SVT->getPrimitiveSizeInBits();
209 // If the shift amount is guaranteed to be in-range we can replace it with a
210 // generic shift. If its guaranteed to be out of range, logical shifts combine
211 // to zero and arithmetic shifts are clamped to (BitWidth - 1).
212 if (IsImm) {
213 assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
214 KnownBits KnownAmtBits =
215 llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
216 if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
217 Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
218 Amt = Builder.CreateVectorSplat(VWidth, Amt);
219 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
220 : Builder.CreateLShr(Vec, Amt))
221 : Builder.CreateAShr(Vec, Amt));
223 if (KnownAmtBits.getMinValue().uge(BitWidth)) {
224 if (LogicalShift)
225 return ConstantAggregateZero::get(VT);
226 Amt = ConstantInt::get(SVT, BitWidth - 1);
227 return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
229 } else {
230 // Ensure the first element has an in-range value and the rest of the
231 // elements in the bottom 64 bits are zero.
232 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
233 cast<VectorType>(AmtVT)->getElementType() == SVT &&
234 "Unexpected shift-by-scalar type");
235 unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
236 APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
237 APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
238 KnownBits KnownLowerBits = llvm::computeKnownBits(
239 Amt, DemandedLower, II.getModule()->getDataLayout());
240 KnownBits KnownUpperBits = llvm::computeKnownBits(
241 Amt, DemandedUpper, II.getModule()->getDataLayout());
242 if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
243 (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
244 SmallVector<int, 16> ZeroSplat(VWidth, 0);
245 Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
246 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
247 : Builder.CreateLShr(Vec, Amt))
248 : Builder.CreateAShr(Vec, Amt));
252 // Simplify if count is constant vector.
253 auto *CDV = dyn_cast<ConstantDataVector>(Amt);
254 if (!CDV)
255 return nullptr;
257 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
258 // operand to compute the shift amount.
259 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
260 cast<VectorType>(AmtVT)->getElementType() == SVT &&
261 "Unexpected shift-by-scalar type");
263 // Concatenate the sub-elements to create the 64-bit value.
264 APInt Count(64, 0);
265 for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
266 unsigned SubEltIdx = (NumSubElts - 1) - i;
267 auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
268 Count <<= BitWidth;
269 Count |= SubElt->getValue().zextOrTrunc(64);
272 // If shift-by-zero then just return the original value.
273 if (Count.isZero())
274 return Vec;
276 // Handle cases when Shift >= BitWidth.
277 if (Count.uge(BitWidth)) {
278 // If LogicalShift - just return zero.
279 if (LogicalShift)
280 return ConstantAggregateZero::get(VT);
282 // If ArithmeticShift - clamp Shift to (BitWidth - 1).
283 Count = APInt(64, BitWidth - 1);
286 // Get a constant vector of the same type as the first operand.
287 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
288 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
290 if (ShiftLeft)
291 return Builder.CreateShl(Vec, ShiftVec);
293 if (LogicalShift)
294 return Builder.CreateLShr(Vec, ShiftVec);
296 return Builder.CreateAShr(Vec, ShiftVec);
299 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
300 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
301 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
302 static Value *simplifyX86varShift(const IntrinsicInst &II,
303 InstCombiner::BuilderTy &Builder) {
304 bool LogicalShift = false;
305 bool ShiftLeft = false;
307 switch (II.getIntrinsicID()) {
308 default:
309 llvm_unreachable("Unexpected intrinsic!");
310 case Intrinsic::x86_avx2_psrav_d:
311 case Intrinsic::x86_avx2_psrav_d_256:
312 case Intrinsic::x86_avx512_psrav_q_128:
313 case Intrinsic::x86_avx512_psrav_q_256:
314 case Intrinsic::x86_avx512_psrav_d_512:
315 case Intrinsic::x86_avx512_psrav_q_512:
316 case Intrinsic::x86_avx512_psrav_w_128:
317 case Intrinsic::x86_avx512_psrav_w_256:
318 case Intrinsic::x86_avx512_psrav_w_512:
319 LogicalShift = false;
320 ShiftLeft = false;
321 break;
322 case Intrinsic::x86_avx2_psrlv_d:
323 case Intrinsic::x86_avx2_psrlv_d_256:
324 case Intrinsic::x86_avx2_psrlv_q:
325 case Intrinsic::x86_avx2_psrlv_q_256:
326 case Intrinsic::x86_avx512_psrlv_d_512:
327 case Intrinsic::x86_avx512_psrlv_q_512:
328 case Intrinsic::x86_avx512_psrlv_w_128:
329 case Intrinsic::x86_avx512_psrlv_w_256:
330 case Intrinsic::x86_avx512_psrlv_w_512:
331 LogicalShift = true;
332 ShiftLeft = false;
333 break;
334 case Intrinsic::x86_avx2_psllv_d:
335 case Intrinsic::x86_avx2_psllv_d_256:
336 case Intrinsic::x86_avx2_psllv_q:
337 case Intrinsic::x86_avx2_psllv_q_256:
338 case Intrinsic::x86_avx512_psllv_d_512:
339 case Intrinsic::x86_avx512_psllv_q_512:
340 case Intrinsic::x86_avx512_psllv_w_128:
341 case Intrinsic::x86_avx512_psllv_w_256:
342 case Intrinsic::x86_avx512_psllv_w_512:
343 LogicalShift = true;
344 ShiftLeft = true;
345 break;
347 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
349 Value *Vec = II.getArgOperand(0);
350 Value *Amt = II.getArgOperand(1);
351 auto *VT = cast<FixedVectorType>(II.getType());
352 Type *SVT = VT->getElementType();
353 int NumElts = VT->getNumElements();
354 int BitWidth = SVT->getIntegerBitWidth();
356 // If the shift amount is guaranteed to be in-range we can replace it with a
357 // generic shift.
358 KnownBits KnownAmt =
359 llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
360 if (KnownAmt.getMaxValue().ult(BitWidth)) {
361 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
362 : Builder.CreateLShr(Vec, Amt))
363 : Builder.CreateAShr(Vec, Amt));
366 // Simplify if all shift amounts are constant/undef.
367 auto *CShift = dyn_cast<Constant>(Amt);
368 if (!CShift)
369 return nullptr;
371 // Collect each element's shift amount.
372 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
373 bool AnyOutOfRange = false;
374 SmallVector<int, 8> ShiftAmts;
375 for (int I = 0; I < NumElts; ++I) {
376 auto *CElt = CShift->getAggregateElement(I);
377 if (isa_and_nonnull<UndefValue>(CElt)) {
378 ShiftAmts.push_back(-1);
379 continue;
382 auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
383 if (!COp)
384 return nullptr;
386 // Handle out of range shifts.
387 // If LogicalShift - set to BitWidth (special case).
388 // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
389 APInt ShiftVal = COp->getValue();
390 if (ShiftVal.uge(BitWidth)) {
391 AnyOutOfRange = LogicalShift;
392 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
393 continue;
396 ShiftAmts.push_back((int)ShiftVal.getZExtValue());
399 // If all elements out of range or UNDEF, return vector of zeros/undefs.
400 // ArithmeticShift should only hit this if they are all UNDEF.
401 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
402 if (llvm::all_of(ShiftAmts, OutOfRange)) {
403 SmallVector<Constant *, 8> ConstantVec;
404 for (int Idx : ShiftAmts) {
405 if (Idx < 0) {
406 ConstantVec.push_back(UndefValue::get(SVT));
407 } else {
408 assert(LogicalShift && "Logical shift expected");
409 ConstantVec.push_back(ConstantInt::getNullValue(SVT));
412 return ConstantVector::get(ConstantVec);
415 // We can't handle only some out of range values with generic logical shifts.
416 if (AnyOutOfRange)
417 return nullptr;
419 // Build the shift amount constant vector.
420 SmallVector<Constant *, 8> ShiftVecAmts;
421 for (int Idx : ShiftAmts) {
422 if (Idx < 0)
423 ShiftVecAmts.push_back(UndefValue::get(SVT));
424 else
425 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
427 auto ShiftVec = ConstantVector::get(ShiftVecAmts);
429 if (ShiftLeft)
430 return Builder.CreateShl(Vec, ShiftVec);
432 if (LogicalShift)
433 return Builder.CreateLShr(Vec, ShiftVec);
435 return Builder.CreateAShr(Vec, ShiftVec);
438 static Value *simplifyX86pack(IntrinsicInst &II,
439 InstCombiner::BuilderTy &Builder, bool IsSigned) {
440 Value *Arg0 = II.getArgOperand(0);
441 Value *Arg1 = II.getArgOperand(1);
442 Type *ResTy = II.getType();
444 // Fast all undef handling.
445 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
446 return UndefValue::get(ResTy);
448 auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
449 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
450 unsigned NumSrcElts = ArgTy->getNumElements();
451 assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
452 "Unexpected packing types");
454 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
455 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
456 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
457 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
458 "Unexpected packing types");
460 // Constant folding.
461 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
462 return nullptr;
464 // Clamp Values - signed/unsigned both use signed clamp values, but they
465 // differ on the min/max values.
466 APInt MinValue, MaxValue;
467 if (IsSigned) {
468 // PACKSS: Truncate signed value with signed saturation.
469 // Source values less than dst minint are saturated to minint.
470 // Source values greater than dst maxint are saturated to maxint.
471 MinValue =
472 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
473 MaxValue =
474 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
475 } else {
476 // PACKUS: Truncate signed value with unsigned saturation.
477 // Source values less than zero are saturated to zero.
478 // Source values greater than dst maxuint are saturated to maxuint.
479 MinValue = APInt::getZero(SrcScalarSizeInBits);
480 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
483 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
484 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
485 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
486 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
487 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
488 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
490 // Shuffle clamped args together at the lane level.
491 SmallVector<int, 32> PackMask;
492 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
493 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
494 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
495 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
496 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
498 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
500 // Truncate to dst size.
501 return Builder.CreateTrunc(Shuffle, ResTy);
504 static Value *simplifyX86movmsk(const IntrinsicInst &II,
505 InstCombiner::BuilderTy &Builder) {
506 Value *Arg = II.getArgOperand(0);
507 Type *ResTy = II.getType();
509 // movmsk(undef) -> zero as we must ensure the upper bits are zero.
510 if (isa<UndefValue>(Arg))
511 return Constant::getNullValue(ResTy);
513 auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
514 // We can't easily peek through x86_mmx types.
515 if (!ArgTy)
516 return nullptr;
518 // Expand MOVMSK to compare/bitcast/zext:
519 // e.g. PMOVMSKB(v16i8 x):
520 // %cmp = icmp slt <16 x i8> %x, zeroinitializer
521 // %int = bitcast <16 x i1> %cmp to i16
522 // %res = zext i16 %int to i32
523 unsigned NumElts = ArgTy->getNumElements();
524 Type *IntegerTy = Builder.getIntNTy(NumElts);
526 Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
527 Res = Builder.CreateIsNeg(Res);
528 Res = Builder.CreateBitCast(Res, IntegerTy);
529 Res = Builder.CreateZExtOrTrunc(Res, ResTy);
530 return Res;
533 static Value *simplifyX86addcarry(const IntrinsicInst &II,
534 InstCombiner::BuilderTy &Builder) {
535 Value *CarryIn = II.getArgOperand(0);
536 Value *Op1 = II.getArgOperand(1);
537 Value *Op2 = II.getArgOperand(2);
538 Type *RetTy = II.getType();
539 Type *OpTy = Op1->getType();
540 assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
541 RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
542 "Unexpected types for x86 addcarry");
544 // If carry-in is zero, this is just an unsigned add with overflow.
545 if (match(CarryIn, PatternMatch::m_ZeroInt())) {
546 Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
547 {Op1, Op2});
548 // The types have to be adjusted to match the x86 call types.
549 Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
550 Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
551 Builder.getInt8Ty());
552 Value *Res = PoisonValue::get(RetTy);
553 Res = Builder.CreateInsertValue(Res, UAddOV, 0);
554 return Builder.CreateInsertValue(Res, UAddResult, 1);
557 return nullptr;
560 static Value *simplifyTernarylogic(const IntrinsicInst &II,
561 InstCombiner::BuilderTy &Builder) {
563 auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
564 if (!ArgImm || ArgImm->getValue().uge(256))
565 return nullptr;
567 Value *ArgA = II.getArgOperand(0);
568 Value *ArgB = II.getArgOperand(1);
569 Value *ArgC = II.getArgOperand(2);
571 Type *Ty = II.getType();
573 auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
574 return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
576 auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
577 return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
579 auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
580 return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
582 auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
583 return {Builder.CreateNot(V.first), ~V.second};
585 auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
586 auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
587 auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };
589 bool AIsConst = match(ArgA, PatternMatch::m_ImmConstant());
590 bool BIsConst = match(ArgB, PatternMatch::m_ImmConstant());
591 bool CIsConst = match(ArgC, PatternMatch::m_ImmConstant());
593 bool ABIsConst = AIsConst && BIsConst;
594 bool ACIsConst = AIsConst && CIsConst;
595 bool BCIsConst = BIsConst && CIsConst;
596 bool ABCIsConst = AIsConst && BIsConst && CIsConst;
598 // Use for verification. Its a big table. Its difficult to go from Imm ->
599 // logic ops, but easy to verify that a set of logic ops is correct. We track
600 // the logic ops through the second value in the pair. At the end it should
601 // equal Imm.
602 std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
603 std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
604 std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
605 std::pair<Value *, uint8_t> Res = {nullptr, 0};
607 // Currently we only handle cases that convert directly to another instruction
608 // or cases where all the ops are constant. This is because we don't properly
609 // handle creating ternary ops in the backend, so splitting them here may
610 // cause regressions. As the backend improves, uncomment more cases.
612 uint8_t Imm = ArgImm->getValue().getZExtValue();
613 switch (Imm) {
614 case 0x0:
615 Res = {Constant::getNullValue(Ty), 0};
616 break;
617 case 0x1:
618 if (ABCIsConst)
619 Res = Nor(Or(A, B), C);
620 break;
621 case 0x2:
622 if (ABCIsConst)
623 Res = And(Nor(A, B), C);
624 break;
625 case 0x3:
626 if (ABIsConst)
627 Res = Nor(A, B);
628 break;
629 case 0x4:
630 if (ABCIsConst)
631 Res = And(Nor(A, C), B);
632 break;
633 case 0x5:
634 if (ACIsConst)
635 Res = Nor(A, C);
636 break;
637 case 0x6:
638 if (ABCIsConst)
639 Res = Nor(A, Xnor(B, C));
640 break;
641 case 0x7:
642 if (ABCIsConst)
643 Res = Nor(A, And(B, C));
644 break;
645 case 0x8:
646 if (ABCIsConst)
647 Res = Nor(A, Nand(B, C));
648 break;
649 case 0x9:
650 if (ABCIsConst)
651 Res = Nor(A, Xor(B, C));
652 break;
653 case 0xa:
654 if (ACIsConst)
655 Res = Nor(A, Not(C));
656 break;
657 case 0xb:
658 if (ABCIsConst)
659 Res = Nor(A, Nor(C, Not(B)));
660 break;
661 case 0xc:
662 if (ABIsConst)
663 Res = Nor(A, Not(B));
664 break;
665 case 0xd:
666 if (ABCIsConst)
667 Res = Nor(A, Nor(B, Not(C)));
668 break;
669 case 0xe:
670 if (ABCIsConst)
671 Res = Nor(A, Nor(B, C));
672 break;
673 case 0xf:
674 Res = Not(A);
675 break;
676 case 0x10:
677 if (ABCIsConst)
678 Res = And(A, Nor(B, C));
679 break;
680 case 0x11:
681 if (BCIsConst)
682 Res = Nor(B, C);
683 break;
684 case 0x12:
685 if (ABCIsConst)
686 Res = Nor(Xnor(A, C), B);
687 break;
688 case 0x13:
689 if (ABCIsConst)
690 Res = Nor(And(A, C), B);
691 break;
692 case 0x14:
693 if (ABCIsConst)
694 Res = Nor(Xnor(A, B), C);
695 break;
696 case 0x15:
697 if (ABCIsConst)
698 Res = Nor(And(A, B), C);
699 break;
700 case 0x16:
701 if (ABCIsConst)
702 Res = Xor(Xor(A, B), And(Nand(A, B), C));
703 break;
704 case 0x17:
705 if (ABCIsConst)
706 Res = Xor(Or(A, B), Or(Xnor(A, B), C));
707 break;
708 case 0x18:
709 if (ABCIsConst)
710 Res = Nor(Xnor(A, B), Xnor(A, C));
711 break;
712 case 0x19:
713 if (ABCIsConst)
714 Res = And(Nand(A, B), Xnor(B, C));
715 break;
716 case 0x1a:
717 if (ABCIsConst)
718 Res = Xor(A, Or(And(A, B), C));
719 break;
720 case 0x1b:
721 if (ABCIsConst)
722 Res = Xor(A, Or(Xnor(A, B), C));
723 break;
724 case 0x1c:
725 if (ABCIsConst)
726 Res = Xor(A, Or(And(A, C), B));
727 break;
728 case 0x1d:
729 if (ABCIsConst)
730 Res = Xor(A, Or(Xnor(A, C), B));
731 break;
732 case 0x1e:
733 if (ABCIsConst)
734 Res = Xor(A, Or(B, C));
735 break;
736 case 0x1f:
737 if (ABCIsConst)
738 Res = Nand(A, Or(B, C));
739 break;
740 case 0x20:
741 if (ABCIsConst)
742 Res = Nor(Nand(A, C), B);
743 break;
744 case 0x21:
745 if (ABCIsConst)
746 Res = Nor(Xor(A, C), B);
747 break;
748 case 0x22:
749 if (BCIsConst)
750 Res = Nor(B, Not(C));
751 break;
752 case 0x23:
753 if (ABCIsConst)
754 Res = Nor(B, Nor(C, Not(A)));
755 break;
756 case 0x24:
757 if (ABCIsConst)
758 Res = Nor(Xnor(A, B), Xor(A, C));
759 break;
760 case 0x25:
761 if (ABCIsConst)
762 Res = Xor(A, Nand(Nand(A, B), C));
763 break;
764 case 0x26:
765 if (ABCIsConst)
766 Res = And(Nand(A, B), Xor(B, C));
767 break;
768 case 0x27:
769 if (ABCIsConst)
770 Res = Xor(Or(Xnor(A, B), C), B);
771 break;
772 case 0x28:
773 if (ABCIsConst)
774 Res = And(Xor(A, B), C);
775 break;
776 case 0x29:
777 if (ABCIsConst)
778 Res = Xor(Xor(A, B), Nor(And(A, B), C));
779 break;
780 case 0x2a:
781 if (ABCIsConst)
782 Res = And(Nand(A, B), C);
783 break;
784 case 0x2b:
785 if (ABCIsConst)
786 Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
787 break;
788 case 0x2c:
789 if (ABCIsConst)
790 Res = Nor(Xnor(A, B), Nor(B, C));
791 break;
792 case 0x2d:
793 if (ABCIsConst)
794 Res = Xor(A, Or(B, Not(C)));
795 break;
796 case 0x2e:
797 if (ABCIsConst)
798 Res = Xor(A, Or(Xor(A, C), B));
799 break;
800 case 0x2f:
801 if (ABCIsConst)
802 Res = Nand(A, Or(B, Not(C)));
803 break;
804 case 0x30:
805 if (ABIsConst)
806 Res = Nor(B, Not(A));
807 break;
808 case 0x31:
809 if (ABCIsConst)
810 Res = Nor(Nor(A, Not(C)), B);
811 break;
812 case 0x32:
813 if (ABCIsConst)
814 Res = Nor(Nor(A, C), B);
815 break;
816 case 0x33:
817 Res = Not(B);
818 break;
819 case 0x34:
820 if (ABCIsConst)
821 Res = And(Xor(A, B), Nand(B, C));
822 break;
823 case 0x35:
824 if (ABCIsConst)
825 Res = Xor(B, Or(A, Xnor(B, C)));
826 break;
827 case 0x36:
828 if (ABCIsConst)
829 Res = Xor(Or(A, C), B);
830 break;
831 case 0x37:
832 if (ABCIsConst)
833 Res = Nand(Or(A, C), B);
834 break;
835 case 0x38:
836 if (ABCIsConst)
837 Res = Nor(Xnor(A, B), Nor(A, C));
838 break;
839 case 0x39:
840 if (ABCIsConst)
841 Res = Xor(Or(A, Not(C)), B);
842 break;
843 case 0x3a:
844 if (ABCIsConst)
845 Res = Xor(B, Or(A, Xor(B, C)));
846 break;
847 case 0x3b:
848 if (ABCIsConst)
849 Res = Nand(Or(A, Not(C)), B);
850 break;
851 case 0x3c:
852 Res = Xor(A, B);
853 break;
854 case 0x3d:
855 if (ABCIsConst)
856 Res = Xor(A, Or(Nor(A, C), B));
857 break;
858 case 0x3e:
859 if (ABCIsConst)
860 Res = Xor(A, Or(Nor(A, Not(C)), B));
861 break;
862 case 0x3f:
863 if (ABIsConst)
864 Res = Nand(A, B);
865 break;
866 case 0x40:
867 if (ABCIsConst)
868 Res = Nor(Nand(A, B), C);
869 break;
870 case 0x41:
871 if (ABCIsConst)
872 Res = Nor(Xor(A, B), C);
873 break;
874 case 0x42:
875 if (ABCIsConst)
876 Res = Nor(Xor(A, B), Xnor(A, C));
877 break;
878 case 0x43:
879 if (ABCIsConst)
880 Res = Xor(A, Nand(Nand(A, C), B));
881 break;
882 case 0x44:
883 if (BCIsConst)
884 Res = Nor(C, Not(B));
885 break;
886 case 0x45:
887 if (ABCIsConst)
888 Res = Nor(Nor(B, Not(A)), C);
889 break;
890 case 0x46:
891 if (ABCIsConst)
892 Res = Xor(Or(And(A, C), B), C);
893 break;
894 case 0x47:
895 if (ABCIsConst)
896 Res = Xor(Or(Xnor(A, C), B), C);
897 break;
898 case 0x48:
899 if (ABCIsConst)
900 Res = And(Xor(A, C), B);
901 break;
902 case 0x49:
903 if (ABCIsConst)
904 Res = Xor(Or(Xnor(A, B), And(A, C)), C);
905 break;
906 case 0x4a:
907 if (ABCIsConst)
908 Res = Nor(Xnor(A, C), Nor(B, C));
909 break;
910 case 0x4b:
911 if (ABCIsConst)
912 Res = Xor(A, Or(C, Not(B)));
913 break;
914 case 0x4c:
915 if (ABCIsConst)
916 Res = And(Nand(A, C), B);
917 break;
918 case 0x4d:
919 if (ABCIsConst)
920 Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
921 break;
922 case 0x4e:
923 if (ABCIsConst)
924 Res = Xor(A, Or(Xor(A, B), C));
925 break;
926 case 0x4f:
927 if (ABCIsConst)
928 Res = Nand(A, Nand(B, Not(C)));
929 break;
930 case 0x50:
931 if (ACIsConst)
932 Res = Nor(C, Not(A));
933 break;
934 case 0x51:
935 if (ABCIsConst)
936 Res = Nor(Nor(A, Not(B)), C);
937 break;
938 case 0x52:
939 if (ABCIsConst)
940 Res = And(Xor(A, C), Nand(B, C));
941 break;
942 case 0x53:
943 if (ABCIsConst)
944 Res = Xor(Or(Xnor(B, C), A), C);
945 break;
946 case 0x54:
947 if (ABCIsConst)
948 Res = Nor(Nor(A, B), C);
949 break;
950 case 0x55:
951 Res = Not(C);
952 break;
953 case 0x56:
954 if (ABCIsConst)
955 Res = Xor(Or(A, B), C);
956 break;
957 case 0x57:
958 if (ABCIsConst)
959 Res = Nand(Or(A, B), C);
960 break;
961 case 0x58:
962 if (ABCIsConst)
963 Res = Nor(Nor(A, B), Xnor(A, C));
964 break;
965 case 0x59:
966 if (ABCIsConst)
967 Res = Xor(Or(A, Not(B)), C);
968 break;
969 case 0x5a:
970 Res = Xor(A, C);
971 break;
972 case 0x5b:
973 if (ABCIsConst)
974 Res = Xor(A, Or(Nor(A, B), C));
975 break;
976 case 0x5c:
977 if (ABCIsConst)
978 Res = Xor(Or(Xor(B, C), A), C);
979 break;
980 case 0x5d:
981 if (ABCIsConst)
982 Res = Nand(Or(A, Not(B)), C);
983 break;
984 case 0x5e:
985 if (ABCIsConst)
986 Res = Xor(A, Or(Nor(A, Not(B)), C));
987 break;
988 case 0x5f:
989 if (ACIsConst)
990 Res = Nand(A, C);
991 break;
992 case 0x60:
993 if (ABCIsConst)
994 Res = And(A, Xor(B, C));
995 break;
996 case 0x61:
997 if (ABCIsConst)
998 Res = Xor(Or(Xnor(A, B), And(B, C)), C);
999 break;
1000 case 0x62:
1001 if (ABCIsConst)
1002 Res = Nor(Nor(A, C), Xnor(B, C));
1003 break;
1004 case 0x63:
1005 if (ABCIsConst)
1006 Res = Xor(B, Or(C, Not(A)));
1007 break;
1008 case 0x64:
1009 if (ABCIsConst)
1010 Res = Nor(Nor(A, B), Xnor(B, C));
1011 break;
1012 case 0x65:
1013 if (ABCIsConst)
1014 Res = Xor(Or(B, Not(A)), C);
1015 break;
1016 case 0x66:
1017 Res = Xor(B, C);
1018 break;
1019 case 0x67:
1020 if (ABCIsConst)
1021 Res = Or(Nor(A, B), Xor(B, C));
1022 break;
1023 case 0x68:
1024 if (ABCIsConst)
1025 Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
1026 break;
1027 case 0x69:
1028 if (ABCIsConst)
1029 Res = Xor(Xnor(A, B), C);
1030 break;
1031 case 0x6a:
1032 if (ABCIsConst)
1033 Res = Xor(And(A, B), C);
1034 break;
1035 case 0x6b:
1036 if (ABCIsConst)
1037 Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
1038 break;
1039 case 0x6c:
1040 if (ABCIsConst)
1041 Res = Xor(And(A, C), B);
1042 break;
1043 case 0x6d:
1044 if (ABCIsConst)
1045 Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
1046 break;
1047 case 0x6e:
1048 if (ABCIsConst)
1049 Res = Or(Nor(A, Not(B)), Xor(B, C));
1050 break;
1051 case 0x6f:
1052 if (ABCIsConst)
1053 Res = Nand(A, Xnor(B, C));
1054 break;
1055 case 0x70:
1056 if (ABCIsConst)
1057 Res = And(A, Nand(B, C));
1058 break;
1059 case 0x71:
1060 if (ABCIsConst)
1061 Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
1062 break;
1063 case 0x72:
1064 if (ABCIsConst)
1065 Res = Xor(Or(Xor(A, B), C), B);
1066 break;
1067 case 0x73:
1068 if (ABCIsConst)
1069 Res = Nand(Nand(A, Not(C)), B);
1070 break;
1071 case 0x74:
1072 if (ABCIsConst)
1073 Res = Xor(Or(Xor(A, C), B), C);
1074 break;
1075 case 0x75:
1076 if (ABCIsConst)
1077 Res = Nand(Nand(A, Not(B)), C);
1078 break;
1079 case 0x76:
1080 if (ABCIsConst)
1081 Res = Xor(B, Or(Nor(B, Not(A)), C));
1082 break;
1083 case 0x77:
1084 if (BCIsConst)
1085 Res = Nand(B, C);
1086 break;
1087 case 0x78:
1088 if (ABCIsConst)
1089 Res = Xor(A, And(B, C));
1090 break;
1091 case 0x79:
1092 if (ABCIsConst)
1093 Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
1094 break;
1095 case 0x7a:
1096 if (ABCIsConst)
1097 Res = Or(Xor(A, C), Nor(B, Not(A)));
1098 break;
1099 case 0x7b:
1100 if (ABCIsConst)
1101 Res = Nand(Xnor(A, C), B);
1102 break;
1103 case 0x7c:
1104 if (ABCIsConst)
1105 Res = Or(Xor(A, B), Nor(C, Not(A)));
1106 break;
1107 case 0x7d:
1108 if (ABCIsConst)
1109 Res = Nand(Xnor(A, B), C);
1110 break;
1111 case 0x7e:
1112 if (ABCIsConst)
1113 Res = Or(Xor(A, B), Xor(A, C));
1114 break;
1115 case 0x7f:
1116 if (ABCIsConst)
1117 Res = Nand(And(A, B), C);
1118 break;
1119 case 0x80:
1120 if (ABCIsConst)
1121 Res = And(And(A, B), C);
1122 break;
1123 case 0x81:
1124 if (ABCIsConst)
1125 Res = Nor(Xor(A, B), Xor(A, C));
1126 break;
1127 case 0x82:
1128 if (ABCIsConst)
1129 Res = And(Xnor(A, B), C);
1130 break;
1131 case 0x83:
1132 if (ABCIsConst)
1133 Res = Nor(Xor(A, B), Nor(C, Not(A)));
1134 break;
1135 case 0x84:
1136 if (ABCIsConst)
1137 Res = And(Xnor(A, C), B);
1138 break;
1139 case 0x85:
1140 if (ABCIsConst)
1141 Res = Nor(Xor(A, C), Nor(B, Not(A)));
1142 break;
1143 case 0x86:
1144 if (ABCIsConst)
1145 Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
1146 break;
1147 case 0x87:
1148 if (ABCIsConst)
1149 Res = Xor(A, Nand(B, C));
1150 break;
1151 case 0x88:
1152 Res = And(B, C);
1153 break;
1154 case 0x89:
1155 if (ABCIsConst)
1156 Res = Xor(B, Nor(Nor(B, Not(A)), C));
1157 break;
1158 case 0x8a:
1159 if (ABCIsConst)
1160 Res = And(Nand(A, Not(B)), C);
1161 break;
1162 case 0x8b:
1163 if (ABCIsConst)
1164 Res = Xor(Nor(Xor(A, C), B), C);
1165 break;
1166 case 0x8c:
1167 if (ABCIsConst)
1168 Res = And(Nand(A, Not(C)), B);
1169 break;
1170 case 0x8d:
1171 if (ABCIsConst)
1172 Res = Xor(Nor(Xor(A, B), C), B);
1173 break;
1174 case 0x8e:
1175 if (ABCIsConst)
1176 Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
1177 break;
1178 case 0x8f:
1179 if (ABCIsConst)
1180 Res = Nand(A, Nand(B, C));
1181 break;
1182 case 0x90:
1183 if (ABCIsConst)
1184 Res = And(A, Xnor(B, C));
1185 break;
1186 case 0x91:
1187 if (ABCIsConst)
1188 Res = Nor(Nor(A, Not(B)), Xor(B, C));
1189 break;
1190 case 0x92:
1191 if (ABCIsConst)
1192 Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
1193 break;
1194 case 0x93:
1195 if (ABCIsConst)
1196 Res = Xor(Nand(A, C), B);
1197 break;
1198 case 0x94:
1199 if (ABCIsConst)
1200 Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
1201 break;
1202 case 0x95:
1203 if (ABCIsConst)
1204 Res = Xor(Nand(A, B), C);
1205 break;
1206 case 0x96:
1207 if (ABCIsConst)
1208 Res = Xor(Xor(A, B), C);
1209 break;
1210 case 0x97:
1211 if (ABCIsConst)
1212 Res = Xor(Xor(A, B), Or(Nor(A, B), C));
1213 break;
1214 case 0x98:
1215 if (ABCIsConst)
1216 Res = Nor(Nor(A, B), Xor(B, C));
1217 break;
1218 case 0x99:
1219 if (BCIsConst)
1220 Res = Xnor(B, C);
1221 break;
1222 case 0x9a:
1223 if (ABCIsConst)
1224 Res = Xor(Nor(B, Not(A)), C);
1225 break;
1226 case 0x9b:
1227 if (ABCIsConst)
1228 Res = Or(Nor(A, B), Xnor(B, C));
1229 break;
1230 case 0x9c:
1231 if (ABCIsConst)
1232 Res = Xor(B, Nor(C, Not(A)));
1233 break;
1234 case 0x9d:
1235 if (ABCIsConst)
1236 Res = Or(Nor(A, C), Xnor(B, C));
1237 break;
1238 case 0x9e:
1239 if (ABCIsConst)
1240 Res = Xor(And(Xor(A, B), Nand(B, C)), C);
1241 break;
1242 case 0x9f:
1243 if (ABCIsConst)
1244 Res = Nand(A, Xor(B, C));
1245 break;
1246 case 0xa0:
1247 Res = And(A, C);
1248 break;
1249 case 0xa1:
1250 if (ABCIsConst)
1251 Res = Xor(A, Nor(Nor(A, Not(B)), C));
1252 break;
1253 case 0xa2:
1254 if (ABCIsConst)
1255 Res = And(Or(A, Not(B)), C);
1256 break;
1257 case 0xa3:
1258 if (ABCIsConst)
1259 Res = Xor(Nor(Xor(B, C), A), C);
1260 break;
1261 case 0xa4:
1262 if (ABCIsConst)
1263 Res = Xor(A, Nor(Nor(A, B), C));
1264 break;
1265 case 0xa5:
1266 if (ACIsConst)
1267 Res = Xnor(A, C);
1268 break;
1269 case 0xa6:
1270 if (ABCIsConst)
1271 Res = Xor(Nor(A, Not(B)), C);
1272 break;
1273 case 0xa7:
1274 if (ABCIsConst)
1275 Res = Or(Nor(A, B), Xnor(A, C));
1276 break;
1277 case 0xa8:
1278 if (ABCIsConst)
1279 Res = And(Or(A, B), C);
1280 break;
1281 case 0xa9:
1282 if (ABCIsConst)
1283 Res = Xor(Nor(A, B), C);
1284 break;
1285 case 0xaa:
1286 Res = C;
1287 break;
1288 case 0xab:
1289 if (ABCIsConst)
1290 Res = Or(Nor(A, B), C);
1291 break;
1292 case 0xac:
1293 if (ABCIsConst)
1294 Res = Xor(Nor(Xnor(B, C), A), C);
1295 break;
1296 case 0xad:
1297 if (ABCIsConst)
1298 Res = Or(Xnor(A, C), And(B, C));
1299 break;
1300 case 0xae:
1301 if (ABCIsConst)
1302 Res = Or(Nor(A, Not(B)), C);
1303 break;
1304 case 0xaf:
1305 if (ACIsConst)
1306 Res = Or(C, Not(A));
1307 break;
1308 case 0xb0:
1309 if (ABCIsConst)
1310 Res = And(A, Nand(B, Not(C)));
1311 break;
1312 case 0xb1:
1313 if (ABCIsConst)
1314 Res = Xor(A, Nor(Xor(A, B), C));
1315 break;
1316 case 0xb2:
1317 if (ABCIsConst)
1318 Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
1319 break;
1320 case 0xb3:
1321 if (ABCIsConst)
1322 Res = Nand(Nand(A, C), B);
1323 break;
1324 case 0xb4:
1325 if (ABCIsConst)
1326 Res = Xor(A, Nor(C, Not(B)));
1327 break;
1328 case 0xb5:
1329 if (ABCIsConst)
1330 Res = Or(Xnor(A, C), Nor(B, C));
1331 break;
1332 case 0xb6:
1333 if (ABCIsConst)
1334 Res = Xor(And(Xor(A, B), Nand(A, C)), C);
1335 break;
1336 case 0xb7:
1337 if (ABCIsConst)
1338 Res = Nand(Xor(A, C), B);
1339 break;
1340 case 0xb8:
1341 if (ABCIsConst)
1342 Res = Xor(Nor(Xnor(A, C), B), C);
1343 break;
1344 case 0xb9:
1345 if (ABCIsConst)
1346 Res = Xor(Nor(And(A, C), B), C);
1347 break;
1348 case 0xba:
1349 if (ABCIsConst)
1350 Res = Or(Nor(B, Not(A)), C);
1351 break;
1352 case 0xbb:
1353 if (BCIsConst)
1354 Res = Or(C, Not(B));
1355 break;
1356 case 0xbc:
1357 if (ABCIsConst)
1358 Res = Xor(A, And(Nand(A, C), B));
1359 break;
1360 case 0xbd:
1361 if (ABCIsConst)
1362 Res = Or(Xor(A, B), Xnor(A, C));
1363 break;
1364 case 0xbe:
1365 if (ABCIsConst)
1366 Res = Or(Xor(A, B), C);
1367 break;
1368 case 0xbf:
1369 if (ABCIsConst)
1370 Res = Or(Nand(A, B), C);
1371 break;
1372 case 0xc0:
1373 Res = And(A, B);
1374 break;
1375 case 0xc1:
1376 if (ABCIsConst)
1377 Res = Xor(A, Nor(Nor(A, Not(C)), B));
1378 break;
1379 case 0xc2:
1380 if (ABCIsConst)
1381 Res = Xor(A, Nor(Nor(A, C), B));
1382 break;
1383 case 0xc3:
1384 if (ABIsConst)
1385 Res = Xnor(A, B);
1386 break;
1387 case 0xc4:
1388 if (ABCIsConst)
1389 Res = And(Or(A, Not(C)), B);
1390 break;
1391 case 0xc5:
1392 if (ABCIsConst)
1393 Res = Xor(B, Nor(A, Xor(B, C)));
1394 break;
1395 case 0xc6:
1396 if (ABCIsConst)
1397 Res = Xor(Nor(A, Not(C)), B);
1398 break;
1399 case 0xc7:
1400 if (ABCIsConst)
1401 Res = Or(Xnor(A, B), Nor(A, C));
1402 break;
1403 case 0xc8:
1404 if (ABCIsConst)
1405 Res = And(Or(A, C), B);
1406 break;
1407 case 0xc9:
1408 if (ABCIsConst)
1409 Res = Xor(Nor(A, C), B);
1410 break;
1411 case 0xca:
1412 if (ABCIsConst)
1413 Res = Xor(B, Nor(A, Xnor(B, C)));
1414 break;
1415 case 0xcb:
1416 if (ABCIsConst)
1417 Res = Or(Xnor(A, B), And(B, C));
1418 break;
1419 case 0xcc:
1420 Res = B;
1421 break;
1422 case 0xcd:
1423 if (ABCIsConst)
1424 Res = Or(Nor(A, C), B);
1425 break;
1426 case 0xce:
1427 if (ABCIsConst)
1428 Res = Or(Nor(A, Not(C)), B);
1429 break;
1430 case 0xcf:
1431 if (ABIsConst)
1432 Res = Or(B, Not(A));
1433 break;
1434 case 0xd0:
1435 if (ABCIsConst)
1436 Res = And(A, Or(B, Not(C)));
1437 break;
1438 case 0xd1:
1439 if (ABCIsConst)
1440 Res = Xor(A, Nor(Xor(A, C), B));
1441 break;
1442 case 0xd2:
1443 if (ABCIsConst)
1444 Res = Xor(A, Nor(B, Not(C)));
1445 break;
1446 case 0xd3:
1447 if (ABCIsConst)
1448 Res = Or(Xnor(A, B), Nor(B, C));
1449 break;
1450 case 0xd4:
1451 if (ABCIsConst)
1452 Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
1453 break;
1454 case 0xd5:
1455 if (ABCIsConst)
1456 Res = Nand(Nand(A, B), C);
1457 break;
1458 case 0xd6:
1459 if (ABCIsConst)
1460 Res = Xor(Xor(A, B), Or(And(A, B), C));
1461 break;
1462 case 0xd7:
1463 if (ABCIsConst)
1464 Res = Nand(Xor(A, B), C);
1465 break;
1466 case 0xd8:
1467 if (ABCIsConst)
1468 Res = Xor(Nor(Xnor(A, B), C), B);
1469 break;
1470 case 0xd9:
1471 if (ABCIsConst)
1472 Res = Or(And(A, B), Xnor(B, C));
1473 break;
1474 case 0xda:
1475 if (ABCIsConst)
1476 Res = Xor(A, And(Nand(A, B), C));
1477 break;
1478 case 0xdb:
1479 if (ABCIsConst)
1480 Res = Or(Xnor(A, B), Xor(A, C));
1481 break;
1482 case 0xdc:
1483 if (ABCIsConst)
1484 Res = Or(B, Nor(C, Not(A)));
1485 break;
1486 case 0xdd:
1487 if (BCIsConst)
1488 Res = Or(B, Not(C));
1489 break;
1490 case 0xde:
1491 if (ABCIsConst)
1492 Res = Or(Xor(A, C), B);
1493 break;
1494 case 0xdf:
1495 if (ABCIsConst)
1496 Res = Or(Nand(A, C), B);
1497 break;
1498 case 0xe0:
1499 if (ABCIsConst)
1500 Res = And(A, Or(B, C));
1501 break;
1502 case 0xe1:
1503 if (ABCIsConst)
1504 Res = Xor(A, Nor(B, C));
1505 break;
1506 case 0xe2:
1507 if (ABCIsConst)
1508 Res = Xor(A, Nor(Xnor(A, C), B));
1509 break;
1510 case 0xe3:
1511 if (ABCIsConst)
1512 Res = Xor(A, Nor(And(A, C), B));
1513 break;
1514 case 0xe4:
1515 if (ABCIsConst)
1516 Res = Xor(A, Nor(Xnor(A, B), C));
1517 break;
1518 case 0xe5:
1519 if (ABCIsConst)
1520 Res = Xor(A, Nor(And(A, B), C));
1521 break;
1522 case 0xe6:
1523 if (ABCIsConst)
1524 Res = Or(And(A, B), Xor(B, C));
1525 break;
1526 case 0xe7:
1527 if (ABCIsConst)
1528 Res = Or(Xnor(A, B), Xnor(A, C));
1529 break;
1530 case 0xe8:
1531 if (ABCIsConst)
1532 Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
1533 break;
1534 case 0xe9:
1535 if (ABCIsConst)
1536 Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
1537 break;
1538 case 0xea:
1539 if (ABCIsConst)
1540 Res = Or(And(A, B), C);
1541 break;
1542 case 0xeb:
1543 if (ABCIsConst)
1544 Res = Or(Xnor(A, B), C);
1545 break;
1546 case 0xec:
1547 if (ABCIsConst)
1548 Res = Or(And(A, C), B);
1549 break;
1550 case 0xed:
1551 if (ABCIsConst)
1552 Res = Or(Xnor(A, C), B);
1553 break;
1554 case 0xee:
1555 Res = Or(B, C);
1556 break;
1557 case 0xef:
1558 if (ABCIsConst)
1559 Res = Nand(A, Nor(B, C));
1560 break;
1561 case 0xf0:
1562 Res = A;
1563 break;
1564 case 0xf1:
1565 if (ABCIsConst)
1566 Res = Or(A, Nor(B, C));
1567 break;
1568 case 0xf2:
1569 if (ABCIsConst)
1570 Res = Or(A, Nor(B, Not(C)));
1571 break;
1572 case 0xf3:
1573 if (ABIsConst)
1574 Res = Or(A, Not(B));
1575 break;
1576 case 0xf4:
1577 if (ABCIsConst)
1578 Res = Or(A, Nor(C, Not(B)));
1579 break;
1580 case 0xf5:
1581 if (ACIsConst)
1582 Res = Or(A, Not(C));
1583 break;
1584 case 0xf6:
1585 if (ABCIsConst)
1586 Res = Or(A, Xor(B, C));
1587 break;
1588 case 0xf7:
1589 if (ABCIsConst)
1590 Res = Or(A, Nand(B, C));
1591 break;
1592 case 0xf8:
1593 if (ABCIsConst)
1594 Res = Or(A, And(B, C));
1595 break;
1596 case 0xf9:
1597 if (ABCIsConst)
1598 Res = Or(A, Xnor(B, C));
1599 break;
1600 case 0xfa:
1601 Res = Or(A, C);
1602 break;
1603 case 0xfb:
1604 if (ABCIsConst)
1605 Res = Nand(Nor(A, C), B);
1606 break;
1607 case 0xfc:
1608 Res = Or(A, B);
1609 break;
1610 case 0xfd:
1611 if (ABCIsConst)
1612 Res = Nand(Nor(A, B), C);
1613 break;
1614 case 0xfe:
1615 if (ABCIsConst)
1616 Res = Or(Or(A, B), C);
1617 break;
1618 case 0xff:
1619 Res = {Constant::getAllOnesValue(Ty), 0xff};
1620 break;
1623 assert((Res.first == nullptr || Res.second == Imm) &&
1624 "Simplification of ternary logic does not verify!");
1625 return Res.first;
1628 static Value *simplifyX86insertps(const IntrinsicInst &II,
1629 InstCombiner::BuilderTy &Builder) {
1630 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
1631 if (!CInt)
1632 return nullptr;
1634 auto *VecTy = cast<FixedVectorType>(II.getType());
1635 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
1637 // The immediate permute control byte looks like this:
1638 // [3:0] - zero mask for each 32-bit lane
1639 // [5:4] - select one 32-bit destination lane
1640 // [7:6] - select one 32-bit source lane
1642 uint8_t Imm = CInt->getZExtValue();
1643 uint8_t ZMask = Imm & 0xf;
1644 uint8_t DestLane = (Imm >> 4) & 0x3;
1645 uint8_t SourceLane = (Imm >> 6) & 0x3;
1647 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
1649 // If all zero mask bits are set, this was just a weird way to
1650 // generate a zero vector.
1651 if (ZMask == 0xf)
1652 return ZeroVector;
1654 // Initialize by passing all of the first source bits through.
1655 int ShuffleMask[4] = {0, 1, 2, 3};
1657 // We may replace the second operand with the zero vector.
1658 Value *V1 = II.getArgOperand(1);
1660 if (ZMask) {
1661 // If the zero mask is being used with a single input or the zero mask
1662 // overrides the destination lane, this is a shuffle with the zero vector.
1663 if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
1664 (ZMask & (1 << DestLane))) {
1665 V1 = ZeroVector;
1666 // We may still move 32-bits of the first source vector from one lane
1667 // to another.
1668 ShuffleMask[DestLane] = SourceLane;
1669 // The zero mask may override the previous insert operation.
1670 for (unsigned i = 0; i < 4; ++i)
1671 if ((ZMask >> i) & 0x1)
1672 ShuffleMask[i] = i + 4;
1673 } else {
1674 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
1675 return nullptr;
1677 } else {
1678 // Replace the selected destination lane with the selected source lane.
1679 ShuffleMask[DestLane] = SourceLane + 4;
1682 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
1685 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
1686 /// or conversion to a shuffle vector.
1687 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
1688 ConstantInt *CILength, ConstantInt *CIIndex,
1689 InstCombiner::BuilderTy &Builder) {
1690 auto LowConstantHighUndef = [&](uint64_t Val) {
1691 Type *IntTy64 = Type::getInt64Ty(II.getContext());
1692 Constant *Args[] = {ConstantInt::get(IntTy64, Val),
1693 UndefValue::get(IntTy64)};
1694 return ConstantVector::get(Args);
1697 // See if we're dealing with constant values.
1698 auto *C0 = dyn_cast<Constant>(Op0);
1699 auto *CI0 =
1700 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1701 : nullptr;
1703 // Attempt to constant fold.
1704 if (CILength && CIIndex) {
1705 // From AMD documentation: "The bit index and field length are each six
1706 // bits in length other bits of the field are ignored."
1707 APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
1708 APInt APLength = CILength->getValue().zextOrTrunc(6);
1710 unsigned Index = APIndex.getZExtValue();
1712 // From AMD documentation: "a value of zero in the field length is
1713 // defined as length of 64".
1714 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1716 // From AMD documentation: "If the sum of the bit index + length field
1717 // is greater than 64, the results are undefined".
1718 unsigned End = Index + Length;
1720 // Note that both field index and field length are 8-bit quantities.
1721 // Since variables 'Index' and 'Length' are unsigned values
1722 // obtained from zero-extending field index and field length
1723 // respectively, their sum should never wrap around.
1724 if (End > 64)
1725 return UndefValue::get(II.getType());
1727 // If we are inserting whole bytes, we can convert this to a shuffle.
1728 // Lowering can recognize EXTRQI shuffle masks.
1729 if ((Length % 8) == 0 && (Index % 8) == 0) {
1730 // Convert bit indices to byte indices.
1731 Length /= 8;
1732 Index /= 8;
1734 Type *IntTy8 = Type::getInt8Ty(II.getContext());
1735 auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1737 SmallVector<int, 16> ShuffleMask;
1738 for (int i = 0; i != (int)Length; ++i)
1739 ShuffleMask.push_back(i + Index);
1740 for (int i = Length; i != 8; ++i)
1741 ShuffleMask.push_back(i + 16);
1742 for (int i = 8; i != 16; ++i)
1743 ShuffleMask.push_back(-1);
1745 Value *SV = Builder.CreateShuffleVector(
1746 Builder.CreateBitCast(Op0, ShufTy),
1747 ConstantAggregateZero::get(ShufTy), ShuffleMask);
1748 return Builder.CreateBitCast(SV, II.getType());
1751 // Constant Fold - shift Index'th bit to lowest position and mask off
1752 // Length bits.
1753 if (CI0) {
1754 APInt Elt = CI0->getValue();
1755 Elt.lshrInPlace(Index);
1756 Elt = Elt.zextOrTrunc(Length);
1757 return LowConstantHighUndef(Elt.getZExtValue());
1760 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
1761 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
1762 Value *Args[] = {Op0, CILength, CIIndex};
1763 Module *M = II.getModule();
1764 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
1765 return Builder.CreateCall(F, Args);
1769 // Constant Fold - extraction from zero is always {zero, undef}.
1770 if (CI0 && CI0->isZero())
1771 return LowConstantHighUndef(0);
1773 return nullptr;
1776 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
1777 /// folding or conversion to a shuffle vector.
1778 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
1779 APInt APLength, APInt APIndex,
1780 InstCombiner::BuilderTy &Builder) {
1781 // From AMD documentation: "The bit index and field length are each six bits
1782 // in length other bits of the field are ignored."
1783 APIndex = APIndex.zextOrTrunc(6);
1784 APLength = APLength.zextOrTrunc(6);
1786 // Attempt to constant fold.
1787 unsigned Index = APIndex.getZExtValue();
1789 // From AMD documentation: "a value of zero in the field length is
1790 // defined as length of 64".
1791 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1793 // From AMD documentation: "If the sum of the bit index + length field
1794 // is greater than 64, the results are undefined".
1795 unsigned End = Index + Length;
1797 // Note that both field index and field length are 8-bit quantities.
1798 // Since variables 'Index' and 'Length' are unsigned values
1799 // obtained from zero-extending field index and field length
1800 // respectively, their sum should never wrap around.
1801 if (End > 64)
1802 return UndefValue::get(II.getType());
1804 // If we are inserting whole bytes, we can convert this to a shuffle.
1805 // Lowering can recognize INSERTQI shuffle masks.
1806 if ((Length % 8) == 0 && (Index % 8) == 0) {
1807 // Convert bit indices to byte indices.
1808 Length /= 8;
1809 Index /= 8;
1811 Type *IntTy8 = Type::getInt8Ty(II.getContext());
1812 auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1814 SmallVector<int, 16> ShuffleMask;
1815 for (int i = 0; i != (int)Index; ++i)
1816 ShuffleMask.push_back(i);
1817 for (int i = 0; i != (int)Length; ++i)
1818 ShuffleMask.push_back(i + 16);
1819 for (int i = Index + Length; i != 8; ++i)
1820 ShuffleMask.push_back(i);
1821 for (int i = 8; i != 16; ++i)
1822 ShuffleMask.push_back(-1);
1824 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
1825 Builder.CreateBitCast(Op1, ShufTy),
1826 ShuffleMask);
1827 return Builder.CreateBitCast(SV, II.getType());
1830 // See if we're dealing with constant values.
1831 auto *C0 = dyn_cast<Constant>(Op0);
1832 auto *C1 = dyn_cast<Constant>(Op1);
1833 auto *CI00 =
1834 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1835 : nullptr;
1836 auto *CI10 =
1837 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1838 : nullptr;
1840 // Constant Fold - insert bottom Length bits starting at the Index'th bit.
1841 if (CI00 && CI10) {
1842 APInt V00 = CI00->getValue();
1843 APInt V10 = CI10->getValue();
1844 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
1845 V00 = V00 & ~Mask;
1846 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
1847 APInt Val = V00 | V10;
1848 Type *IntTy64 = Type::getInt64Ty(II.getContext());
1849 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
1850 UndefValue::get(IntTy64)};
1851 return ConstantVector::get(Args);
1854 // If we were an INSERTQ call, we'll save demanded elements if we convert to
1855 // INSERTQI.
1856 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
1857 Type *IntTy8 = Type::getInt8Ty(II.getContext());
1858 Constant *CILength = ConstantInt::get(IntTy8, Length, false);
1859 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
1861 Value *Args[] = {Op0, Op1, CILength, CIIndex};
1862 Module *M = II.getModule();
1863 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
1864 return Builder.CreateCall(F, Args);
1867 return nullptr;
1870 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
1871 static Value *simplifyX86pshufb(const IntrinsicInst &II,
1872 InstCombiner::BuilderTy &Builder) {
1873 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1874 if (!V)
1875 return nullptr;
1877 auto *VecTy = cast<FixedVectorType>(II.getType());
1878 unsigned NumElts = VecTy->getNumElements();
1879 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
1880 "Unexpected number of elements in shuffle mask!");
1882 // Construct a shuffle mask from constant integers or UNDEFs.
1883 int Indexes[64];
1885 // Each byte in the shuffle control mask forms an index to permute the
1886 // corresponding byte in the destination operand.
1887 for (unsigned I = 0; I < NumElts; ++I) {
1888 Constant *COp = V->getAggregateElement(I);
1889 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1890 return nullptr;
1892 if (isa<UndefValue>(COp)) {
1893 Indexes[I] = -1;
1894 continue;
1897 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
1899 // If the most significant bit (bit[7]) of each byte of the shuffle
1900 // control mask is set, then zero is written in the result byte.
1901 // The zero vector is in the right-hand side of the resulting
1902 // shufflevector.
1904 // The value of each index for the high 128-bit lane is the least
1905 // significant 4 bits of the respective shuffle control byte.
1906 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
1907 Indexes[I] = Index;
1910 auto V1 = II.getArgOperand(0);
1911 auto V2 = Constant::getNullValue(VecTy);
1912 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
1915 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
1916 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
1917 InstCombiner::BuilderTy &Builder) {
1918 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1919 if (!V)
1920 return nullptr;
1922 auto *VecTy = cast<FixedVectorType>(II.getType());
1923 unsigned NumElts = VecTy->getNumElements();
1924 bool IsPD = VecTy->getScalarType()->isDoubleTy();
1925 unsigned NumLaneElts = IsPD ? 2 : 4;
1926 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
1928 // Construct a shuffle mask from constant integers or UNDEFs.
1929 int Indexes[16];
1931 // The intrinsics only read one or two bits, clear the rest.
1932 for (unsigned I = 0; I < NumElts; ++I) {
1933 Constant *COp = V->getAggregateElement(I);
1934 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1935 return nullptr;
1937 if (isa<UndefValue>(COp)) {
1938 Indexes[I] = -1;
1939 continue;
1942 APInt Index = cast<ConstantInt>(COp)->getValue();
1943 Index = Index.zextOrTrunc(32).getLoBits(2);
1945 // The PD variants uses bit 1 to select per-lane element index, so
1946 // shift down to convert to generic shuffle mask index.
1947 if (IsPD)
1948 Index.lshrInPlace(1);
1950 // The _256 variants are a bit trickier since the mask bits always index
1951 // into the corresponding 128 half. In order to convert to a generic
1952 // shuffle, we have to make that explicit.
1953 Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
1955 Indexes[I] = Index.getZExtValue();
1958 auto V1 = II.getArgOperand(0);
1959 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
1962 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
1963 static Value *simplifyX86vpermv(const IntrinsicInst &II,
1964 InstCombiner::BuilderTy &Builder) {
1965 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1966 if (!V)
1967 return nullptr;
1969 auto *VecTy = cast<FixedVectorType>(II.getType());
1970 unsigned Size = VecTy->getNumElements();
1971 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
1972 "Unexpected shuffle mask size");
1974 // Construct a shuffle mask from constant integers or UNDEFs.
1975 int Indexes[64];
1977 for (unsigned I = 0; I < Size; ++I) {
1978 Constant *COp = V->getAggregateElement(I);
1979 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1980 return nullptr;
1982 if (isa<UndefValue>(COp)) {
1983 Indexes[I] = -1;
1984 continue;
1987 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
1988 Index &= Size - 1;
1989 Indexes[I] = Index;
1992 auto V1 = II.getArgOperand(0);
1993 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
1996 std::optional<Instruction *>
1997 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
1998 auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
1999 unsigned DemandedWidth) {
2000 APInt UndefElts(Width, 0);
2001 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
2002 return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
2005 Intrinsic::ID IID = II.getIntrinsicID();
2006 switch (IID) {
2007 case Intrinsic::x86_bmi_bextr_32:
2008 case Intrinsic::x86_bmi_bextr_64:
2009 case Intrinsic::x86_tbm_bextri_u32:
2010 case Intrinsic::x86_tbm_bextri_u64:
2011 // If the RHS is a constant we can try some simplifications.
2012 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2013 uint64_t Shift = C->getZExtValue();
2014 uint64_t Length = (Shift >> 8) & 0xff;
2015 Shift &= 0xff;
2016 unsigned BitWidth = II.getType()->getIntegerBitWidth();
2017 // If the length is 0 or the shift is out of range, replace with zero.
2018 if (Length == 0 || Shift >= BitWidth) {
2019 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2021 // If the LHS is also a constant, we can completely constant fold this.
2022 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2023 uint64_t Result = InC->getZExtValue() >> Shift;
2024 if (Length > BitWidth)
2025 Length = BitWidth;
2026 Result &= maskTrailingOnes<uint64_t>(Length);
2027 return IC.replaceInstUsesWith(II,
2028 ConstantInt::get(II.getType(), Result));
2030 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2031 // are only masking bits that a shift already cleared?
2033 break;
2035 case Intrinsic::x86_bmi_bzhi_32:
2036 case Intrinsic::x86_bmi_bzhi_64:
2037 // If the RHS is a constant we can try some simplifications.
2038 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2039 uint64_t Index = C->getZExtValue() & 0xff;
2040 unsigned BitWidth = II.getType()->getIntegerBitWidth();
2041 if (Index >= BitWidth) {
2042 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2044 if (Index == 0) {
2045 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2047 // If the LHS is also a constant, we can completely constant fold this.
2048 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2049 uint64_t Result = InC->getZExtValue();
2050 Result &= maskTrailingOnes<uint64_t>(Index);
2051 return IC.replaceInstUsesWith(II,
2052 ConstantInt::get(II.getType(), Result));
2054 // TODO should we convert this to an AND if the RHS is constant?
2056 break;
2057 case Intrinsic::x86_bmi_pext_32:
2058 case Intrinsic::x86_bmi_pext_64:
2059 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2060 if (MaskC->isNullValue()) {
2061 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2063 if (MaskC->isAllOnesValue()) {
2064 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2067 unsigned MaskIdx, MaskLen;
2068 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2069 // Any single contiguous sequence of 1s anywhere in the mask simply
2070 // describes a subset of the input bits shifted to the appropriate
2071 // position. Replace with the straightforward IR.
2072 Value *Input = II.getArgOperand(0);
2073 Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2074 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2075 Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2076 return IC.replaceInstUsesWith(II, Shifted);
2079 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2080 uint64_t Src = SrcC->getZExtValue();
2081 uint64_t Mask = MaskC->getZExtValue();
2082 uint64_t Result = 0;
2083 uint64_t BitToSet = 1;
2085 while (Mask) {
2086 // Isolate lowest set bit.
2087 uint64_t BitToTest = Mask & -Mask;
2088 if (BitToTest & Src)
2089 Result |= BitToSet;
2091 BitToSet <<= 1;
2092 // Clear lowest set bit.
2093 Mask &= Mask - 1;
2096 return IC.replaceInstUsesWith(II,
2097 ConstantInt::get(II.getType(), Result));
2100 break;
2101 case Intrinsic::x86_bmi_pdep_32:
2102 case Intrinsic::x86_bmi_pdep_64:
2103 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2104 if (MaskC->isNullValue()) {
2105 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2107 if (MaskC->isAllOnesValue()) {
2108 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2111 unsigned MaskIdx, MaskLen;
2112 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2113 // Any single contiguous sequence of 1s anywhere in the mask simply
2114 // describes a subset of the input bits shifted to the appropriate
2115 // position. Replace with the straightforward IR.
2116 Value *Input = II.getArgOperand(0);
2117 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2118 Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2119 Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2120 return IC.replaceInstUsesWith(II, Masked);
2123 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2124 uint64_t Src = SrcC->getZExtValue();
2125 uint64_t Mask = MaskC->getZExtValue();
2126 uint64_t Result = 0;
2127 uint64_t BitToTest = 1;
2129 while (Mask) {
2130 // Isolate lowest set bit.
2131 uint64_t BitToSet = Mask & -Mask;
2132 if (BitToTest & Src)
2133 Result |= BitToSet;
2135 BitToTest <<= 1;
2136 // Clear lowest set bit;
2137 Mask &= Mask - 1;
2140 return IC.replaceInstUsesWith(II,
2141 ConstantInt::get(II.getType(), Result));
2144 break;
2146 case Intrinsic::x86_sse_cvtss2si:
2147 case Intrinsic::x86_sse_cvtss2si64:
2148 case Intrinsic::x86_sse_cvttss2si:
2149 case Intrinsic::x86_sse_cvttss2si64:
2150 case Intrinsic::x86_sse2_cvtsd2si:
2151 case Intrinsic::x86_sse2_cvtsd2si64:
2152 case Intrinsic::x86_sse2_cvttsd2si:
2153 case Intrinsic::x86_sse2_cvttsd2si64:
2154 case Intrinsic::x86_avx512_vcvtss2si32:
2155 case Intrinsic::x86_avx512_vcvtss2si64:
2156 case Intrinsic::x86_avx512_vcvtss2usi32:
2157 case Intrinsic::x86_avx512_vcvtss2usi64:
2158 case Intrinsic::x86_avx512_vcvtsd2si32:
2159 case Intrinsic::x86_avx512_vcvtsd2si64:
2160 case Intrinsic::x86_avx512_vcvtsd2usi32:
2161 case Intrinsic::x86_avx512_vcvtsd2usi64:
2162 case Intrinsic::x86_avx512_cvttss2si:
2163 case Intrinsic::x86_avx512_cvttss2si64:
2164 case Intrinsic::x86_avx512_cvttss2usi:
2165 case Intrinsic::x86_avx512_cvttss2usi64:
2166 case Intrinsic::x86_avx512_cvttsd2si:
2167 case Intrinsic::x86_avx512_cvttsd2si64:
2168 case Intrinsic::x86_avx512_cvttsd2usi:
2169 case Intrinsic::x86_avx512_cvttsd2usi64: {
2170 // These intrinsics only demand the 0th element of their input vectors. If
2171 // we can simplify the input based on that, do so now.
2172 Value *Arg = II.getArgOperand(0);
2173 unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
2174 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
2175 return IC.replaceOperand(II, 0, V);
2177 break;
2180 case Intrinsic::x86_mmx_pmovmskb:
2181 case Intrinsic::x86_sse_movmsk_ps:
2182 case Intrinsic::x86_sse2_movmsk_pd:
2183 case Intrinsic::x86_sse2_pmovmskb_128:
2184 case Intrinsic::x86_avx_movmsk_pd_256:
2185 case Intrinsic::x86_avx_movmsk_ps_256:
2186 case Intrinsic::x86_avx2_pmovmskb:
2187 if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
2188 return IC.replaceInstUsesWith(II, V);
2190 break;
2192 case Intrinsic::x86_sse_comieq_ss:
2193 case Intrinsic::x86_sse_comige_ss:
2194 case Intrinsic::x86_sse_comigt_ss:
2195 case Intrinsic::x86_sse_comile_ss:
2196 case Intrinsic::x86_sse_comilt_ss:
2197 case Intrinsic::x86_sse_comineq_ss:
2198 case Intrinsic::x86_sse_ucomieq_ss:
2199 case Intrinsic::x86_sse_ucomige_ss:
2200 case Intrinsic::x86_sse_ucomigt_ss:
2201 case Intrinsic::x86_sse_ucomile_ss:
2202 case Intrinsic::x86_sse_ucomilt_ss:
2203 case Intrinsic::x86_sse_ucomineq_ss:
2204 case Intrinsic::x86_sse2_comieq_sd:
2205 case Intrinsic::x86_sse2_comige_sd:
2206 case Intrinsic::x86_sse2_comigt_sd:
2207 case Intrinsic::x86_sse2_comile_sd:
2208 case Intrinsic::x86_sse2_comilt_sd:
2209 case Intrinsic::x86_sse2_comineq_sd:
2210 case Intrinsic::x86_sse2_ucomieq_sd:
2211 case Intrinsic::x86_sse2_ucomige_sd:
2212 case Intrinsic::x86_sse2_ucomigt_sd:
2213 case Intrinsic::x86_sse2_ucomile_sd:
2214 case Intrinsic::x86_sse2_ucomilt_sd:
2215 case Intrinsic::x86_sse2_ucomineq_sd:
2216 case Intrinsic::x86_avx512_vcomi_ss:
2217 case Intrinsic::x86_avx512_vcomi_sd:
2218 case Intrinsic::x86_avx512_mask_cmp_ss:
2219 case Intrinsic::x86_avx512_mask_cmp_sd: {
2220 // These intrinsics only demand the 0th element of their input vectors. If
2221 // we can simplify the input based on that, do so now.
2222 bool MadeChange = false;
2223 Value *Arg0 = II.getArgOperand(0);
2224 Value *Arg1 = II.getArgOperand(1);
2225 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
2226 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2227 IC.replaceOperand(II, 0, V);
2228 MadeChange = true;
2230 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2231 IC.replaceOperand(II, 1, V);
2232 MadeChange = true;
2234 if (MadeChange) {
2235 return &II;
2237 break;
2240 case Intrinsic::x86_avx512_add_ps_512:
2241 case Intrinsic::x86_avx512_div_ps_512:
2242 case Intrinsic::x86_avx512_mul_ps_512:
2243 case Intrinsic::x86_avx512_sub_ps_512:
2244 case Intrinsic::x86_avx512_add_pd_512:
2245 case Intrinsic::x86_avx512_div_pd_512:
2246 case Intrinsic::x86_avx512_mul_pd_512:
2247 case Intrinsic::x86_avx512_sub_pd_512:
2248 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2249 // IR operations.
2250 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2251 if (R->getValue() == 4) {
2252 Value *Arg0 = II.getArgOperand(0);
2253 Value *Arg1 = II.getArgOperand(1);
2255 Value *V;
2256 switch (IID) {
2257 default:
2258 llvm_unreachable("Case stmts out of sync!");
2259 case Intrinsic::x86_avx512_add_ps_512:
2260 case Intrinsic::x86_avx512_add_pd_512:
2261 V = IC.Builder.CreateFAdd(Arg0, Arg1);
2262 break;
2263 case Intrinsic::x86_avx512_sub_ps_512:
2264 case Intrinsic::x86_avx512_sub_pd_512:
2265 V = IC.Builder.CreateFSub(Arg0, Arg1);
2266 break;
2267 case Intrinsic::x86_avx512_mul_ps_512:
2268 case Intrinsic::x86_avx512_mul_pd_512:
2269 V = IC.Builder.CreateFMul(Arg0, Arg1);
2270 break;
2271 case Intrinsic::x86_avx512_div_ps_512:
2272 case Intrinsic::x86_avx512_div_pd_512:
2273 V = IC.Builder.CreateFDiv(Arg0, Arg1);
2274 break;
2277 return IC.replaceInstUsesWith(II, V);
2280 break;
2282 case Intrinsic::x86_avx512_mask_add_ss_round:
2283 case Intrinsic::x86_avx512_mask_div_ss_round:
2284 case Intrinsic::x86_avx512_mask_mul_ss_round:
2285 case Intrinsic::x86_avx512_mask_sub_ss_round:
2286 case Intrinsic::x86_avx512_mask_add_sd_round:
2287 case Intrinsic::x86_avx512_mask_div_sd_round:
2288 case Intrinsic::x86_avx512_mask_mul_sd_round:
2289 case Intrinsic::x86_avx512_mask_sub_sd_round:
2290 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2291 // IR operations.
2292 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
2293 if (R->getValue() == 4) {
2294 // Extract the element as scalars.
2295 Value *Arg0 = II.getArgOperand(0);
2296 Value *Arg1 = II.getArgOperand(1);
2297 Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
2298 Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
2300 Value *V;
2301 switch (IID) {
2302 default:
2303 llvm_unreachable("Case stmts out of sync!");
2304 case Intrinsic::x86_avx512_mask_add_ss_round:
2305 case Intrinsic::x86_avx512_mask_add_sd_round:
2306 V = IC.Builder.CreateFAdd(LHS, RHS);
2307 break;
2308 case Intrinsic::x86_avx512_mask_sub_ss_round:
2309 case Intrinsic::x86_avx512_mask_sub_sd_round:
2310 V = IC.Builder.CreateFSub(LHS, RHS);
2311 break;
2312 case Intrinsic::x86_avx512_mask_mul_ss_round:
2313 case Intrinsic::x86_avx512_mask_mul_sd_round:
2314 V = IC.Builder.CreateFMul(LHS, RHS);
2315 break;
2316 case Intrinsic::x86_avx512_mask_div_ss_round:
2317 case Intrinsic::x86_avx512_mask_div_sd_round:
2318 V = IC.Builder.CreateFDiv(LHS, RHS);
2319 break;
2322 // Handle the masking aspect of the intrinsic.
2323 Value *Mask = II.getArgOperand(3);
2324 auto *C = dyn_cast<ConstantInt>(Mask);
2325 // We don't need a select if we know the mask bit is a 1.
2326 if (!C || !C->getValue()[0]) {
2327 // Cast the mask to an i1 vector and then extract the lowest element.
2328 auto *MaskTy = FixedVectorType::get(
2329 IC.Builder.getInt1Ty(),
2330 cast<IntegerType>(Mask->getType())->getBitWidth());
2331 Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
2332 Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
2333 // Extract the lowest element from the passthru operand.
2334 Value *Passthru =
2335 IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
2336 V = IC.Builder.CreateSelect(Mask, V, Passthru);
2339 // Insert the result back into the original argument 0.
2340 V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
2342 return IC.replaceInstUsesWith(II, V);
2345 break;
2347 // Constant fold ashr( <A x Bi>, Ci ).
2348 // Constant fold lshr( <A x Bi>, Ci ).
2349 // Constant fold shl( <A x Bi>, Ci ).
2350 case Intrinsic::x86_sse2_psrai_d:
2351 case Intrinsic::x86_sse2_psrai_w:
2352 case Intrinsic::x86_avx2_psrai_d:
2353 case Intrinsic::x86_avx2_psrai_w:
2354 case Intrinsic::x86_avx512_psrai_q_128:
2355 case Intrinsic::x86_avx512_psrai_q_256:
2356 case Intrinsic::x86_avx512_psrai_d_512:
2357 case Intrinsic::x86_avx512_psrai_q_512:
2358 case Intrinsic::x86_avx512_psrai_w_512:
2359 case Intrinsic::x86_sse2_psrli_d:
2360 case Intrinsic::x86_sse2_psrli_q:
2361 case Intrinsic::x86_sse2_psrli_w:
2362 case Intrinsic::x86_avx2_psrli_d:
2363 case Intrinsic::x86_avx2_psrli_q:
2364 case Intrinsic::x86_avx2_psrli_w:
2365 case Intrinsic::x86_avx512_psrli_d_512:
2366 case Intrinsic::x86_avx512_psrli_q_512:
2367 case Intrinsic::x86_avx512_psrli_w_512:
2368 case Intrinsic::x86_sse2_pslli_d:
2369 case Intrinsic::x86_sse2_pslli_q:
2370 case Intrinsic::x86_sse2_pslli_w:
2371 case Intrinsic::x86_avx2_pslli_d:
2372 case Intrinsic::x86_avx2_pslli_q:
2373 case Intrinsic::x86_avx2_pslli_w:
2374 case Intrinsic::x86_avx512_pslli_d_512:
2375 case Intrinsic::x86_avx512_pslli_q_512:
2376 case Intrinsic::x86_avx512_pslli_w_512:
2377 if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2378 return IC.replaceInstUsesWith(II, V);
2380 break;
2382 case Intrinsic::x86_sse2_psra_d:
2383 case Intrinsic::x86_sse2_psra_w:
2384 case Intrinsic::x86_avx2_psra_d:
2385 case Intrinsic::x86_avx2_psra_w:
2386 case Intrinsic::x86_avx512_psra_q_128:
2387 case Intrinsic::x86_avx512_psra_q_256:
2388 case Intrinsic::x86_avx512_psra_d_512:
2389 case Intrinsic::x86_avx512_psra_q_512:
2390 case Intrinsic::x86_avx512_psra_w_512:
2391 case Intrinsic::x86_sse2_psrl_d:
2392 case Intrinsic::x86_sse2_psrl_q:
2393 case Intrinsic::x86_sse2_psrl_w:
2394 case Intrinsic::x86_avx2_psrl_d:
2395 case Intrinsic::x86_avx2_psrl_q:
2396 case Intrinsic::x86_avx2_psrl_w:
2397 case Intrinsic::x86_avx512_psrl_d_512:
2398 case Intrinsic::x86_avx512_psrl_q_512:
2399 case Intrinsic::x86_avx512_psrl_w_512:
2400 case Intrinsic::x86_sse2_psll_d:
2401 case Intrinsic::x86_sse2_psll_q:
2402 case Intrinsic::x86_sse2_psll_w:
2403 case Intrinsic::x86_avx2_psll_d:
2404 case Intrinsic::x86_avx2_psll_q:
2405 case Intrinsic::x86_avx2_psll_w:
2406 case Intrinsic::x86_avx512_psll_d_512:
2407 case Intrinsic::x86_avx512_psll_q_512:
2408 case Intrinsic::x86_avx512_psll_w_512: {
2409 if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2410 return IC.replaceInstUsesWith(II, V);
2413 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
2414 // operand to compute the shift amount.
2415 Value *Arg1 = II.getArgOperand(1);
2416 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
2417 "Unexpected packed shift size");
2418 unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
2420 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
2421 return IC.replaceOperand(II, 1, V);
2423 break;
2426 case Intrinsic::x86_avx2_psllv_d:
2427 case Intrinsic::x86_avx2_psllv_d_256:
2428 case Intrinsic::x86_avx2_psllv_q:
2429 case Intrinsic::x86_avx2_psllv_q_256:
2430 case Intrinsic::x86_avx512_psllv_d_512:
2431 case Intrinsic::x86_avx512_psllv_q_512:
2432 case Intrinsic::x86_avx512_psllv_w_128:
2433 case Intrinsic::x86_avx512_psllv_w_256:
2434 case Intrinsic::x86_avx512_psllv_w_512:
2435 case Intrinsic::x86_avx2_psrav_d:
2436 case Intrinsic::x86_avx2_psrav_d_256:
2437 case Intrinsic::x86_avx512_psrav_q_128:
2438 case Intrinsic::x86_avx512_psrav_q_256:
2439 case Intrinsic::x86_avx512_psrav_d_512:
2440 case Intrinsic::x86_avx512_psrav_q_512:
2441 case Intrinsic::x86_avx512_psrav_w_128:
2442 case Intrinsic::x86_avx512_psrav_w_256:
2443 case Intrinsic::x86_avx512_psrav_w_512:
2444 case Intrinsic::x86_avx2_psrlv_d:
2445 case Intrinsic::x86_avx2_psrlv_d_256:
2446 case Intrinsic::x86_avx2_psrlv_q:
2447 case Intrinsic::x86_avx2_psrlv_q_256:
2448 case Intrinsic::x86_avx512_psrlv_d_512:
2449 case Intrinsic::x86_avx512_psrlv_q_512:
2450 case Intrinsic::x86_avx512_psrlv_w_128:
2451 case Intrinsic::x86_avx512_psrlv_w_256:
2452 case Intrinsic::x86_avx512_psrlv_w_512:
2453 if (Value *V = simplifyX86varShift(II, IC.Builder)) {
2454 return IC.replaceInstUsesWith(II, V);
2456 break;
2458 case Intrinsic::x86_sse2_packssdw_128:
2459 case Intrinsic::x86_sse2_packsswb_128:
2460 case Intrinsic::x86_avx2_packssdw:
2461 case Intrinsic::x86_avx2_packsswb:
2462 case Intrinsic::x86_avx512_packssdw_512:
2463 case Intrinsic::x86_avx512_packsswb_512:
2464 if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
2465 return IC.replaceInstUsesWith(II, V);
2467 break;
2469 case Intrinsic::x86_sse2_packuswb_128:
2470 case Intrinsic::x86_sse41_packusdw:
2471 case Intrinsic::x86_avx2_packusdw:
2472 case Intrinsic::x86_avx2_packuswb:
2473 case Intrinsic::x86_avx512_packusdw_512:
2474 case Intrinsic::x86_avx512_packuswb_512:
2475 if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
2476 return IC.replaceInstUsesWith(II, V);
2478 break;
2480 case Intrinsic::x86_pclmulqdq:
2481 case Intrinsic::x86_pclmulqdq_256:
2482 case Intrinsic::x86_pclmulqdq_512: {
2483 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2484 unsigned Imm = C->getZExtValue();
2486 bool MadeChange = false;
2487 Value *Arg0 = II.getArgOperand(0);
2488 Value *Arg1 = II.getArgOperand(1);
2489 unsigned VWidth =
2490 cast<FixedVectorType>(Arg0->getType())->getNumElements();
2492 APInt UndefElts1(VWidth, 0);
2493 APInt DemandedElts1 =
2494 APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
2495 if (Value *V =
2496 IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
2497 IC.replaceOperand(II, 0, V);
2498 MadeChange = true;
2501 APInt UndefElts2(VWidth, 0);
2502 APInt DemandedElts2 =
2503 APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
2504 if (Value *V =
2505 IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
2506 IC.replaceOperand(II, 1, V);
2507 MadeChange = true;
2510 // If either input elements are undef, the result is zero.
2511 if (DemandedElts1.isSubsetOf(UndefElts1) ||
2512 DemandedElts2.isSubsetOf(UndefElts2)) {
2513 return IC.replaceInstUsesWith(II,
2514 ConstantAggregateZero::get(II.getType()));
2517 if (MadeChange) {
2518 return &II;
2521 break;
2524 case Intrinsic::x86_sse41_insertps:
2525 if (Value *V = simplifyX86insertps(II, IC.Builder)) {
2526 return IC.replaceInstUsesWith(II, V);
2528 break;
2530 case Intrinsic::x86_sse4a_extrq: {
2531 Value *Op0 = II.getArgOperand(0);
2532 Value *Op1 = II.getArgOperand(1);
2533 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2534 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2535 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2536 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2537 VWidth1 == 16 && "Unexpected operand sizes");
2539 // See if we're dealing with constant values.
2540 auto *C1 = dyn_cast<Constant>(Op1);
2541 auto *CILength =
2542 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
2543 : nullptr;
2544 auto *CIIndex =
2545 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2546 : nullptr;
2548 // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
2549 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2550 return IC.replaceInstUsesWith(II, V);
2553 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
2554 // operands and the lowest 16-bits of the second.
2555 bool MadeChange = false;
2556 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2557 IC.replaceOperand(II, 0, V);
2558 MadeChange = true;
2560 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
2561 IC.replaceOperand(II, 1, V);
2562 MadeChange = true;
2564 if (MadeChange) {
2565 return &II;
2567 break;
2570 case Intrinsic::x86_sse4a_extrqi: {
2571 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
2572 // bits of the lower 64-bits. The upper 64-bits are undefined.
2573 Value *Op0 = II.getArgOperand(0);
2574 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2575 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2576 "Unexpected operand size");
2578 // See if we're dealing with constant values.
2579 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
2580 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
2582 // Attempt to simplify to a constant or shuffle vector.
2583 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2584 return IC.replaceInstUsesWith(II, V);
2587 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
2588 // operand.
2589 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2590 return IC.replaceOperand(II, 0, V);
2592 break;
2595 case Intrinsic::x86_sse4a_insertq: {
2596 Value *Op0 = II.getArgOperand(0);
2597 Value *Op1 = II.getArgOperand(1);
2598 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2599 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2600 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2601 cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
2602 "Unexpected operand size");
2604 // See if we're dealing with constant values.
2605 auto *C1 = dyn_cast<Constant>(Op1);
2606 auto *CI11 =
2607 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2608 : nullptr;
2610 // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
2611 if (CI11) {
2612 const APInt &V11 = CI11->getValue();
2613 APInt Len = V11.zextOrTrunc(6);
2614 APInt Idx = V11.lshr(8).zextOrTrunc(6);
2615 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2616 return IC.replaceInstUsesWith(II, V);
2620 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
2621 // operand.
2622 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2623 return IC.replaceOperand(II, 0, V);
2625 break;
2628 case Intrinsic::x86_sse4a_insertqi: {
2629 // INSERTQI: Extract lowest Length bits from lower half of second source and
2630 // insert over first source starting at Index bit. The upper 64-bits are
2631 // undefined.
2632 Value *Op0 = II.getArgOperand(0);
2633 Value *Op1 = II.getArgOperand(1);
2634 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2635 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2636 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2637 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2638 VWidth1 == 2 && "Unexpected operand sizes");
2640 // See if we're dealing with constant values.
2641 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
2642 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
2644 // Attempt to simplify to a constant or shuffle vector.
2645 if (CILength && CIIndex) {
2646 APInt Len = CILength->getValue().zextOrTrunc(6);
2647 APInt Idx = CIIndex->getValue().zextOrTrunc(6);
2648 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2649 return IC.replaceInstUsesWith(II, V);
2653 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
2654 // operands.
2655 bool MadeChange = false;
2656 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2657 IC.replaceOperand(II, 0, V);
2658 MadeChange = true;
2660 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
2661 IC.replaceOperand(II, 1, V);
2662 MadeChange = true;
2664 if (MadeChange) {
2665 return &II;
2667 break;
2670 case Intrinsic::x86_sse41_pblendvb:
2671 case Intrinsic::x86_sse41_blendvps:
2672 case Intrinsic::x86_sse41_blendvpd:
2673 case Intrinsic::x86_avx_blendv_ps_256:
2674 case Intrinsic::x86_avx_blendv_pd_256:
2675 case Intrinsic::x86_avx2_pblendvb: {
2676 // fold (blend A, A, Mask) -> A
2677 Value *Op0 = II.getArgOperand(0);
2678 Value *Op1 = II.getArgOperand(1);
2679 Value *Mask = II.getArgOperand(2);
2680 if (Op0 == Op1) {
2681 return IC.replaceInstUsesWith(II, Op0);
2684 // Zero Mask - select 1st argument.
2685 if (isa<ConstantAggregateZero>(Mask)) {
2686 return IC.replaceInstUsesWith(II, Op0);
2689 // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
2690 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2691 Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
2692 return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2695 // Convert to a vector select if we can bypass casts and find a boolean
2696 // vector condition value.
2697 Value *BoolVec;
2698 Mask = InstCombiner::peekThroughBitcast(Mask);
2699 if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
2700 BoolVec->getType()->isVectorTy() &&
2701 BoolVec->getType()->getScalarSizeInBits() == 1) {
2702 assert(Mask->getType()->getPrimitiveSizeInBits() ==
2703 II.getType()->getPrimitiveSizeInBits() &&
2704 "Not expecting mask and operands with different sizes");
2706 unsigned NumMaskElts =
2707 cast<FixedVectorType>(Mask->getType())->getNumElements();
2708 unsigned NumOperandElts =
2709 cast<FixedVectorType>(II.getType())->getNumElements();
2710 if (NumMaskElts == NumOperandElts) {
2711 return SelectInst::Create(BoolVec, Op1, Op0);
2714 // If the mask has less elements than the operands, each mask bit maps to
2715 // multiple elements of the operands. Bitcast back and forth.
2716 if (NumMaskElts < NumOperandElts) {
2717 Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
2718 Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
2719 Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
2720 return new BitCastInst(Sel, II.getType());
2724 break;
2727 case Intrinsic::x86_ssse3_pshuf_b_128:
2728 case Intrinsic::x86_avx2_pshuf_b:
2729 case Intrinsic::x86_avx512_pshuf_b_512:
2730 if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
2731 return IC.replaceInstUsesWith(II, V);
2733 break;
2735 case Intrinsic::x86_avx_vpermilvar_ps:
2736 case Intrinsic::x86_avx_vpermilvar_ps_256:
2737 case Intrinsic::x86_avx512_vpermilvar_ps_512:
2738 case Intrinsic::x86_avx_vpermilvar_pd:
2739 case Intrinsic::x86_avx_vpermilvar_pd_256:
2740 case Intrinsic::x86_avx512_vpermilvar_pd_512:
2741 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2742 return IC.replaceInstUsesWith(II, V);
2744 break;
2746 case Intrinsic::x86_avx2_permd:
2747 case Intrinsic::x86_avx2_permps:
2748 case Intrinsic::x86_avx512_permvar_df_256:
2749 case Intrinsic::x86_avx512_permvar_df_512:
2750 case Intrinsic::x86_avx512_permvar_di_256:
2751 case Intrinsic::x86_avx512_permvar_di_512:
2752 case Intrinsic::x86_avx512_permvar_hi_128:
2753 case Intrinsic::x86_avx512_permvar_hi_256:
2754 case Intrinsic::x86_avx512_permvar_hi_512:
2755 case Intrinsic::x86_avx512_permvar_qi_128:
2756 case Intrinsic::x86_avx512_permvar_qi_256:
2757 case Intrinsic::x86_avx512_permvar_qi_512:
2758 case Intrinsic::x86_avx512_permvar_sf_512:
2759 case Intrinsic::x86_avx512_permvar_si_512:
2760 if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
2761 return IC.replaceInstUsesWith(II, V);
2763 break;
2765 case Intrinsic::x86_avx_maskload_ps:
2766 case Intrinsic::x86_avx_maskload_pd:
2767 case Intrinsic::x86_avx_maskload_ps_256:
2768 case Intrinsic::x86_avx_maskload_pd_256:
2769 case Intrinsic::x86_avx2_maskload_d:
2770 case Intrinsic::x86_avx2_maskload_q:
2771 case Intrinsic::x86_avx2_maskload_d_256:
2772 case Intrinsic::x86_avx2_maskload_q_256:
2773 if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
2774 return I;
2776 break;
2778 case Intrinsic::x86_sse2_maskmov_dqu:
2779 case Intrinsic::x86_avx_maskstore_ps:
2780 case Intrinsic::x86_avx_maskstore_pd:
2781 case Intrinsic::x86_avx_maskstore_ps_256:
2782 case Intrinsic::x86_avx_maskstore_pd_256:
2783 case Intrinsic::x86_avx2_maskstore_d:
2784 case Intrinsic::x86_avx2_maskstore_q:
2785 case Intrinsic::x86_avx2_maskstore_d_256:
2786 case Intrinsic::x86_avx2_maskstore_q_256:
2787 if (simplifyX86MaskedStore(II, IC)) {
2788 return nullptr;
2790 break;
2792 case Intrinsic::x86_addcarry_32:
2793 case Intrinsic::x86_addcarry_64:
2794 if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
2795 return IC.replaceInstUsesWith(II, V);
2797 break;
2799 case Intrinsic::x86_avx512_pternlog_d_128:
2800 case Intrinsic::x86_avx512_pternlog_d_256:
2801 case Intrinsic::x86_avx512_pternlog_d_512:
2802 case Intrinsic::x86_avx512_pternlog_q_128:
2803 case Intrinsic::x86_avx512_pternlog_q_256:
2804 case Intrinsic::x86_avx512_pternlog_q_512:
2805 if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
2806 return IC.replaceInstUsesWith(II, V);
2808 break;
2809 default:
2810 break;
2812 return std::nullopt;
2815 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
2816 InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
2817 bool &KnownBitsComputed) const {
2818 switch (II.getIntrinsicID()) {
2819 default:
2820 break;
2821 case Intrinsic::x86_mmx_pmovmskb:
2822 case Intrinsic::x86_sse_movmsk_ps:
2823 case Intrinsic::x86_sse2_movmsk_pd:
2824 case Intrinsic::x86_sse2_pmovmskb_128:
2825 case Intrinsic::x86_avx_movmsk_ps_256:
2826 case Intrinsic::x86_avx_movmsk_pd_256:
2827 case Intrinsic::x86_avx2_pmovmskb: {
2828 // MOVMSK copies the vector elements' sign bits to the low bits
2829 // and zeros the high bits.
2830 unsigned ArgWidth;
2831 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
2832 ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
2833 } else {
2834 auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
2835 ArgWidth = ArgType->getNumElements();
2838 // If we don't need any of low bits then return zero,
2839 // we know that DemandedMask is non-zero already.
2840 APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
2841 Type *VTy = II.getType();
2842 if (DemandedElts.isZero()) {
2843 return ConstantInt::getNullValue(VTy);
2846 // We know that the upper bits are set to zero.
2847 Known.Zero.setBitsFrom(ArgWidth);
2848 KnownBitsComputed = true;
2849 break;
2852 return std::nullopt;
2855 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2856 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2857 APInt &UndefElts2, APInt &UndefElts3,
2858 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2859 simplifyAndSetOp) const {
2860 unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
2861 switch (II.getIntrinsicID()) {
2862 default:
2863 break;
2864 case Intrinsic::x86_xop_vfrcz_ss:
2865 case Intrinsic::x86_xop_vfrcz_sd:
2866 // The instructions for these intrinsics are speced to zero upper bits not
2867 // pass them through like other scalar intrinsics. So we shouldn't just
2868 // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
2869 // Instead we should return a zero vector.
2870 if (!DemandedElts[0]) {
2871 IC.addToWorklist(&II);
2872 return ConstantAggregateZero::get(II.getType());
2875 // Only the lower element is used.
2876 DemandedElts = 1;
2877 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2879 // Only the lower element is undefined. The high elements are zero.
2880 UndefElts = UndefElts[0];
2881 break;
2883 // Unary scalar-as-vector operations that work column-wise.
2884 case Intrinsic::x86_sse_rcp_ss:
2885 case Intrinsic::x86_sse_rsqrt_ss:
2886 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2888 // If lowest element of a scalar op isn't used then use Arg0.
2889 if (!DemandedElts[0]) {
2890 IC.addToWorklist(&II);
2891 return II.getArgOperand(0);
2893 // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
2894 // checks).
2895 break;
2897 // Binary scalar-as-vector operations that work column-wise. The high
2898 // elements come from operand 0. The low element is a function of both
2899 // operands.
2900 case Intrinsic::x86_sse_min_ss:
2901 case Intrinsic::x86_sse_max_ss:
2902 case Intrinsic::x86_sse_cmp_ss:
2903 case Intrinsic::x86_sse2_min_sd:
2904 case Intrinsic::x86_sse2_max_sd:
2905 case Intrinsic::x86_sse2_cmp_sd: {
2906 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2908 // If lowest element of a scalar op isn't used then use Arg0.
2909 if (!DemandedElts[0]) {
2910 IC.addToWorklist(&II);
2911 return II.getArgOperand(0);
2914 // Only lower element is used for operand 1.
2915 DemandedElts = 1;
2916 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
2918 // Lower element is undefined if both lower elements are undefined.
2919 // Consider things like undef&0. The result is known zero, not undef.
2920 if (!UndefElts2[0])
2921 UndefElts.clearBit(0);
2923 break;
2926 // Binary scalar-as-vector operations that work column-wise. The high
2927 // elements come from operand 0 and the low element comes from operand 1.
2928 case Intrinsic::x86_sse41_round_ss:
2929 case Intrinsic::x86_sse41_round_sd: {
2930 // Don't use the low element of operand 0.
2931 APInt DemandedElts2 = DemandedElts;
2932 DemandedElts2.clearBit(0);
2933 simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
2935 // If lowest element of a scalar op isn't used then use Arg0.
2936 if (!DemandedElts[0]) {
2937 IC.addToWorklist(&II);
2938 return II.getArgOperand(0);
2941 // Only lower element is used for operand 1.
2942 DemandedElts = 1;
2943 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
2945 // Take the high undef elements from operand 0 and take the lower element
2946 // from operand 1.
2947 UndefElts.clearBit(0);
2948 UndefElts |= UndefElts2[0];
2949 break;
2952 // Three input scalar-as-vector operations that work column-wise. The high
2953 // elements come from operand 0 and the low element is a function of all
2954 // three inputs.
2955 case Intrinsic::x86_avx512_mask_add_ss_round:
2956 case Intrinsic::x86_avx512_mask_div_ss_round:
2957 case Intrinsic::x86_avx512_mask_mul_ss_round:
2958 case Intrinsic::x86_avx512_mask_sub_ss_round:
2959 case Intrinsic::x86_avx512_mask_max_ss_round:
2960 case Intrinsic::x86_avx512_mask_min_ss_round:
2961 case Intrinsic::x86_avx512_mask_add_sd_round:
2962 case Intrinsic::x86_avx512_mask_div_sd_round:
2963 case Intrinsic::x86_avx512_mask_mul_sd_round:
2964 case Intrinsic::x86_avx512_mask_sub_sd_round:
2965 case Intrinsic::x86_avx512_mask_max_sd_round:
2966 case Intrinsic::x86_avx512_mask_min_sd_round:
2967 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2969 // If lowest element of a scalar op isn't used then use Arg0.
2970 if (!DemandedElts[0]) {
2971 IC.addToWorklist(&II);
2972 return II.getArgOperand(0);
2975 // Only lower element is used for operand 1 and 2.
2976 DemandedElts = 1;
2977 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
2978 simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
2980 // Lower element is undefined if all three lower elements are undefined.
2981 // Consider things like undef&0. The result is known zero, not undef.
2982 if (!UndefElts2[0] || !UndefElts3[0])
2983 UndefElts.clearBit(0);
2984 break;
2986 // TODO: Add fmaddsub support?
2987 case Intrinsic::x86_sse3_addsub_pd:
2988 case Intrinsic::x86_sse3_addsub_ps:
2989 case Intrinsic::x86_avx_addsub_pd_256:
2990 case Intrinsic::x86_avx_addsub_ps_256: {
2991 // If none of the even or none of the odd lanes are required, turn this
2992 // into a generic FP math instruction.
2993 APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
2994 APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
2995 bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
2996 bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
2997 if (IsSubOnly || IsAddOnly) {
2998 assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
2999 IRBuilderBase::InsertPointGuard Guard(IC.Builder);
3000 IC.Builder.SetInsertPoint(&II);
3001 Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
3002 return IC.Builder.CreateBinOp(
3003 IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
3006 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3007 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3008 UndefElts &= UndefElts2;
3009 break;
3012 // General per-element vector operations.
3013 case Intrinsic::x86_avx2_psllv_d:
3014 case Intrinsic::x86_avx2_psllv_d_256:
3015 case Intrinsic::x86_avx2_psllv_q:
3016 case Intrinsic::x86_avx2_psllv_q_256:
3017 case Intrinsic::x86_avx2_psrlv_d:
3018 case Intrinsic::x86_avx2_psrlv_d_256:
3019 case Intrinsic::x86_avx2_psrlv_q:
3020 case Intrinsic::x86_avx2_psrlv_q_256:
3021 case Intrinsic::x86_avx2_psrav_d:
3022 case Intrinsic::x86_avx2_psrav_d_256: {
3023 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3024 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3025 UndefElts &= UndefElts2;
3026 break;
3029 case Intrinsic::x86_sse2_packssdw_128:
3030 case Intrinsic::x86_sse2_packsswb_128:
3031 case Intrinsic::x86_sse2_packuswb_128:
3032 case Intrinsic::x86_sse41_packusdw:
3033 case Intrinsic::x86_avx2_packssdw:
3034 case Intrinsic::x86_avx2_packsswb:
3035 case Intrinsic::x86_avx2_packusdw:
3036 case Intrinsic::x86_avx2_packuswb:
3037 case Intrinsic::x86_avx512_packssdw_512:
3038 case Intrinsic::x86_avx512_packsswb_512:
3039 case Intrinsic::x86_avx512_packusdw_512:
3040 case Intrinsic::x86_avx512_packuswb_512: {
3041 auto *Ty0 = II.getArgOperand(0)->getType();
3042 unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
3043 assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
3045 unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
3046 unsigned VWidthPerLane = VWidth / NumLanes;
3047 unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
3049 // Per lane, pack the elements of the first input and then the second.
3050 // e.g.
3051 // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
3052 // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
3053 for (int OpNum = 0; OpNum != 2; ++OpNum) {
3054 APInt OpDemandedElts(InnerVWidth, 0);
3055 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3056 unsigned LaneIdx = Lane * VWidthPerLane;
3057 for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
3058 unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
3059 if (DemandedElts[Idx])
3060 OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
3064 // Demand elements from the operand.
3065 APInt OpUndefElts(InnerVWidth, 0);
3066 simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
3068 // Pack the operand's UNDEF elements, one lane at a time.
3069 OpUndefElts = OpUndefElts.zext(VWidth);
3070 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3071 APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
3072 LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
3073 LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
3074 UndefElts |= LaneElts;
3077 break;
3080 // PSHUFB
3081 case Intrinsic::x86_ssse3_pshuf_b_128:
3082 case Intrinsic::x86_avx2_pshuf_b:
3083 case Intrinsic::x86_avx512_pshuf_b_512:
3084 // PERMILVAR
3085 case Intrinsic::x86_avx_vpermilvar_ps:
3086 case Intrinsic::x86_avx_vpermilvar_ps_256:
3087 case Intrinsic::x86_avx512_vpermilvar_ps_512:
3088 case Intrinsic::x86_avx_vpermilvar_pd:
3089 case Intrinsic::x86_avx_vpermilvar_pd_256:
3090 case Intrinsic::x86_avx512_vpermilvar_pd_512:
3091 // PERMV
3092 case Intrinsic::x86_avx2_permd:
3093 case Intrinsic::x86_avx2_permps: {
3094 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
3095 break;
3098 // SSE4A instructions leave the upper 64-bits of the 128-bit result
3099 // in an undefined state.
3100 case Intrinsic::x86_sse4a_extrq:
3101 case Intrinsic::x86_sse4a_extrqi:
3102 case Intrinsic::x86_sse4a_insertq:
3103 case Intrinsic::x86_sse4a_insertqi:
3104 UndefElts.setHighBits(VWidth / 2);
3105 break;
3107 return std::nullopt;